aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorJiri Kosina <jkosina@suse.cz>2011-11-13 14:55:35 -0500
committerJiri Kosina <jkosina@suse.cz>2011-11-13 14:55:53 -0500
commit2290c0d06d82faee87b1ab2d9d4f7bf81ef64379 (patch)
treee075e4d5534193f28e6059904f61e5ca03958d3c /fs
parent4da669a2e3e5bc70b30a0465f3641528681b5f77 (diff)
parent52e4c2a05256cb83cda12f3c2137ab1533344edb (diff)
Merge branch 'master' into for-next
Sync with Linus tree to have 157550ff ("mtd: add GPMI-NAND driver in the config and Makefile") as I have patch depending on that one.
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c33
-rw-r--r--fs/9p/vfs_dir.c14
-rw-r--r--fs/9p/vfs_inode.c6
-rw-r--r--fs/9p/vfs_inode_dotl.c4
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/Makefile2
-rw-r--r--fs/adfs/inode.c2
-rw-r--r--fs/affs/amigaffs.c4
-rw-r--r--fs/affs/inode.c8
-rw-r--r--fs/affs/namei.c6
-rw-r--r--fs/afs/fsclient.c2
-rw-r--r--fs/afs/inode.c4
-rw-r--r--fs/aio.c140
-rw-r--r--fs/autofs4/inode.c2
-rw-r--r--fs/befs/linuxvfs.c2
-rw-r--r--fs/bfs/dir.c2
-rw-r--r--fs/bfs/inode.c2
-rw-r--r--fs/binfmt_elf.c11
-rw-r--r--fs/binfmt_misc.c2
-rw-r--r--fs/bio-integrity.c1
-rw-r--r--fs/bio.c1
-rw-r--r--fs/block_dev.c15
-rw-r--r--fs/btrfs/Makefile3
-rw-r--r--fs/btrfs/acl.c17
-rw-r--r--fs/btrfs/backref.c776
-rw-r--r--fs/btrfs/backref.h62
-rw-r--r--fs/btrfs/btrfs_inode.h21
-rw-r--r--fs/btrfs/compression.c3
-rw-r--r--fs/btrfs/ctree.c10
-rw-r--r--fs/btrfs/ctree.h198
-rw-r--r--fs/btrfs/delayed-inode.c110
-rw-r--r--fs/btrfs/disk-io.c487
-rw-r--r--fs/btrfs/disk-io.h4
-rw-r--r--fs/btrfs/extent-tree.c873
-rw-r--r--fs/btrfs/extent_io.c614
-rw-r--r--fs/btrfs/extent_io.h23
-rw-r--r--fs/btrfs/file-item.c17
-rw-r--r--fs/btrfs/file.c27
-rw-r--r--fs/btrfs/free-space-cache.c929
-rw-r--r--fs/btrfs/inode-map.c34
-rw-r--r--fs/btrfs/inode.c543
-rw-r--r--fs/btrfs/ioctl.c227
-rw-r--r--fs/btrfs/ioctl.h29
-rw-r--r--fs/btrfs/print-tree.c8
-rw-r--r--fs/btrfs/reada.c951
-rw-r--r--fs/btrfs/relocation.c26
-rw-r--r--fs/btrfs/scrub.c655
-rw-r--r--fs/btrfs/super.c335
-rw-r--r--fs/btrfs/transaction.c148
-rw-r--r--fs/btrfs/tree-log.c21
-rw-r--r--fs/btrfs/volumes.c212
-rw-r--r--fs/btrfs/volumes.h18
-rw-r--r--fs/btrfs/xattr.c11
-rw-r--r--fs/buffer.c11
-rw-r--r--fs/ceph/addr.c193
-rw-r--r--fs/ceph/caps.c12
-rw-r--r--fs/ceph/dir.c87
-rw-r--r--fs/ceph/inode.c56
-rw-r--r--fs/ceph/ioctl.c34
-rw-r--r--fs/ceph/ioctl.h55
-rw-r--r--fs/ceph/mds_client.c21
-rw-r--r--fs/ceph/super.c65
-rw-r--r--fs/ceph/super.h42
-rw-r--r--fs/cifs/README14
-rw-r--r--fs/cifs/cifs_debug.c9
-rw-r--r--fs/cifs/cifs_fs_sb.h4
-rw-r--r--fs/cifs/cifsacl.c347
-rw-r--r--fs/cifs/cifsencrypt.c113
-rw-r--r--fs/cifs/cifsfs.c26
-rw-r--r--fs/cifs/cifsfs.h6
-rw-r--r--fs/cifs/cifsglob.h55
-rw-r--r--fs/cifs/cifspdu.h48
-rw-r--r--fs/cifs/cifsproto.h56
-rw-r--r--fs/cifs/cifssmb.c454
-rw-r--r--fs/cifs/connect.c700
-rw-r--r--fs/cifs/dir.c22
-rw-r--r--fs/cifs/export.c4
-rw-r--r--fs/cifs/file.c1132
-rw-r--r--fs/cifs/inode.c60
-rw-r--r--fs/cifs/link.c19
-rw-r--r--fs/cifs/misc.c66
-rw-r--r--fs/cifs/sess.c6
-rw-r--r--fs/cifs/smbencrypt.c184
-rw-r--r--fs/cifs/transport.c19
-rw-r--r--fs/cifs/xattr.c2
-rw-r--r--fs/coda/coda_linux.c2
-rw-r--r--fs/coda/dir.c2
-rw-r--r--fs/compat.c15
-rw-r--r--fs/dcache.c46
-rw-r--r--fs/debugfs/inode.c2
-rw-r--r--fs/devpts/inode.c4
-rw-r--r--fs/direct-io.c646
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h2
-rw-r--r--fs/ecryptfs/inode.c12
-rw-r--r--fs/efs/inode.c2
-rw-r--r--fs/eventpoll.c25
-rw-r--r--fs/exec.c4
-rw-r--r--fs/exofs/Kbuild3
-rw-r--r--fs/exofs/Kconfig9
-rw-r--r--fs/exofs/exofs.h26
-rw-r--r--fs/exofs/inode.c235
-rw-r--r--fs/exofs/ore.c657
-rw-r--r--fs/exofs/ore_raid.c660
-rw-r--r--fs/exofs/ore_raid.h79
-rw-r--r--fs/exofs/super.c206
-rw-r--r--fs/ext2/balloc.c2
-rw-r--r--fs/ext2/ext2.h8
-rw-r--r--fs/ext2/ialloc.c2
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext2/super.c8
-rw-r--r--fs/ext3/balloc.c17
-rw-r--r--fs/ext3/fsync.c10
-rw-r--r--fs/ext3/ialloc.c47
-rw-r--r--fs/ext3/inode.c2
-rw-r--r--fs/ext3/ioctl.c24
-rw-r--r--fs/ext3/namei.c6
-rw-r--r--fs/ext3/super.c12
-rw-r--r--fs/ext4/balloc.c345
-rw-r--r--fs/ext4/ext4.h185
-rw-r--r--fs/ext4/ext4_extents.h2
-rw-r--r--fs/ext4/ext4_jbd2.c8
-rw-r--r--fs/ext4/extents.c1168
-rw-r--r--fs/ext4/file.c51
-rw-r--r--fs/ext4/fsync.c10
-rw-r--r--fs/ext4/ialloc.c206
-rw-r--r--fs/ext4/indirect.c20
-rw-r--r--fs/ext4/inode.c522
-rw-r--r--fs/ext4/ioctl.c65
-rw-r--r--fs/ext4/mballoc.c331
-rw-r--r--fs/ext4/mballoc.h11
-rw-r--r--fs/ext4/migrate.c111
-rw-r--r--fs/ext4/mmp.c10
-rw-r--r--fs/ext4/move_extent.c1
-rw-r--r--fs/ext4/namei.c29
-rw-r--r--fs/ext4/page-io.c66
-rw-r--r--fs/ext4/resize.c10
-rw-r--r--fs/ext4/super.c263
-rw-r--r--fs/ext4/xattr.c12
-rw-r--r--fs/fat/dir.c4
-rw-r--r--fs/fat/fat.h9
-rw-r--r--fs/fat/inode.c4
-rw-r--r--fs/fat/namei_msdos.c2
-rw-r--r--fs/fat/namei_vfat.c2
-rw-r--r--fs/freevxfs/vxfs_inode.c2
-rw-r--r--fs/fs-writeback.c84
-rw-r--r--fs/fuse/control.c2
-rw-r--r--fs/fuse/cuse.c1
-rw-r--r--fs/fuse/inode.c2
-rw-r--r--fs/gfs2/acl.c5
-rw-r--r--fs/gfs2/aops.c8
-rw-r--r--fs/gfs2/bmap.c199
-rw-r--r--fs/gfs2/dir.c50
-rw-r--r--fs/gfs2/file.c299
-rw-r--r--fs/gfs2/glock.h2
-rw-r--r--fs/gfs2/glops.c91
-rw-r--r--fs/gfs2/glops.h2
-rw-r--r--fs/gfs2/incore.h23
-rw-r--r--fs/gfs2/inode.c112
-rw-r--r--fs/gfs2/inode.h2
-rw-r--r--fs/gfs2/lops.c66
-rw-r--r--fs/gfs2/ops_fstype.c7
-rw-r--r--fs/gfs2/quota.c28
-rw-r--r--fs/gfs2/rgrp.c573
-rw-r--r--fs/gfs2/rgrp.h31
-rw-r--r--fs/gfs2/super.c134
-rw-r--r--fs/gfs2/trans.c5
-rw-r--r--fs/gfs2/trans.h22
-rw-r--r--fs/gfs2/xattr.c28
-rw-r--r--fs/hfs/btree.c20
-rw-r--r--fs/hfs/dir.c4
-rw-r--r--fs/hfs/inode.c4
-rw-r--r--fs/hfsplus/dir.c4
-rw-r--r--fs/hfsplus/inode.c10
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/hostfs/hostfs_user.c1
-rw-r--r--fs/hpfs/dir.c2
-rw-r--r--fs/hpfs/hpfs_fn.h4
-rw-r--r--fs/hpfs/inode.c10
-rw-r--r--fs/hpfs/namei.c8
-rw-r--r--fs/hppfs/hppfs.c2
-rw-r--r--fs/hugetlbfs/inode.c2
-rw-r--r--fs/inode.c4
-rw-r--r--fs/ioprio.c1
-rw-r--r--fs/isofs/inode.c14
-rw-r--r--fs/isofs/rock.c4
-rw-r--r--fs/jbd/journal.c8
-rw-r--r--fs/jbd2/commit.c26
-rw-r--r--fs/jbd2/journal.c44
-rw-r--r--fs/jbd2/recovery.c28
-rw-r--r--fs/jbd2/transaction.c68
-rw-r--r--fs/jffs2/compr.c128
-rw-r--r--fs/jffs2/compr.h2
-rw-r--r--fs/jffs2/dir.c6
-rw-r--r--fs/jffs2/fs.c8
-rw-r--r--fs/jffs2/jffs2_fs_sb.h6
-rw-r--r--fs/jffs2/os-linux.h2
-rw-r--r--fs/jffs2/scan.c4
-rw-r--r--fs/jffs2/super.c119
-rw-r--r--fs/jffs2/wbuf.c9
-rw-r--r--fs/jfs/jfs_imap.c6
-rw-r--r--fs/jfs/jfs_inode.c2
-rw-r--r--fs/jfs/jfs_logmgr.c1
-rw-r--r--fs/jfs/namei.c12
-rw-r--r--fs/jfs/super.c1
-rw-r--r--fs/libfs.c6
-rw-r--r--fs/lockd/host.c25
-rw-r--r--fs/lockd/svc.c2
-rw-r--r--fs/locks.c223
-rw-r--r--fs/logfs/dir.c8
-rw-r--r--fs/logfs/inode.c3
-rw-r--r--fs/logfs/logfs.h1
-rw-r--r--fs/logfs/readwrite.c2
-rw-r--r--fs/logfs/super.c23
-rw-r--r--fs/minix/inode.c4
-rw-r--r--fs/namei.c51
-rw-r--r--fs/namespace.c1
-rw-r--r--fs/ncpfs/inode.c2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c58
-rw-r--r--fs/nfs/blocklayout/blocklayout.h4
-rw-r--r--fs/nfs/blocklayout/blocklayoutdev.c35
-rw-r--r--fs/nfs/callback.c4
-rw-r--r--fs/nfs/callback_xdr.c12
-rw-r--r--fs/nfs/client.c11
-rw-r--r--fs/nfs/delegation.c2
-rw-r--r--fs/nfs/file.c19
-rw-r--r--fs/nfs/fscache-index.c4
-rw-r--r--fs/nfs/idmap.c25
-rw-r--r--fs/nfs/inode.c22
-rw-r--r--fs/nfs/internal.h10
-rw-r--r--fs/nfs/nfs4_fs.h24
-rw-r--r--fs/nfs/nfs4filelayout.c41
-rw-r--r--fs/nfs/nfs4proc.c99
-rw-r--r--fs/nfs/nfs4xdr.c2
-rw-r--r--fs/nfs/objlayout/objio_osd.c872
-rw-r--r--fs/nfs/objlayout/objlayout.c209
-rw-r--r--fs/nfs/objlayout/objlayout.h48
-rw-r--r--fs/nfs/pagelist.c3
-rw-r--r--fs/nfs/pnfs.c78
-rw-r--r--fs/nfs/pnfs.h5
-rw-r--r--fs/nfs/pnfs_dev.c1
-rw-r--r--fs/nfs/read.c40
-rw-r--r--fs/nfs/super.c17
-rw-r--r--fs/nfs/unlink.c4
-rw-r--r--fs/nfs/write.c77
-rw-r--r--fs/nfsd/export.c16
-rw-r--r--fs/nfsd/nfs4acl.c1
-rw-r--r--fs/nfsd/nfs4callback.c20
-rw-r--r--fs/nfsd/nfs4proc.c374
-rw-r--r--fs/nfsd/nfs4recover.c53
-rw-r--r--fs/nfsd/nfs4state.c1794
-rw-r--r--fs/nfsd/nfs4xdr.c380
-rw-r--r--fs/nfsd/nfsctl.c2
-rw-r--r--fs/nfsd/nfsd.h33
-rw-r--r--fs/nfsd/nfsfh.c39
-rw-r--r--fs/nfsd/nfssvc.c3
-rw-r--r--fs/nfsd/state.h174
-rw-r--r--fs/nfsd/vfs.c31
-rw-r--r--fs/nfsd/vfs.h29
-rw-r--r--fs/nfsd/xdr4.h28
-rw-r--r--fs/nilfs2/inode.c4
-rw-r--r--fs/nilfs2/namei.c2
-rw-r--r--fs/nilfs2/nilfs.h8
-rw-r--r--fs/ntfs/debug.h15
-rw-r--r--fs/ntfs/inode.c8
-rw-r--r--fs/ocfs2/cluster/tcp.c1
-rw-r--r--fs/ocfs2/dir.c4
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c1
-rw-r--r--fs/ocfs2/dlmglue.c2
-rw-r--r--fs/ocfs2/inode.c4
-rw-r--r--fs/ocfs2/namei.c18
-rw-r--r--fs/ocfs2/super.h14
-rw-r--r--fs/open.c4
-rw-r--r--fs/openpromfs/inode.c4
-rw-r--r--fs/partitions/ldm.c16
-rw-r--r--fs/pipe.c1
-rw-r--r--fs/posix_acl.c2
-rw-r--r--fs/proc/base.c25
-rw-r--r--fs/proc/generic.c2
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/proc_sysctl.c48
-rw-r--r--fs/proc/stat.c41
-rw-r--r--fs/proc/task_mmu.c5
-rw-r--r--fs/proc/vmcore.c1
-rw-r--r--fs/pstore/inode.c40
-rw-r--r--fs/pstore/internal.h2
-rw-r--r--fs/pstore/platform.c93
-rw-r--r--fs/qnx4/inode.c2
-rw-r--r--fs/quota/quota.c9
-rw-r--r--fs/ramfs/inode.c10
-rw-r--r--fs/read_write.c82
-rw-r--r--fs/reiserfs/inode.c10
-rw-r--r--fs/reiserfs/namei.c16
-rw-r--r--fs/romfs/super.c2
-rw-r--r--fs/squashfs/Kconfig22
-rw-r--r--fs/squashfs/inode.c18
-rw-r--r--fs/squashfs/squashfs_fs.h7
-rw-r--r--fs/squashfs/super.c2
-rw-r--r--fs/stack.c2
-rw-r--r--fs/stat.c5
-rw-r--r--fs/statfs.c2
-rw-r--r--fs/super.c11
-rw-r--r--fs/sync.c4
-rw-r--r--fs/sysfs/dir.c196
-rw-r--r--fs/sysfs/file.c56
-rw-r--r--fs/sysfs/inode.c16
-rw-r--r--fs/sysfs/sysfs.h17
-rw-r--r--fs/sysv/inode.c2
-rw-r--r--fs/ubifs/budget.c2
-rw-r--r--fs/ubifs/debug.c16
-rw-r--r--fs/ubifs/debug.h5
-rw-r--r--fs/ubifs/recovery.c2
-rw-r--r--fs/ubifs/sb.c2
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/ubifs/xattr.c4
-rw-r--r--fs/udf/balloc.c14
-rw-r--r--fs/udf/directory.c8
-rw-r--r--fs/udf/inode.c56
-rw-r--r--fs/udf/lowlevel.c2
-rw-r--r--fs/udf/misc.c19
-rw-r--r--fs/udf/namei.c20
-rw-r--r--fs/udf/partition.c19
-rw-r--r--fs/udf/super.c280
-rw-r--r--fs/udf/truncate.c22
-rw-r--r--fs/udf/udf_sb.h5
-rw-r--r--fs/udf/udfdecl.h35
-rw-r--r--fs/udf/udftime.c3
-rw-r--r--fs/udf/unicode.c6
-rw-r--r--fs/ufs/ialloc.c2
-rw-r--r--fs/ufs/inode.c4
-rw-r--r--fs/ufs/ufs.h9
-rw-r--r--fs/xfs/xfs_alloc.c4
-rw-r--r--fs/xfs/xfs_aops.c127
-rw-r--r--fs/xfs/xfs_aops.h4
-rw-r--r--fs/xfs/xfs_attr.c89
-rw-r--r--fs/xfs/xfs_attr_leaf.c7
-rw-r--r--fs/xfs/xfs_bmap.c2531
-rw-r--r--fs/xfs/xfs_bmap.h318
-rw-r--r--fs/xfs/xfs_btree.c11
-rw-r--r--fs/xfs/xfs_buf.c244
-rw-r--r--fs/xfs/xfs_buf.h51
-rw-r--r--fs/xfs/xfs_buf_item.c14
-rw-r--r--fs/xfs/xfs_da_btree.c54
-rw-r--r--fs/xfs/xfs_dfrag.c6
-rw-r--r--fs/xfs/xfs_dir2_leaf.c6
-rw-r--r--fs/xfs/xfs_discard.c20
-rw-r--r--fs/xfs/xfs_dquot.c32
-rw-r--r--fs/xfs/xfs_dquot_item.c6
-rw-r--r--fs/xfs/xfs_export.c12
-rw-r--r--fs/xfs/xfs_extfree_item.c4
-rw-r--r--fs/xfs/xfs_file.c168
-rw-r--r--fs/xfs/xfs_filestream.c4
-rw-r--r--fs/xfs/xfs_fsops.c60
-rw-r--r--fs/xfs/xfs_ialloc.c15
-rw-r--r--fs/xfs/xfs_iget.c2
-rw-r--r--fs/xfs/xfs_inode.c43
-rw-r--r--fs/xfs/xfs_inode.h1
-rw-r--r--fs/xfs/xfs_inode_item.c6
-rw-r--r--fs/xfs/xfs_ioctl.c2
-rw-r--r--fs/xfs/xfs_iomap.c39
-rw-r--r--fs/xfs/xfs_iops.c16
-rw-r--r--fs/xfs/xfs_log.c22
-rw-r--r--fs/xfs/xfs_log.h2
-rw-r--r--fs/xfs/xfs_log_recover.c43
-rw-r--r--fs/xfs/xfs_message.h42
-rw-r--r--fs/xfs/xfs_mount.c36
-rw-r--r--fs/xfs/xfs_qm.c12
-rw-r--r--fs/xfs/xfs_qm_syscalls.c2
-rw-r--r--fs/xfs/xfs_rename.c8
-rw-r--r--fs/xfs/xfs_rtalloc.c48
-rw-r--r--fs/xfs/xfs_rw.c23
-rw-r--r--fs/xfs/xfs_rw.h2
-rw-r--r--fs/xfs/xfs_super.c13
-rw-r--r--fs/xfs/xfs_sync.c16
-rw-r--r--fs/xfs/xfs_trace.h39
-rw-r--r--fs/xfs/xfs_trans.c13
-rw-r--r--fs/xfs/xfs_trans.h14
-rw-r--r--fs/xfs/xfs_trans_ail.c43
-rw-r--r--fs/xfs/xfs_trans_buf.c24
-rw-r--r--fs/xfs/xfs_trans_inode.c25
-rw-r--r--fs/xfs/xfs_trans_priv.h1
-rw-r--r--fs/xfs/xfs_vnodeops.c123
381 files changed, 19203 insertions, 11850 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index ef966188611..2b78014a124 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -132,21 +132,19 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
132 options = tmp_options; 132 options = tmp_options;
133 133
134 while ((p = strsep(&options, ",")) != NULL) { 134 while ((p = strsep(&options, ",")) != NULL) {
135 int token; 135 int token, r;
136 if (!*p) 136 if (!*p)
137 continue; 137 continue;
138 token = match_token(p, tokens, args); 138 token = match_token(p, tokens, args);
139 if (token < Opt_uname) { 139 switch (token) {
140 int r = match_int(&args[0], &option); 140 case Opt_debug:
141 r = match_int(&args[0], &option);
141 if (r < 0) { 142 if (r < 0) {
142 P9_DPRINTK(P9_DEBUG_ERROR, 143 P9_DPRINTK(P9_DEBUG_ERROR,
143 "integer field, but no integer?\n"); 144 "integer field, but no integer?\n");
144 ret = r; 145 ret = r;
145 continue; 146 continue;
146 } 147 }
147 }
148 switch (token) {
149 case Opt_debug:
150 v9ses->debug = option; 148 v9ses->debug = option;
151#ifdef CONFIG_NET_9P_DEBUG 149#ifdef CONFIG_NET_9P_DEBUG
152 p9_debug_level = option; 150 p9_debug_level = option;
@@ -154,12 +152,33 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
154 break; 152 break;
155 153
156 case Opt_dfltuid: 154 case Opt_dfltuid:
155 r = match_int(&args[0], &option);
156 if (r < 0) {
157 P9_DPRINTK(P9_DEBUG_ERROR,
158 "integer field, but no integer?\n");
159 ret = r;
160 continue;
161 }
157 v9ses->dfltuid = option; 162 v9ses->dfltuid = option;
158 break; 163 break;
159 case Opt_dfltgid: 164 case Opt_dfltgid:
165 r = match_int(&args[0], &option);
166 if (r < 0) {
167 P9_DPRINTK(P9_DEBUG_ERROR,
168 "integer field, but no integer?\n");
169 ret = r;
170 continue;
171 }
160 v9ses->dfltgid = option; 172 v9ses->dfltgid = option;
161 break; 173 break;
162 case Opt_afid: 174 case Opt_afid:
175 r = match_int(&args[0], &option);
176 if (r < 0) {
177 P9_DPRINTK(P9_DEBUG_ERROR,
178 "integer field, but no integer?\n");
179 ret = r;
180 continue;
181 }
163 v9ses->afid = option; 182 v9ses->afid = option;
164 break; 183 break;
165 case Opt_uname: 184 case Opt_uname:
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 9c2bdda5cd9..598fff1a54e 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -165,9 +165,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
165 } 165 }
166 while (rdir->head < rdir->tail) { 166 while (rdir->head < rdir->tail) {
167 p9stat_init(&st); 167 p9stat_init(&st);
168 err = p9stat_read(rdir->buf + rdir->head, 168 err = p9stat_read(fid->clnt, rdir->buf + rdir->head,
169 rdir->tail - rdir->head, &st, 169 rdir->tail - rdir->head, &st);
170 fid->clnt->proto_version);
171 if (err) { 170 if (err) {
172 P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); 171 P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
173 err = -EIO; 172 err = -EIO;
@@ -231,7 +230,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
231 while (err == 0) { 230 while (err == 0) {
232 if (rdir->tail == rdir->head) { 231 if (rdir->tail == rdir->head) {
233 err = p9_client_readdir(fid, rdir->buf, buflen, 232 err = p9_client_readdir(fid, rdir->buf, buflen,
234 filp->f_pos); 233 filp->f_pos);
235 if (err <= 0) 234 if (err <= 0)
236 goto unlock_and_exit; 235 goto unlock_and_exit;
237 236
@@ -241,10 +240,9 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
241 240
242 while (rdir->head < rdir->tail) { 241 while (rdir->head < rdir->tail) {
243 242
244 err = p9dirent_read(rdir->buf + rdir->head, 243 err = p9dirent_read(fid->clnt, rdir->buf + rdir->head,
245 rdir->tail - rdir->head, 244 rdir->tail - rdir->head,
246 &curdirent, 245 &curdirent);
247 fid->clnt->proto_version);
248 if (err < 0) { 246 if (err < 0) {
249 P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); 247 P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
250 err = -EIO; 248 err = -EIO;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index e3c03db3c78..879ed885173 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -278,10 +278,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
278 case S_IFSOCK: 278 case S_IFSOCK:
279 if (v9fs_proto_dotl(v9ses)) { 279 if (v9fs_proto_dotl(v9ses)) {
280 inode->i_op = &v9fs_file_inode_operations_dotl; 280 inode->i_op = &v9fs_file_inode_operations_dotl;
281 inode->i_fop = &v9fs_file_operations_dotl;
282 } else if (v9fs_proto_dotu(v9ses)) { 281 } else if (v9fs_proto_dotu(v9ses)) {
283 inode->i_op = &v9fs_file_inode_operations; 282 inode->i_op = &v9fs_file_inode_operations;
284 inode->i_fop = &v9fs_file_operations;
285 } else { 283 } else {
286 P9_DPRINTK(P9_DEBUG_ERROR, 284 P9_DPRINTK(P9_DEBUG_ERROR,
287 "special files without extended mode\n"); 285 "special files without extended mode\n");
@@ -1140,7 +1138,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
1140 struct v9fs_session_info *v9ses = sb->s_fs_info; 1138 struct v9fs_session_info *v9ses = sb->s_fs_info;
1141 struct v9fs_inode *v9inode = V9FS_I(inode); 1139 struct v9fs_inode *v9inode = V9FS_I(inode);
1142 1140
1143 inode->i_nlink = 1; 1141 set_nlink(inode, 1);
1144 1142
1145 inode->i_atime.tv_sec = stat->atime; 1143 inode->i_atime.tv_sec = stat->atime;
1146 inode->i_mtime.tv_sec = stat->mtime; 1144 inode->i_mtime.tv_sec = stat->mtime;
@@ -1166,7 +1164,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
1166 /* HARDLINKCOUNT %u */ 1164 /* HARDLINKCOUNT %u */
1167 sscanf(ext, "%13s %u", tag_name, &i_nlink); 1165 sscanf(ext, "%13s %u", tag_name, &i_nlink);
1168 if (!strncmp(tag_name, "HARDLINKCOUNT", 13)) 1166 if (!strncmp(tag_name, "HARDLINKCOUNT", 13))
1169 inode->i_nlink = i_nlink; 1167 set_nlink(inode, i_nlink);
1170 } 1168 }
1171 } 1169 }
1172 mode = stat->mode & S_IALLUGO; 1170 mode = stat->mode & S_IALLUGO;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index aded79fcd5c..0b5745e2194 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -606,7 +606,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
606 inode->i_ctime.tv_nsec = stat->st_ctime_nsec; 606 inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
607 inode->i_uid = stat->st_uid; 607 inode->i_uid = stat->st_uid;
608 inode->i_gid = stat->st_gid; 608 inode->i_gid = stat->st_gid;
609 inode->i_nlink = stat->st_nlink; 609 set_nlink(inode, stat->st_nlink);
610 610
611 mode = stat->st_mode & S_IALLUGO; 611 mode = stat->st_mode & S_IALLUGO;
612 mode |= inode->i_mode & ~S_IALLUGO; 612 mode |= inode->i_mode & ~S_IALLUGO;
@@ -632,7 +632,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
632 if (stat->st_result_mask & P9_STATS_GID) 632 if (stat->st_result_mask & P9_STATS_GID)
633 inode->i_gid = stat->st_gid; 633 inode->i_gid = stat->st_gid;
634 if (stat->st_result_mask & P9_STATS_NLINK) 634 if (stat->st_result_mask & P9_STATS_NLINK)
635 inode->i_nlink = stat->st_nlink; 635 set_nlink(inode, stat->st_nlink);
636 if (stat->st_result_mask & P9_STATS_MODE) { 636 if (stat->st_result_mask & P9_STATS_MODE) {
637 inode->i_mode = stat->st_mode; 637 inode->i_mode = stat->st_mode;
638 if ((S_ISBLK(inode->i_mode)) || 638 if ((S_ISBLK(inode->i_mode)) ||
diff --git a/fs/Kconfig b/fs/Kconfig
index 9fe0b349f4c..5f4c45d4aa1 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -109,7 +109,7 @@ source "fs/proc/Kconfig"
109source "fs/sysfs/Kconfig" 109source "fs/sysfs/Kconfig"
110 110
111config TMPFS 111config TMPFS
112 bool "Virtual memory file system support (former shm fs)" 112 bool "Tmpfs virtual memory file system support (former shm fs)"
113 depends on SHMEM 113 depends on SHMEM
114 help 114 help
115 Tmpfs is a file system which keeps all files in virtual memory. 115 Tmpfs is a file system which keeps all files in virtual memory.
diff --git a/fs/Makefile b/fs/Makefile
index afc109691a9..d2c3353d547 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -120,6 +120,6 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/
120obj-$(CONFIG_OCFS2_FS) += ocfs2/ 120obj-$(CONFIG_OCFS2_FS) += ocfs2/
121obj-$(CONFIG_BTRFS_FS) += btrfs/ 121obj-$(CONFIG_BTRFS_FS) += btrfs/
122obj-$(CONFIG_GFS2_FS) += gfs2/ 122obj-$(CONFIG_GFS2_FS) += gfs2/
123obj-$(CONFIG_EXOFS_FS) += exofs/ 123obj-y += exofs/ # Multiple modules
124obj-$(CONFIG_CEPH_FS) += ceph/ 124obj-$(CONFIG_CEPH_FS) += ceph/
125obj-$(CONFIG_PSTORE) += pstore/ 125obj-$(CONFIG_PSTORE) += pstore/
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index d5250c5aae2..1dab6a174d6 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -247,7 +247,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
247 inode->i_gid = ADFS_SB(sb)->s_gid; 247 inode->i_gid = ADFS_SB(sb)->s_gid;
248 inode->i_ino = obj->file_id; 248 inode->i_ino = obj->file_id;
249 inode->i_size = obj->size; 249 inode->i_size = obj->size;
250 inode->i_nlink = 2; 250 set_nlink(inode, 2);
251 inode->i_blocks = (inode->i_size + sb->s_blocksize - 1) >> 251 inode->i_blocks = (inode->i_size + sb->s_blocksize - 1) >>
252 sb->s_blocksize_bits; 252 sb->s_blocksize_bits;
253 253
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 3a4557e8325..de37ec84234 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -215,7 +215,7 @@ affs_remove_link(struct dentry *dentry)
215 break; 215 break;
216 default: 216 default:
217 if (!AFFS_TAIL(sb, bh)->link_chain) 217 if (!AFFS_TAIL(sb, bh)->link_chain)
218 inode->i_nlink = 1; 218 set_nlink(inode, 1);
219 } 219 }
220 affs_free_block(sb, link_ino); 220 affs_free_block(sb, link_ino);
221 goto done; 221 goto done;
@@ -316,7 +316,7 @@ affs_remove_header(struct dentry *dentry)
316 if (inode->i_nlink > 1) 316 if (inode->i_nlink > 1)
317 retval = affs_remove_link(dentry); 317 retval = affs_remove_link(dentry);
318 else 318 else
319 inode->i_nlink = 0; 319 clear_nlink(inode);
320 affs_unlock_link(inode); 320 affs_unlock_link(inode);
321 inode->i_ctime = CURRENT_TIME_SEC; 321 inode->i_ctime = CURRENT_TIME_SEC;
322 mark_inode_dirty(inode); 322 mark_inode_dirty(inode);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 5d828903ac6..88a4b0b5005 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -54,7 +54,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
54 prot = be32_to_cpu(tail->protect); 54 prot = be32_to_cpu(tail->protect);
55 55
56 inode->i_size = 0; 56 inode->i_size = 0;
57 inode->i_nlink = 1; 57 set_nlink(inode, 1);
58 inode->i_mode = 0; 58 inode->i_mode = 0;
59 AFFS_I(inode)->i_extcnt = 1; 59 AFFS_I(inode)->i_extcnt = 1;
60 AFFS_I(inode)->i_ext_last = ~1; 60 AFFS_I(inode)->i_ext_last = ~1;
@@ -137,7 +137,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
137 sbi->s_hashsize + 1; 137 sbi->s_hashsize + 1;
138 } 138 }
139 if (tail->link_chain) 139 if (tail->link_chain)
140 inode->i_nlink = 2; 140 set_nlink(inode, 2);
141 inode->i_mapping->a_ops = (sbi->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops; 141 inode->i_mapping->a_ops = (sbi->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops;
142 inode->i_op = &affs_file_inode_operations; 142 inode->i_op = &affs_file_inode_operations;
143 inode->i_fop = &affs_file_operations; 143 inode->i_fop = &affs_file_operations;
@@ -304,7 +304,7 @@ affs_new_inode(struct inode *dir)
304 inode->i_uid = current_fsuid(); 304 inode->i_uid = current_fsuid();
305 inode->i_gid = current_fsgid(); 305 inode->i_gid = current_fsgid();
306 inode->i_ino = block; 306 inode->i_ino = block;
307 inode->i_nlink = 1; 307 set_nlink(inode, 1);
308 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 308 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
309 atomic_set(&AFFS_I(inode)->i_opencnt, 0); 309 atomic_set(&AFFS_I(inode)->i_opencnt, 0);
310 AFFS_I(inode)->i_blkcnt = 0; 310 AFFS_I(inode)->i_blkcnt = 0;
@@ -387,7 +387,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
387 AFFS_TAIL(sb, inode_bh)->link_chain = cpu_to_be32(block); 387 AFFS_TAIL(sb, inode_bh)->link_chain = cpu_to_be32(block);
388 affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain)); 388 affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain));
389 mark_buffer_dirty_inode(inode_bh, inode); 389 mark_buffer_dirty_inode(inode_bh, inode);
390 inode->i_nlink = 2; 390 set_nlink(inode, 2);
391 ihold(inode); 391 ihold(inode);
392 } 392 }
393 affs_fix_checksum(sb, bh); 393 affs_fix_checksum(sb, bh);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index e3e9efc1fdd..780a11dc631 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -277,7 +277,7 @@ affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata
277 inode->i_mapping->a_ops = (AFFS_SB(sb)->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops; 277 inode->i_mapping->a_ops = (AFFS_SB(sb)->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops;
278 error = affs_add_entry(dir, inode, dentry, ST_FILE); 278 error = affs_add_entry(dir, inode, dentry, ST_FILE);
279 if (error) { 279 if (error) {
280 inode->i_nlink = 0; 280 clear_nlink(inode);
281 iput(inode); 281 iput(inode);
282 return error; 282 return error;
283 } 283 }
@@ -305,7 +305,7 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
305 305
306 error = affs_add_entry(dir, inode, dentry, ST_USERDIR); 306 error = affs_add_entry(dir, inode, dentry, ST_USERDIR);
307 if (error) { 307 if (error) {
308 inode->i_nlink = 0; 308 clear_nlink(inode);
309 mark_inode_dirty(inode); 309 mark_inode_dirty(inode);
310 iput(inode); 310 iput(inode);
311 return error; 311 return error;
@@ -392,7 +392,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
392 return 0; 392 return 0;
393 393
394err: 394err:
395 inode->i_nlink = 0; 395 clear_nlink(inode);
396 mark_inode_dirty(inode); 396 mark_inode_dirty(inode);
397 iput(inode); 397 iput(inode);
398 return error; 398 return error;
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 346e3289abd..2f213d109c2 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -90,7 +90,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
90 vnode->vfs_inode.i_uid = status->owner; 90 vnode->vfs_inode.i_uid = status->owner;
91 vnode->vfs_inode.i_gid = status->group; 91 vnode->vfs_inode.i_gid = status->group;
92 vnode->vfs_inode.i_generation = vnode->fid.unique; 92 vnode->vfs_inode.i_generation = vnode->fid.unique;
93 vnode->vfs_inode.i_nlink = status->nlink; 93 set_nlink(&vnode->vfs_inode, status->nlink);
94 94
95 mode = vnode->vfs_inode.i_mode; 95 mode = vnode->vfs_inode.i_mode;
96 mode &= ~S_IALLUGO; 96 mode &= ~S_IALLUGO;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 0fdab6e03d8..d890ae3b2ce 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -67,7 +67,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
67 fscache_attr_changed(vnode->cache); 67 fscache_attr_changed(vnode->cache);
68#endif 68#endif
69 69
70 inode->i_nlink = vnode->status.nlink; 70 set_nlink(inode, vnode->status.nlink);
71 inode->i_uid = vnode->status.owner; 71 inode->i_uid = vnode->status.owner;
72 inode->i_gid = 0; 72 inode->i_gid = 0;
73 inode->i_size = vnode->status.size; 73 inode->i_size = vnode->status.size;
@@ -174,7 +174,7 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
174 inode->i_size = 0; 174 inode->i_size = 0;
175 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; 175 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
176 inode->i_op = &afs_autocell_inode_operations; 176 inode->i_op = &afs_autocell_inode_operations;
177 inode->i_nlink = 2; 177 set_nlink(inode, 2);
178 inode->i_uid = 0; 178 inode->i_uid = 0;
179 inode->i_gid = 0; 179 inode->i_gid = 0;
180 inode->i_ctime.tv_sec = get_seconds(); 180 inode->i_ctime.tv_sec = get_seconds();
diff --git a/fs/aio.c b/fs/aio.c
index e29ec485af2..78c514cfd21 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -440,8 +440,6 @@ void exit_aio(struct mm_struct *mm)
440static struct kiocb *__aio_get_req(struct kioctx *ctx) 440static struct kiocb *__aio_get_req(struct kioctx *ctx)
441{ 441{
442 struct kiocb *req = NULL; 442 struct kiocb *req = NULL;
443 struct aio_ring *ring;
444 int okay = 0;
445 443
446 req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); 444 req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
447 if (unlikely(!req)) 445 if (unlikely(!req))
@@ -459,39 +457,114 @@ static struct kiocb *__aio_get_req(struct kioctx *ctx)
459 INIT_LIST_HEAD(&req->ki_run_list); 457 INIT_LIST_HEAD(&req->ki_run_list);
460 req->ki_eventfd = NULL; 458 req->ki_eventfd = NULL;
461 459
462 /* Check if the completion queue has enough free space to 460 return req;
463 * accept an event from this io. 461}
464 */ 462
463/*
464 * struct kiocb's are allocated in batches to reduce the number of
465 * times the ctx lock is acquired and released.
466 */
467#define KIOCB_BATCH_SIZE 32L
468struct kiocb_batch {
469 struct list_head head;
470 long count; /* number of requests left to allocate */
471};
472
473static void kiocb_batch_init(struct kiocb_batch *batch, long total)
474{
475 INIT_LIST_HEAD(&batch->head);
476 batch->count = total;
477}
478
479static void kiocb_batch_free(struct kiocb_batch *batch)
480{
481 struct kiocb *req, *n;
482
483 list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
484 list_del(&req->ki_batch);
485 kmem_cache_free(kiocb_cachep, req);
486 }
487}
488
489/*
490 * Allocate a batch of kiocbs. This avoids taking and dropping the
491 * context lock a lot during setup.
492 */
493static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch)
494{
495 unsigned short allocated, to_alloc;
496 long avail;
497 bool called_fput = false;
498 struct kiocb *req, *n;
499 struct aio_ring *ring;
500
501 to_alloc = min(batch->count, KIOCB_BATCH_SIZE);
502 for (allocated = 0; allocated < to_alloc; allocated++) {
503 req = __aio_get_req(ctx);
504 if (!req)
505 /* allocation failed, go with what we've got */
506 break;
507 list_add(&req->ki_batch, &batch->head);
508 }
509
510 if (allocated == 0)
511 goto out;
512
513retry:
465 spin_lock_irq(&ctx->ctx_lock); 514 spin_lock_irq(&ctx->ctx_lock);
466 ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); 515 ring = kmap_atomic(ctx->ring_info.ring_pages[0]);
467 if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) { 516
517 avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active;
518 BUG_ON(avail < 0);
519 if (avail == 0 && !called_fput) {
520 /*
521 * Handle a potential starvation case. It is possible that
522 * we hold the last reference on a struct file, causing us
523 * to delay the final fput to non-irq context. In this case,
524 * ctx->reqs_active is artificially high. Calling the fput
525 * routine here may free up a slot in the event completion
526 * ring, allowing this allocation to succeed.
527 */
528 kunmap_atomic(ring);
529 spin_unlock_irq(&ctx->ctx_lock);
530 aio_fput_routine(NULL);
531 called_fput = true;
532 goto retry;
533 }
534
535 if (avail < allocated) {
536 /* Trim back the number of requests. */
537 list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
538 list_del(&req->ki_batch);
539 kmem_cache_free(kiocb_cachep, req);
540 if (--allocated <= avail)
541 break;
542 }
543 }
544
545 batch->count -= allocated;
546 list_for_each_entry(req, &batch->head, ki_batch) {
468 list_add(&req->ki_list, &ctx->active_reqs); 547 list_add(&req->ki_list, &ctx->active_reqs);
469 ctx->reqs_active++; 548 ctx->reqs_active++;
470 okay = 1;
471 } 549 }
472 kunmap_atomic(ring, KM_USER0);
473 spin_unlock_irq(&ctx->ctx_lock);
474 550
475 if (!okay) { 551 kunmap_atomic(ring);
476 kmem_cache_free(kiocb_cachep, req); 552 spin_unlock_irq(&ctx->ctx_lock);
477 req = NULL;
478 }
479 553
480 return req; 554out:
555 return allocated;
481} 556}
482 557
483static inline struct kiocb *aio_get_req(struct kioctx *ctx) 558static inline struct kiocb *aio_get_req(struct kioctx *ctx,
559 struct kiocb_batch *batch)
484{ 560{
485 struct kiocb *req; 561 struct kiocb *req;
486 /* Handle a potential starvation case -- should be exceedingly rare as 562
487 * requests will be stuck on fput_head only if the aio_fput_routine is 563 if (list_empty(&batch->head))
488 * delayed and the requests were the last user of the struct file. 564 if (kiocb_batch_refill(ctx, batch) == 0)
489 */ 565 return NULL;
490 req = __aio_get_req(ctx); 566 req = list_first_entry(&batch->head, struct kiocb, ki_batch);
491 if (unlikely(NULL == req)) { 567 list_del(&req->ki_batch);
492 aio_fput_routine(NULL);
493 req = __aio_get_req(ctx);
494 }
495 return req; 568 return req;
496} 569}
497 570
@@ -1387,13 +1460,13 @@ static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
1387 ret = compat_rw_copy_check_uvector(type, 1460 ret = compat_rw_copy_check_uvector(type,
1388 (struct compat_iovec __user *)kiocb->ki_buf, 1461 (struct compat_iovec __user *)kiocb->ki_buf,
1389 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, 1462 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1390 &kiocb->ki_iovec); 1463 &kiocb->ki_iovec, 1);
1391 else 1464 else
1392#endif 1465#endif
1393 ret = rw_copy_check_uvector(type, 1466 ret = rw_copy_check_uvector(type,
1394 (struct iovec __user *)kiocb->ki_buf, 1467 (struct iovec __user *)kiocb->ki_buf,
1395 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, 1468 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1396 &kiocb->ki_iovec); 1469 &kiocb->ki_iovec, 1);
1397 if (ret < 0) 1470 if (ret < 0)
1398 goto out; 1471 goto out;
1399 1472
@@ -1515,7 +1588,8 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
1515} 1588}
1516 1589
1517static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, 1590static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1518 struct iocb *iocb, bool compat) 1591 struct iocb *iocb, struct kiocb_batch *batch,
1592 bool compat)
1519{ 1593{
1520 struct kiocb *req; 1594 struct kiocb *req;
1521 struct file *file; 1595 struct file *file;
@@ -1541,7 +1615,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1541 if (unlikely(!file)) 1615 if (unlikely(!file))
1542 return -EBADF; 1616 return -EBADF;
1543 1617
1544 req = aio_get_req(ctx); /* returns with 2 references to req */ 1618 req = aio_get_req(ctx, batch); /* returns with 2 references to req */
1545 if (unlikely(!req)) { 1619 if (unlikely(!req)) {
1546 fput(file); 1620 fput(file);
1547 return -EAGAIN; 1621 return -EAGAIN;
@@ -1621,8 +1695,9 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1621{ 1695{
1622 struct kioctx *ctx; 1696 struct kioctx *ctx;
1623 long ret = 0; 1697 long ret = 0;
1624 int i; 1698 int i = 0;
1625 struct blk_plug plug; 1699 struct blk_plug plug;
1700 struct kiocb_batch batch;
1626 1701
1627 if (unlikely(nr < 0)) 1702 if (unlikely(nr < 0))
1628 return -EINVAL; 1703 return -EINVAL;
@@ -1639,6 +1714,8 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1639 return -EINVAL; 1714 return -EINVAL;
1640 } 1715 }
1641 1716
1717 kiocb_batch_init(&batch, nr);
1718
1642 blk_start_plug(&plug); 1719 blk_start_plug(&plug);
1643 1720
1644 /* 1721 /*
@@ -1659,12 +1736,13 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1659 break; 1736 break;
1660 } 1737 }
1661 1738
1662 ret = io_submit_one(ctx, user_iocb, &tmp, compat); 1739 ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat);
1663 if (ret) 1740 if (ret)
1664 break; 1741 break;
1665 } 1742 }
1666 blk_finish_plug(&plug); 1743 blk_finish_plug(&plug);
1667 1744
1745 kiocb_batch_free(&batch);
1668 put_ioctx(ctx); 1746 put_ioctx(ctx);
1669 return i ? i : ret; 1747 return i ? i : ret;
1670} 1748}
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 180fa2425e4..8179f1ab817 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -342,7 +342,7 @@ struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode)
342 inode->i_ino = get_next_ino(); 342 inode->i_ino = get_next_ino();
343 343
344 if (S_ISDIR(mode)) { 344 if (S_ISDIR(mode)) {
345 inode->i_nlink = 2; 345 set_nlink(inode, 2);
346 inode->i_op = &autofs4_dir_inode_operations; 346 inode->i_op = &autofs4_dir_inode_operations;
347 inode->i_fop = &autofs4_dir_operations; 347 inode->i_fop = &autofs4_dir_operations;
348 } else if (S_ISLNK(mode)) { 348 } else if (S_ISLNK(mode)) {
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 720d885e8dc..8342ca67abc 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -357,7 +357,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
357 inode->i_gid = befs_sb->mount_opts.use_gid ? 357 inode->i_gid = befs_sb->mount_opts.use_gid ?
358 befs_sb->mount_opts.gid : (gid_t) fs32_to_cpu(sb, raw_inode->gid); 358 befs_sb->mount_opts.gid : (gid_t) fs32_to_cpu(sb, raw_inode->gid);
359 359
360 inode->i_nlink = 1; 360 set_nlink(inode, 1);
361 361
362 /* 362 /*
363 * BEFS's time is 64 bits, but current VFS is 32 bits... 363 * BEFS's time is 64 bits, but current VFS is 32 bits...
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index b14cebfd904..9cc07401947 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -199,7 +199,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
199 printf("unlinking non-existent file %s:%lu (nlink=%d)\n", 199 printf("unlinking non-existent file %s:%lu (nlink=%d)\n",
200 inode->i_sb->s_id, inode->i_ino, 200 inode->i_sb->s_id, inode->i_ino,
201 inode->i_nlink); 201 inode->i_nlink);
202 inode->i_nlink = 1; 202 set_nlink(inode, 1);
203 } 203 }
204 de->ino = 0; 204 de->ino = 0;
205 mark_buffer_dirty_inode(bh, dir); 205 mark_buffer_dirty_inode(bh, dir);
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index a8e37f81d09..697af5bf70b 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -78,7 +78,7 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
78 BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino); 78 BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino);
79 inode->i_uid = le32_to_cpu(di->i_uid); 79 inode->i_uid = le32_to_cpu(di->i_uid);
80 inode->i_gid = le32_to_cpu(di->i_gid); 80 inode->i_gid = le32_to_cpu(di->i_gid);
81 inode->i_nlink = le32_to_cpu(di->i_nlink); 81 set_nlink(inode, le32_to_cpu(di->i_nlink));
82 inode->i_size = BFS_FILESIZE(di); 82 inode->i_size = BFS_FILESIZE(di);
83 inode->i_blocks = BFS_FILEBLOCKS(di); 83 inode->i_blocks = BFS_FILEBLOCKS(di);
84 inode->i_atime.tv_sec = le32_to_cpu(di->i_atime); 84 inode->i_atime.tv_sec = le32_to_cpu(di->i_atime);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index dd0fdfc56d3..21ac5ee4b43 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -795,7 +795,16 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
795 * might try to exec. This is because the brk will 795 * might try to exec. This is because the brk will
796 * follow the loader, and is not movable. */ 796 * follow the loader, and is not movable. */
797#if defined(CONFIG_X86) || defined(CONFIG_ARM) 797#if defined(CONFIG_X86) || defined(CONFIG_ARM)
798 load_bias = 0; 798 /* Memory randomization might have been switched off
799 * in runtime via sysctl.
800 * If that is the case, retain the original non-zero
801 * load_bias value in order to establish proper
802 * non-randomized mappings.
803 */
804 if (current->flags & PF_RANDOMIZE)
805 load_bias = 0;
806 else
807 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
799#else 808#else
800 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); 809 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
801#endif 810#endif
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index ba1a1ae4a18..1e9edbdeda7 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -521,7 +521,7 @@ static void kill_node(Node *e)
521 write_unlock(&entries_lock); 521 write_unlock(&entries_lock);
522 522
523 if (dentry) { 523 if (dentry) {
524 dentry->d_inode->i_nlink--; 524 drop_nlink(dentry->d_inode);
525 d_drop(dentry); 525 d_drop(dentry);
526 dput(dentry); 526 dput(dentry);
527 simple_release_fs(&bm_mnt, &entry_count); 527 simple_release_fs(&bm_mnt, &entry_count);
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 9c5e6b2cd11..c2183f3917c 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/blkdev.h> 23#include <linux/blkdev.h>
24#include <linux/mempool.h> 24#include <linux/mempool.h>
25#include <linux/export.h>
25#include <linux/bio.h> 26#include <linux/bio.h>
26#include <linux/workqueue.h> 27#include <linux/workqueue.h>
27#include <linux/slab.h> 28#include <linux/slab.h>
diff --git a/fs/bio.c b/fs/bio.c
index 9bfade8a609..41c93c72224 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -255,7 +255,6 @@ void bio_init(struct bio *bio)
255{ 255{
256 memset(bio, 0, sizeof(*bio)); 256 memset(bio, 0, sizeof(*bio));
257 bio->bi_flags = 1 << BIO_UPTODATE; 257 bio->bi_flags = 1 << BIO_UPTODATE;
258 bio->bi_comp_cpu = -1;
259 atomic_set(&bio->bi_cnt, 1); 258 atomic_set(&bio->bi_cnt, 1);
260} 259}
261EXPORT_SYMBOL(bio_init); 260EXPORT_SYMBOL(bio_init);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 95f786ec7f0..b07f1da1de4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -971,7 +971,7 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty)
971 971
972 if (!bdev->bd_disk) 972 if (!bdev->bd_disk)
973 return; 973 return;
974 if (disk_partitionable(bdev->bd_disk)) 974 if (disk_part_scan_enabled(bdev->bd_disk))
975 bdev->bd_invalidated = 1; 975 bdev->bd_invalidated = 1;
976} 976}
977 977
@@ -1085,6 +1085,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
1085static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) 1085static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1086{ 1086{
1087 struct gendisk *disk; 1087 struct gendisk *disk;
1088 struct module *owner;
1088 int ret; 1089 int ret;
1089 int partno; 1090 int partno;
1090 int perm = 0; 1091 int perm = 0;
@@ -1110,6 +1111,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1110 disk = get_gendisk(bdev->bd_dev, &partno); 1111 disk = get_gendisk(bdev->bd_dev, &partno);
1111 if (!disk) 1112 if (!disk)
1112 goto out; 1113 goto out;
1114 owner = disk->fops->owner;
1113 1115
1114 disk_block_events(disk); 1116 disk_block_events(disk);
1115 mutex_lock_nested(&bdev->bd_mutex, for_part); 1117 mutex_lock_nested(&bdev->bd_mutex, for_part);
@@ -1137,8 +1139,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1137 bdev->bd_disk = NULL; 1139 bdev->bd_disk = NULL;
1138 mutex_unlock(&bdev->bd_mutex); 1140 mutex_unlock(&bdev->bd_mutex);
1139 disk_unblock_events(disk); 1141 disk_unblock_events(disk);
1140 module_put(disk->fops->owner);
1141 put_disk(disk); 1142 put_disk(disk);
1143 module_put(owner);
1142 goto restart; 1144 goto restart;
1143 } 1145 }
1144 } 1146 }
@@ -1194,8 +1196,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1194 goto out_unlock_bdev; 1196 goto out_unlock_bdev;
1195 } 1197 }
1196 /* only one opener holds refs to the module and disk */ 1198 /* only one opener holds refs to the module and disk */
1197 module_put(disk->fops->owner);
1198 put_disk(disk); 1199 put_disk(disk);
1200 module_put(owner);
1199 } 1201 }
1200 bdev->bd_openers++; 1202 bdev->bd_openers++;
1201 if (for_part) 1203 if (for_part)
@@ -1215,8 +1217,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1215 out_unlock_bdev: 1217 out_unlock_bdev:
1216 mutex_unlock(&bdev->bd_mutex); 1218 mutex_unlock(&bdev->bd_mutex);
1217 disk_unblock_events(disk); 1219 disk_unblock_events(disk);
1218 module_put(disk->fops->owner);
1219 put_disk(disk); 1220 put_disk(disk);
1221 module_put(owner);
1220 out: 1222 out:
1221 bdput(bdev); 1223 bdput(bdev);
1222 1224
@@ -1442,14 +1444,15 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1442 if (!bdev->bd_openers) { 1444 if (!bdev->bd_openers) {
1443 struct module *owner = disk->fops->owner; 1445 struct module *owner = disk->fops->owner;
1444 1446
1445 put_disk(disk);
1446 module_put(owner);
1447 disk_put_part(bdev->bd_part); 1447 disk_put_part(bdev->bd_part);
1448 bdev->bd_part = NULL; 1448 bdev->bd_part = NULL;
1449 bdev->bd_disk = NULL; 1449 bdev->bd_disk = NULL;
1450 if (bdev != bdev->bd_contains) 1450 if (bdev != bdev->bd_contains)
1451 victim = bdev->bd_contains; 1451 victim = bdev->bd_contains;
1452 bdev->bd_contains = NULL; 1452 bdev->bd_contains = NULL;
1453
1454 put_disk(disk);
1455 module_put(owner);
1453 } 1456 }
1454 mutex_unlock(&bdev->bd_mutex); 1457 mutex_unlock(&bdev->bd_mutex);
1455 bdput(bdev); 1458 bdput(bdev);
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 40e6ac08c21..c0ddfd29c5e 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,6 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
7 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ 7 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \ 9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \
10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o 10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
11 reada.o backref.o
11 12
12btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index eb159aaa5a1..89b156d85d6 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
59 if (!value) 59 if (!value)
60 return ERR_PTR(-ENOMEM); 60 return ERR_PTR(-ENOMEM);
61 size = __btrfs_getxattr(inode, name, value, size); 61 size = __btrfs_getxattr(inode, name, value, size);
62 if (size > 0) { 62 }
63 acl = posix_acl_from_xattr(value, size); 63 if (size > 0) {
64 if (IS_ERR(acl)) { 64 acl = posix_acl_from_xattr(value, size);
65 kfree(value);
66 return acl;
67 }
68 set_cached_acl(inode, type, acl);
69 }
70 kfree(value);
71 } else if (size == -ENOENT || size == -ENODATA || size == 0) { 65 } else if (size == -ENOENT || size == -ENODATA || size == 0) {
72 /* FIXME, who returns -ENOENT? I think nobody */ 66 /* FIXME, who returns -ENOENT? I think nobody */
73 acl = NULL; 67 acl = NULL;
74 set_cached_acl(inode, type, acl);
75 } else { 68 } else {
76 acl = ERR_PTR(-EIO); 69 acl = ERR_PTR(-EIO);
77 } 70 }
71 kfree(value);
72
73 if (!IS_ERR(acl))
74 set_cached_acl(inode, type, acl);
78 75
79 return acl; 76 return acl;
80} 77}
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
new file mode 100644
index 00000000000..8855aad3929
--- /dev/null
+++ b/fs/btrfs/backref.c
@@ -0,0 +1,776 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "backref.h"
22
23struct __data_ref {
24 struct list_head list;
25 u64 inum;
26 u64 root;
27 u64 extent_data_item_offset;
28};
29
30struct __shared_ref {
31 struct list_head list;
32 u64 disk_byte;
33};
34
35static int __inode_info(u64 inum, u64 ioff, u8 key_type,
36 struct btrfs_root *fs_root, struct btrfs_path *path,
37 struct btrfs_key *found_key)
38{
39 int ret;
40 struct btrfs_key key;
41 struct extent_buffer *eb;
42
43 key.type = key_type;
44 key.objectid = inum;
45 key.offset = ioff;
46
47 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
48 if (ret < 0)
49 return ret;
50
51 eb = path->nodes[0];
52 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
53 ret = btrfs_next_leaf(fs_root, path);
54 if (ret)
55 return ret;
56 eb = path->nodes[0];
57 }
58
59 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
60 if (found_key->type != key.type || found_key->objectid != key.objectid)
61 return 1;
62
63 return 0;
64}
65
66/*
67 * this makes the path point to (inum INODE_ITEM ioff)
68 */
69int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
70 struct btrfs_path *path)
71{
72 struct btrfs_key key;
73 return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path,
74 &key);
75}
76
77static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
78 struct btrfs_path *path,
79 struct btrfs_key *found_key)
80{
81 return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path,
82 found_key);
83}
84
85/*
86 * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
87 * of the path are separated by '/' and the path is guaranteed to be
88 * 0-terminated. the path is only given within the current file system.
89 * Therefore, it never starts with a '/'. the caller is responsible to provide
90 * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
91 * the start point of the resulting string is returned. this pointer is within
92 * dest, normally.
93 * in case the path buffer would overflow, the pointer is decremented further
94 * as if output was written to the buffer, though no more output is actually
95 * generated. that way, the caller can determine how much space would be
96 * required for the path to fit into the buffer. in that case, the returned
97 * value will be smaller than dest. callers must check this!
98 */
99static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
100 struct btrfs_inode_ref *iref,
101 struct extent_buffer *eb_in, u64 parent,
102 char *dest, u32 size)
103{
104 u32 len;
105 int slot;
106 u64 next_inum;
107 int ret;
108 s64 bytes_left = size - 1;
109 struct extent_buffer *eb = eb_in;
110 struct btrfs_key found_key;
111
112 if (bytes_left >= 0)
113 dest[bytes_left] = '\0';
114
115 while (1) {
116 len = btrfs_inode_ref_name_len(eb, iref);
117 bytes_left -= len;
118 if (bytes_left >= 0)
119 read_extent_buffer(eb, dest + bytes_left,
120 (unsigned long)(iref + 1), len);
121 if (eb != eb_in)
122 free_extent_buffer(eb);
123 ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
124 if (ret)
125 break;
126 next_inum = found_key.offset;
127
128 /* regular exit ahead */
129 if (parent == next_inum)
130 break;
131
132 slot = path->slots[0];
133 eb = path->nodes[0];
134 /* make sure we can use eb after releasing the path */
135 if (eb != eb_in)
136 atomic_inc(&eb->refs);
137 btrfs_release_path(path);
138
139 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
140 parent = next_inum;
141 --bytes_left;
142 if (bytes_left >= 0)
143 dest[bytes_left] = '/';
144 }
145
146 btrfs_release_path(path);
147
148 if (ret)
149 return ERR_PTR(ret);
150
151 return dest + bytes_left;
152}
153
154/*
155 * this makes the path point to (logical EXTENT_ITEM *)
156 * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
157 * tree blocks and <0 on error.
158 */
159int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
160 struct btrfs_path *path, struct btrfs_key *found_key)
161{
162 int ret;
163 u64 flags;
164 u32 item_size;
165 struct extent_buffer *eb;
166 struct btrfs_extent_item *ei;
167 struct btrfs_key key;
168
169 key.type = BTRFS_EXTENT_ITEM_KEY;
170 key.objectid = logical;
171 key.offset = (u64)-1;
172
173 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
174 if (ret < 0)
175 return ret;
176 ret = btrfs_previous_item(fs_info->extent_root, path,
177 0, BTRFS_EXTENT_ITEM_KEY);
178 if (ret < 0)
179 return ret;
180
181 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
182 if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
183 found_key->objectid > logical ||
184 found_key->objectid + found_key->offset <= logical)
185 return -ENOENT;
186
187 eb = path->nodes[0];
188 item_size = btrfs_item_size_nr(eb, path->slots[0]);
189 BUG_ON(item_size < sizeof(*ei));
190
191 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
192 flags = btrfs_extent_flags(eb, ei);
193
194 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
195 return BTRFS_EXTENT_FLAG_TREE_BLOCK;
196 if (flags & BTRFS_EXTENT_FLAG_DATA)
197 return BTRFS_EXTENT_FLAG_DATA;
198
199 return -EIO;
200}
201
202/*
203 * helper function to iterate extent inline refs. ptr must point to a 0 value
204 * for the first call and may be modified. it is used to track state.
205 * if more refs exist, 0 is returned and the next call to
206 * __get_extent_inline_ref must pass the modified ptr parameter to get the
207 * next ref. after the last ref was processed, 1 is returned.
208 * returns <0 on error
209 */
210static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
211 struct btrfs_extent_item *ei, u32 item_size,
212 struct btrfs_extent_inline_ref **out_eiref,
213 int *out_type)
214{
215 unsigned long end;
216 u64 flags;
217 struct btrfs_tree_block_info *info;
218
219 if (!*ptr) {
220 /* first call */
221 flags = btrfs_extent_flags(eb, ei);
222 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
223 info = (struct btrfs_tree_block_info *)(ei + 1);
224 *out_eiref =
225 (struct btrfs_extent_inline_ref *)(info + 1);
226 } else {
227 *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
228 }
229 *ptr = (unsigned long)*out_eiref;
230 if ((void *)*ptr >= (void *)ei + item_size)
231 return -ENOENT;
232 }
233
234 end = (unsigned long)ei + item_size;
235 *out_eiref = (struct btrfs_extent_inline_ref *)*ptr;
236 *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
237
238 *ptr += btrfs_extent_inline_ref_size(*out_type);
239 WARN_ON(*ptr > end);
240 if (*ptr == end)
241 return 1; /* last */
242
243 return 0;
244}
245
246/*
247 * reads the tree block backref for an extent. tree level and root are returned
248 * through out_level and out_root. ptr must point to a 0 value for the first
249 * call and may be modified (see __get_extent_inline_ref comment).
250 * returns 0 if data was provided, 1 if there was no more data to provide or
251 * <0 on error.
252 */
253int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
254 struct btrfs_extent_item *ei, u32 item_size,
255 u64 *out_root, u8 *out_level)
256{
257 int ret;
258 int type;
259 struct btrfs_tree_block_info *info;
260 struct btrfs_extent_inline_ref *eiref;
261
262 if (*ptr == (unsigned long)-1)
263 return 1;
264
265 while (1) {
266 ret = __get_extent_inline_ref(ptr, eb, ei, item_size,
267 &eiref, &type);
268 if (ret < 0)
269 return ret;
270
271 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
272 type == BTRFS_SHARED_BLOCK_REF_KEY)
273 break;
274
275 if (ret == 1)
276 return 1;
277 }
278
279 /* we can treat both ref types equally here */
280 info = (struct btrfs_tree_block_info *)(ei + 1);
281 *out_root = btrfs_extent_inline_ref_offset(eb, eiref);
282 *out_level = btrfs_tree_block_level(eb, info);
283
284 if (ret == 1)
285 *ptr = (unsigned long)-1;
286
287 return 0;
288}
289
290static int __data_list_add(struct list_head *head, u64 inum,
291 u64 extent_data_item_offset, u64 root)
292{
293 struct __data_ref *ref;
294
295 ref = kmalloc(sizeof(*ref), GFP_NOFS);
296 if (!ref)
297 return -ENOMEM;
298
299 ref->inum = inum;
300 ref->extent_data_item_offset = extent_data_item_offset;
301 ref->root = root;
302 list_add_tail(&ref->list, head);
303
304 return 0;
305}
306
307static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
308 struct btrfs_extent_data_ref *dref)
309{
310 return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
311 btrfs_extent_data_ref_offset(eb, dref),
312 btrfs_extent_data_ref_root(eb, dref));
313}
314
315static int __shared_list_add(struct list_head *head, u64 disk_byte)
316{
317 struct __shared_ref *ref;
318
319 ref = kmalloc(sizeof(*ref), GFP_NOFS);
320 if (!ref)
321 return -ENOMEM;
322
323 ref->disk_byte = disk_byte;
324 list_add_tail(&ref->list, head);
325
326 return 0;
327}
328
329static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
330 u64 logical, u64 inum,
331 u64 extent_data_item_offset,
332 u64 extent_offset,
333 struct btrfs_path *path,
334 struct list_head *data_refs,
335 iterate_extent_inodes_t *iterate,
336 void *ctx)
337{
338 u64 ref_root;
339 u32 item_size;
340 struct btrfs_key key;
341 struct extent_buffer *eb;
342 struct btrfs_extent_item *ei;
343 struct btrfs_extent_inline_ref *eiref;
344 struct __data_ref *ref;
345 int ret;
346 int type;
347 int last;
348 unsigned long ptr = 0;
349
350 WARN_ON(!list_empty(data_refs));
351 ret = extent_from_logical(fs_info, logical, path, &key);
352 if (ret & BTRFS_EXTENT_FLAG_DATA)
353 ret = -EIO;
354 if (ret < 0)
355 goto out;
356
357 eb = path->nodes[0];
358 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
359 item_size = btrfs_item_size_nr(eb, path->slots[0]);
360
361 ret = 0;
362 ref_root = 0;
363 /*
364 * as done in iterate_extent_inodes, we first build a list of refs to
365 * iterate, then free the path and then iterate them to avoid deadlocks.
366 */
367 do {
368 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
369 &eiref, &type);
370 if (last < 0) {
371 ret = last;
372 goto out;
373 }
374 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
375 type == BTRFS_SHARED_BLOCK_REF_KEY) {
376 ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
377 ret = __data_list_add(data_refs, inum,
378 extent_data_item_offset,
379 ref_root);
380 }
381 } while (!ret && !last);
382
383 btrfs_release_path(path);
384
385 if (ref_root == 0) {
386 printk(KERN_ERR "btrfs: failed to find tree block ref "
387 "for shared data backref %llu\n", logical);
388 WARN_ON(1);
389 ret = -EIO;
390 }
391
392out:
393 while (!list_empty(data_refs)) {
394 ref = list_first_entry(data_refs, struct __data_ref, list);
395 list_del(&ref->list);
396 if (!ret)
397 ret = iterate(ref->inum, extent_offset +
398 ref->extent_data_item_offset,
399 ref->root, ctx);
400 kfree(ref);
401 }
402
403 return ret;
404}
405
406static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
407 u64 logical, u64 orig_extent_item_objectid,
408 u64 extent_offset, struct btrfs_path *path,
409 struct list_head *data_refs,
410 iterate_extent_inodes_t *iterate,
411 void *ctx)
412{
413 u64 disk_byte;
414 struct btrfs_key key;
415 struct btrfs_file_extent_item *fi;
416 struct extent_buffer *eb;
417 int slot;
418 int nritems;
419 int ret;
420 int found = 0;
421
422 eb = read_tree_block(fs_info->tree_root, logical,
423 fs_info->tree_root->leafsize, 0);
424 if (!eb)
425 return -EIO;
426
427 /*
428 * from the shared data ref, we only have the leaf but we need
429 * the key. thus, we must look into all items and see that we
430 * find one (some) with a reference to our extent item.
431 */
432 nritems = btrfs_header_nritems(eb);
433 for (slot = 0; slot < nritems; ++slot) {
434 btrfs_item_key_to_cpu(eb, &key, slot);
435 if (key.type != BTRFS_EXTENT_DATA_KEY)
436 continue;
437 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
438 if (!fi) {
439 free_extent_buffer(eb);
440 return -EIO;
441 }
442 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
443 if (disk_byte != orig_extent_item_objectid) {
444 if (found)
445 break;
446 else
447 continue;
448 }
449 ++found;
450 ret = __iter_shared_inline_ref_inodes(fs_info, logical,
451 key.objectid,
452 key.offset,
453 extent_offset, path,
454 data_refs,
455 iterate, ctx);
456 if (ret)
457 break;
458 }
459
460 if (!found) {
461 printk(KERN_ERR "btrfs: failed to follow shared data backref "
462 "to parent %llu\n", logical);
463 WARN_ON(1);
464 ret = -EIO;
465 }
466
467 free_extent_buffer(eb);
468 return ret;
469}
470
471/*
472 * calls iterate() for every inode that references the extent identified by
473 * the given parameters. will use the path given as a parameter and return it
474 * released.
475 * when the iterator function returns a non-zero value, iteration stops.
476 */
477int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
478 struct btrfs_path *path,
479 u64 extent_item_objectid,
480 u64 extent_offset,
481 iterate_extent_inodes_t *iterate, void *ctx)
482{
483 unsigned long ptr = 0;
484 int last;
485 int ret;
486 int type;
487 u64 logical;
488 u32 item_size;
489 struct btrfs_extent_inline_ref *eiref;
490 struct btrfs_extent_data_ref *dref;
491 struct extent_buffer *eb;
492 struct btrfs_extent_item *ei;
493 struct btrfs_key key;
494 struct list_head data_refs = LIST_HEAD_INIT(data_refs);
495 struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
496 struct __data_ref *ref_d;
497 struct __shared_ref *ref_s;
498
499 eb = path->nodes[0];
500 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
501 item_size = btrfs_item_size_nr(eb, path->slots[0]);
502
503 /* first we iterate the inline refs, ... */
504 do {
505 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
506 &eiref, &type);
507 if (last == -ENOENT) {
508 ret = 0;
509 break;
510 }
511 if (last < 0) {
512 ret = last;
513 break;
514 }
515
516 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
517 dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
518 ret = __data_list_add_eb(&data_refs, eb, dref);
519 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
520 logical = btrfs_extent_inline_ref_offset(eb, eiref);
521 ret = __shared_list_add(&shared_refs, logical);
522 }
523 } while (!ret && !last);
524
525 /* ... then we proceed to in-tree references and ... */
526 while (!ret) {
527 ++path->slots[0];
528 if (path->slots[0] > btrfs_header_nritems(eb)) {
529 ret = btrfs_next_leaf(fs_info->extent_root, path);
530 if (ret) {
531 if (ret == 1)
532 ret = 0; /* we're done */
533 break;
534 }
535 eb = path->nodes[0];
536 }
537 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
538 if (key.objectid != extent_item_objectid)
539 break;
540 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
541 dref = btrfs_item_ptr(eb, path->slots[0],
542 struct btrfs_extent_data_ref);
543 ret = __data_list_add_eb(&data_refs, eb, dref);
544 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
545 ret = __shared_list_add(&shared_refs, key.offset);
546 }
547 }
548
549 btrfs_release_path(path);
550
551 /*
552 * ... only at the very end we can process the refs we found. this is
553 * because the iterator function we call is allowed to make tree lookups
554 * and we have to avoid deadlocks. additionally, we need more tree
555 * lookups ourselves for shared data refs.
556 */
557 while (!list_empty(&data_refs)) {
558 ref_d = list_first_entry(&data_refs, struct __data_ref, list);
559 list_del(&ref_d->list);
560 if (!ret)
561 ret = iterate(ref_d->inum, extent_offset +
562 ref_d->extent_data_item_offset,
563 ref_d->root, ctx);
564 kfree(ref_d);
565 }
566
567 while (!list_empty(&shared_refs)) {
568 ref_s = list_first_entry(&shared_refs, struct __shared_ref,
569 list);
570 list_del(&ref_s->list);
571 if (!ret)
572 ret = __iter_shared_inline_ref(fs_info,
573 ref_s->disk_byte,
574 extent_item_objectid,
575 extent_offset, path,
576 &data_refs,
577 iterate, ctx);
578 kfree(ref_s);
579 }
580
581 return ret;
582}
583
584int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
585 struct btrfs_path *path,
586 iterate_extent_inodes_t *iterate, void *ctx)
587{
588 int ret;
589 u64 offset;
590 struct btrfs_key found_key;
591
592 ret = extent_from_logical(fs_info, logical, path,
593 &found_key);
594 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
595 ret = -EINVAL;
596 if (ret < 0)
597 return ret;
598
599 offset = logical - found_key.objectid;
600 ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
601 offset, iterate, ctx);
602
603 return ret;
604}
605
606static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
607 struct btrfs_path *path,
608 iterate_irefs_t *iterate, void *ctx)
609{
610 int ret;
611 int slot;
612 u32 cur;
613 u32 len;
614 u32 name_len;
615 u64 parent = 0;
616 int found = 0;
617 struct extent_buffer *eb;
618 struct btrfs_item *item;
619 struct btrfs_inode_ref *iref;
620 struct btrfs_key found_key;
621
622 while (1) {
623 ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
624 &found_key);
625 if (ret < 0)
626 break;
627 if (ret) {
628 ret = found ? 0 : -ENOENT;
629 break;
630 }
631 ++found;
632
633 parent = found_key.offset;
634 slot = path->slots[0];
635 eb = path->nodes[0];
636 /* make sure we can use eb after releasing the path */
637 atomic_inc(&eb->refs);
638 btrfs_release_path(path);
639
640 item = btrfs_item_nr(eb, slot);
641 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
642
643 for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
644 name_len = btrfs_inode_ref_name_len(eb, iref);
645 /* path must be released before calling iterate()! */
646 ret = iterate(parent, iref, eb, ctx);
647 if (ret) {
648 free_extent_buffer(eb);
649 break;
650 }
651 len = sizeof(*iref) + name_len;
652 iref = (struct btrfs_inode_ref *)((char *)iref + len);
653 }
654 free_extent_buffer(eb);
655 }
656
657 btrfs_release_path(path);
658
659 return ret;
660}
661
662/*
663 * returns 0 if the path could be dumped (probably truncated)
664 * returns <0 in case of an error
665 */
666static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
667 struct extent_buffer *eb, void *ctx)
668{
669 struct inode_fs_paths *ipath = ctx;
670 char *fspath;
671 char *fspath_min;
672 int i = ipath->fspath->elem_cnt;
673 const int s_ptr = sizeof(char *);
674 u32 bytes_left;
675
676 bytes_left = ipath->fspath->bytes_left > s_ptr ?
677 ipath->fspath->bytes_left - s_ptr : 0;
678
679 fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
680 fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
681 inum, fspath_min, bytes_left);
682 if (IS_ERR(fspath))
683 return PTR_ERR(fspath);
684
685 if (fspath > fspath_min) {
686 ipath->fspath->val[i] = (u64)fspath;
687 ++ipath->fspath->elem_cnt;
688 ipath->fspath->bytes_left = fspath - fspath_min;
689 } else {
690 ++ipath->fspath->elem_missed;
691 ipath->fspath->bytes_missing += fspath_min - fspath;
692 ipath->fspath->bytes_left = 0;
693 }
694
695 return 0;
696}
697
/*
 * this dumps all file system paths to the inode into the ipath struct, provided
 * it has been created large enough. each path is zero-terminated and accessed
 * from ipath->fspath->val[i].
 * when it returns, there are ipath->fspath->elem_cnt number of paths available
 * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
 * number of missed paths is recorded in ipath->fspath->elem_missed, otherwise,
 * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
 * have been needed to return all paths.
 */
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
{
	return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
				inode_to_path, ipath);
}
713
/*
 * allocates a container to return multiple values (e.g. file system paths)
 * to user space.
 * total_bytes to allocate are passed; note that space usable for the actual
 * payload will be total_bytes - sizeof(struct btrfs_data_container).
 * the returned pointer must be freed by the caller with kfree() in the end.
 */
struct btrfs_data_container *init_data_container(u32 total_bytes)
{
	struct btrfs_data_container *data;
	size_t alloc_bytes;

	/* always allocate at least the header, even if total_bytes is less */
	alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
	data = kmalloc(alloc_bytes, GFP_NOFS);
	if (!data)
		return ERR_PTR(-ENOMEM);

	if (total_bytes >= sizeof(*data)) {
		data->bytes_left = total_bytes - sizeof(*data);
		data->bytes_missing = 0;
	} else {
		/* requested size can't even hold the header */
		data->bytes_missing = sizeof(*data) - total_bytes;
		data->bytes_left = 0;
	}

	data->elem_cnt = 0;
	data->elem_missed = 0;

	return data;
}
743
744/*
745 * allocates space to return multiple file system paths for an inode.
746 * total_bytes to allocate are passed, note that space usable for actual path
747 * information will be total_bytes - sizeof(struct inode_fs_paths).
748 * the returned pointer must be freed with free_ipath() in the end.
749 */
750struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
751 struct btrfs_path *path)
752{
753 struct inode_fs_paths *ifp;
754 struct btrfs_data_container *fspath;
755
756 fspath = init_data_container(total_bytes);
757 if (IS_ERR(fspath))
758 return (void *)fspath;
759
760 ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
761 if (!ifp) {
762 kfree(fspath);
763 return ERR_PTR(-ENOMEM);
764 }
765
766 ifp->btrfs_path = path;
767 ifp->fspath = fspath;
768 ifp->fs_root = fs_root;
769
770 return ifp;
771}
772
773void free_ipath(struct inode_fs_paths *ipath)
774{
775 kfree(ipath);
776}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
new file mode 100644
index 00000000000..92618837cb8
--- /dev/null
+++ b/fs/btrfs/backref.h
@@ -0,0 +1,62 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_BACKREF__
20#define __BTRFS_BACKREF__
21
22#include "ioctl.h"
23
24struct inode_fs_paths {
25 struct btrfs_path *btrfs_path;
26 struct btrfs_root *fs_root;
27 struct btrfs_data_container *fspath;
28};
29
30typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
31 void *ctx);
32typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
33 struct extent_buffer *eb, void *ctx);
34
35int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
36 struct btrfs_path *path);
37
38int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
39 struct btrfs_path *path, struct btrfs_key *found_key);
40
41int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
42 struct btrfs_extent_item *ei, u32 item_size,
43 u64 *out_root, u8 *out_level);
44
45int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
46 struct btrfs_path *path,
47 u64 extent_item_objectid,
48 u64 extent_offset,
49 iterate_extent_inodes_t *iterate, void *ctx);
50
51int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
52 struct btrfs_path *path,
53 iterate_extent_inodes_t *iterate, void *ctx);
54
55int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
56
57struct btrfs_data_container *init_data_container(u32 total_bytes);
58struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
59 struct btrfs_path *path);
60void free_ipath(struct inode_fs_paths *ipath);
61
62#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d9f99a16edd..634608d2a6d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -103,11 +103,6 @@ struct btrfs_inode {
103 */ 103 */
104 u64 delalloc_bytes; 104 u64 delalloc_bytes;
105 105
106 /* total number of bytes that may be used for this inode for
107 * delalloc
108 */
109 u64 reserved_bytes;
110
111 /* 106 /*
112 * the size of the file stored in the metadata on disk. data=ordered 107 * the size of the file stored in the metadata on disk. data=ordered
113 * means the in-memory i_size might be larger than the size on disk 108 * means the in-memory i_size might be larger than the size on disk
@@ -115,9 +110,6 @@ struct btrfs_inode {
115 */ 110 */
116 u64 disk_i_size; 111 u64 disk_i_size;
117 112
118 /* flags field from the on disk inode */
119 u32 flags;
120
121 /* 113 /*
122 * if this is a directory then index_cnt is the counter for the index 114 * if this is a directory then index_cnt is the counter for the index
123 * number for new files that are created 115 * number for new files that are created
@@ -132,6 +124,15 @@ struct btrfs_inode {
132 u64 last_unlink_trans; 124 u64 last_unlink_trans;
133 125
134 /* 126 /*
127 * Number of bytes outstanding that are going to need csums. This is
128 * used in ENOSPC accounting.
129 */
130 u64 csum_bytes;
131
132 /* flags field from the on disk inode */
133 u32 flags;
134
135 /*
135 * Counters to keep track of the number of extent item's we may use due 136 * Counters to keep track of the number of extent item's we may use due
136 * to delalloc and such. outstanding_extents is the number of extent 137 * to delalloc and such. outstanding_extents is the number of extent
137 * items we think we'll end up using, and reserved_extents is the number 138 * items we think we'll end up using, and reserved_extents is the number
@@ -146,14 +147,12 @@ struct btrfs_inode {
146 * the btrfs file release call will add this inode to the 147 * the btrfs file release call will add this inode to the
147 * ordered operations list so that we make sure to flush out any 148 * ordered operations list so that we make sure to flush out any
148 * new data the application may have written before commit. 149 * new data the application may have written before commit.
149 *
150 * yes, its silly to have a single bitflag, but we might grow more
151 * of these.
152 */ 150 */
153 unsigned ordered_data_close:1; 151 unsigned ordered_data_close:1;
154 unsigned orphan_meta_reserved:1; 152 unsigned orphan_meta_reserved:1;
155 unsigned dummy_inode:1; 153 unsigned dummy_inode:1;
156 unsigned in_defrag:1; 154 unsigned in_defrag:1;
155 unsigned delalloc_meta_reserved:1;
157 156
158 /* 157 /*
159 * always compress this one file 158 * always compress this one file
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8ec5d86f173..14f1c5a0b2d 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -85,7 +85,8 @@ struct compressed_bio {
85static inline int compressed_bio_size(struct btrfs_root *root, 85static inline int compressed_bio_size(struct btrfs_root *root,
86 unsigned long disk_size) 86 unsigned long disk_size)
87{ 87{
88 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); 88 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
89
89 return sizeof(struct compressed_bio) + 90 return sizeof(struct compressed_bio) +
90 ((disk_size + root->sectorsize - 1) / root->sectorsize) * 91 ((disk_size + root->sectorsize - 1) / root->sectorsize) *
91 csum_size; 92 csum_size;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 011cab3aca8..0fe615e4ea3 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -902,9 +902,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
902 902
903 orig_ptr = btrfs_node_blockptr(mid, orig_slot); 903 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
904 904
905 if (level < BTRFS_MAX_LEVEL - 1) 905 if (level < BTRFS_MAX_LEVEL - 1) {
906 parent = path->nodes[level + 1]; 906 parent = path->nodes[level + 1];
907 pslot = path->slots[level + 1]; 907 pslot = path->slots[level + 1];
908 }
908 909
909 /* 910 /*
910 * deal with the case where there is only one pointer in the root 911 * deal with the case where there is only one pointer in the root
@@ -1107,9 +1108,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1107 mid = path->nodes[level]; 1108 mid = path->nodes[level];
1108 WARN_ON(btrfs_header_generation(mid) != trans->transid); 1109 WARN_ON(btrfs_header_generation(mid) != trans->transid);
1109 1110
1110 if (level < BTRFS_MAX_LEVEL - 1) 1111 if (level < BTRFS_MAX_LEVEL - 1) {
1111 parent = path->nodes[level + 1]; 1112 parent = path->nodes[level + 1];
1112 pslot = path->slots[level + 1]; 1113 pslot = path->slots[level + 1];
1114 }
1113 1115
1114 if (!parent) 1116 if (!parent)
1115 return 1; 1117 return 1;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 03912c5c6f4..b9ba59ff929 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -30,6 +30,7 @@
30#include <linux/kobject.h> 30#include <linux/kobject.h>
31#include <trace/events/btrfs.h> 31#include <trace/events/btrfs.h>
32#include <asm/kmap_types.h> 32#include <asm/kmap_types.h>
33#include <linux/pagemap.h>
33#include "extent_io.h" 34#include "extent_io.h"
34#include "extent_map.h" 35#include "extent_map.h"
35#include "async-thread.h" 36#include "async-thread.h"
@@ -360,6 +361,47 @@ struct btrfs_header {
360#define BTRFS_LABEL_SIZE 256 361#define BTRFS_LABEL_SIZE 256
361 362
362/* 363/*
364 * just in case we somehow lose the roots and are not able to mount,
365 * we store an array of the roots from previous transactions
366 * in the super.
367 */
368#define BTRFS_NUM_BACKUP_ROOTS 4
369struct btrfs_root_backup {
370 __le64 tree_root;
371 __le64 tree_root_gen;
372
373 __le64 chunk_root;
374 __le64 chunk_root_gen;
375
376 __le64 extent_root;
377 __le64 extent_root_gen;
378
379 __le64 fs_root;
380 __le64 fs_root_gen;
381
382 __le64 dev_root;
383 __le64 dev_root_gen;
384
385 __le64 csum_root;
386 __le64 csum_root_gen;
387
388 __le64 total_bytes;
389 __le64 bytes_used;
390 __le64 num_devices;
391 /* future */
392 __le64 unsed_64[4];
393
394 u8 tree_root_level;
395 u8 chunk_root_level;
396 u8 extent_root_level;
397 u8 fs_root_level;
398 u8 dev_root_level;
399 u8 csum_root_level;
400 /* future and to align */
401 u8 unused_8[10];
402} __attribute__ ((__packed__));
403
404/*
363 * the super block basically lists the main trees of the FS 405 * the super block basically lists the main trees of the FS
364 * it currently lacks any block count etc etc 406 * it currently lacks any block count etc etc
365 */ 407 */
@@ -405,6 +447,7 @@ struct btrfs_super_block {
405 /* future expansion */ 447 /* future expansion */
406 __le64 reserved[31]; 448 __le64 reserved[31];
407 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; 449 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
450 struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
408} __attribute__ ((__packed__)); 451} __attribute__ ((__packed__));
409 452
410/* 453/*
@@ -772,14 +815,8 @@ struct btrfs_space_info {
772struct btrfs_block_rsv { 815struct btrfs_block_rsv {
773 u64 size; 816 u64 size;
774 u64 reserved; 817 u64 reserved;
775 u64 freed[2];
776 struct btrfs_space_info *space_info; 818 struct btrfs_space_info *space_info;
777 struct list_head list;
778 spinlock_t lock; 819 spinlock_t lock;
779 atomic_t usage;
780 unsigned int priority:8;
781 unsigned int durable:1;
782 unsigned int refill_used:1;
783 unsigned int full:1; 820 unsigned int full:1;
784}; 821};
785 822
@@ -840,10 +877,10 @@ struct btrfs_block_group_cache {
840 spinlock_t lock; 877 spinlock_t lock;
841 u64 pinned; 878 u64 pinned;
842 u64 reserved; 879 u64 reserved;
843 u64 reserved_pinned;
844 u64 bytes_super; 880 u64 bytes_super;
845 u64 flags; 881 u64 flags;
846 u64 sectorsize; 882 u64 sectorsize;
883 u64 cache_generation;
847 unsigned int ro:1; 884 unsigned int ro:1;
848 unsigned int dirty:1; 885 unsigned int dirty:1;
849 unsigned int iref:1; 886 unsigned int iref:1;
@@ -899,6 +936,10 @@ struct btrfs_fs_info {
899 spinlock_t block_group_cache_lock; 936 spinlock_t block_group_cache_lock;
900 struct rb_root block_group_cache_tree; 937 struct rb_root block_group_cache_tree;
901 938
939 /* keep track of unallocated space */
940 spinlock_t free_chunk_lock;
941 u64 free_chunk_space;
942
902 struct extent_io_tree freed_extents[2]; 943 struct extent_io_tree freed_extents[2];
903 struct extent_io_tree *pinned_extents; 944 struct extent_io_tree *pinned_extents;
904 945
@@ -916,14 +957,11 @@ struct btrfs_fs_info {
916 struct btrfs_block_rsv trans_block_rsv; 957 struct btrfs_block_rsv trans_block_rsv;
917 /* block reservation for chunk tree */ 958 /* block reservation for chunk tree */
918 struct btrfs_block_rsv chunk_block_rsv; 959 struct btrfs_block_rsv chunk_block_rsv;
960 /* block reservation for delayed operations */
961 struct btrfs_block_rsv delayed_block_rsv;
919 962
920 struct btrfs_block_rsv empty_block_rsv; 963 struct btrfs_block_rsv empty_block_rsv;
921 964
922 /* list of block reservations that cross multiple transactions */
923 struct list_head durable_block_rsv_list;
924
925 struct mutex durable_block_rsv_mutex;
926
927 u64 generation; 965 u64 generation;
928 u64 last_trans_committed; 966 u64 last_trans_committed;
929 967
@@ -942,8 +980,8 @@ struct btrfs_fs_info {
942 wait_queue_head_t transaction_blocked_wait; 980 wait_queue_head_t transaction_blocked_wait;
943 wait_queue_head_t async_submit_wait; 981 wait_queue_head_t async_submit_wait;
944 982
945 struct btrfs_super_block super_copy; 983 struct btrfs_super_block *super_copy;
946 struct btrfs_super_block super_for_commit; 984 struct btrfs_super_block *super_for_commit;
947 struct block_device *__bdev; 985 struct block_device *__bdev;
948 struct super_block *sb; 986 struct super_block *sb;
949 struct inode *btree_inode; 987 struct inode *btree_inode;
@@ -1036,6 +1074,7 @@ struct btrfs_fs_info {
1036 struct btrfs_workers endio_freespace_worker; 1074 struct btrfs_workers endio_freespace_worker;
1037 struct btrfs_workers submit_workers; 1075 struct btrfs_workers submit_workers;
1038 struct btrfs_workers caching_workers; 1076 struct btrfs_workers caching_workers;
1077 struct btrfs_workers readahead_workers;
1039 1078
1040 /* 1079 /*
1041 * fixup workers take dirty pages that didn't properly go through 1080 * fixup workers take dirty pages that didn't properly go through
@@ -1119,6 +1158,13 @@ struct btrfs_fs_info {
1119 u64 fs_state; 1158 u64 fs_state;
1120 1159
1121 struct btrfs_delayed_root *delayed_root; 1160 struct btrfs_delayed_root *delayed_root;
1161
1162 /* readahead tree */
1163 spinlock_t reada_lock;
1164 struct radix_tree_root reada_tree;
1165
1166 /* next backup root to be overwritten */
1167 int backup_root_index;
1122}; 1168};
1123 1169
1124/* 1170/*
@@ -1363,6 +1409,7 @@ struct btrfs_ioctl_defrag_range_args {
1363#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) 1409#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
1364#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) 1410#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
1365#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) 1411#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
1412#define BTRFS_MOUNT_RECOVERY (1 << 18)
1366 1413
1367#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1414#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1368#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1415#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1978,6 +2025,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)
1978 return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; 2025 return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
1979} 2026}
1980 2027
2028/* struct btrfs_root_backup */
2029BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
2030 tree_root, 64);
2031BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
2032 tree_root_gen, 64);
2033BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
2034 tree_root_level, 8);
2035
2036BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
2037 chunk_root, 64);
2038BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
2039 chunk_root_gen, 64);
2040BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
2041 chunk_root_level, 8);
2042
2043BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
2044 extent_root, 64);
2045BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
2046 extent_root_gen, 64);
2047BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
2048 extent_root_level, 8);
2049
2050BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
2051 fs_root, 64);
2052BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
2053 fs_root_gen, 64);
2054BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
2055 fs_root_level, 8);
2056
2057BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
2058 dev_root, 64);
2059BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
2060 dev_root_gen, 64);
2061BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
2062 dev_root_level, 8);
2063
2064BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
2065 csum_root, 64);
2066BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
2067 csum_root_gen, 64);
2068BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
2069 csum_root_level, 8);
2070BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
2071 total_bytes, 64);
2072BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
2073 bytes_used, 64);
2074BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
2075 num_devices, 64);
2076
1981/* struct btrfs_super_block */ 2077/* struct btrfs_super_block */
1982 2078
1983BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); 2079BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2129,6 +2225,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
2129 (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); 2225 (space_info->flags & BTRFS_BLOCK_GROUP_DATA));
2130} 2226}
2131 2227
2228static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
2229{
2230 return mapping_gfp_mask(mapping) & ~__GFP_FS;
2231}
2232
2132/* extent-tree.c */ 2233/* extent-tree.c */
2133static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, 2234static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
2134 unsigned num_items) 2235 unsigned num_items)
@@ -2137,6 +2238,17 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
2137 3 * num_items; 2238 3 * num_items;
2138} 2239}
2139 2240
2241/*
2242 * Doing a truncate won't result in new nodes or leaves, just what we need for
2243 * COW.
2244 */
2245static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
2246 unsigned num_items)
2247{
2248 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
2249 num_items;
2250}
2251
2140void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 2252void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
2141int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2253int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2142 struct btrfs_root *root, unsigned long count); 2254 struct btrfs_root *root, unsigned long count);
@@ -2146,6 +2258,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
2146 u64 num_bytes, u64 *refs, u64 *flags); 2258 u64 num_bytes, u64 *refs, u64 *flags);
2147int btrfs_pin_extent(struct btrfs_root *root, 2259int btrfs_pin_extent(struct btrfs_root *root,
2148 u64 bytenr, u64 num, int reserved); 2260 u64 bytenr, u64 num, int reserved);
2261int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2262 struct btrfs_root *root,
2263 u64 bytenr, u64 num_bytes);
2149int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 2264int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2150 struct btrfs_root *root, 2265 struct btrfs_root *root,
2151 u64 objectid, u64 offset, u64 bytenr); 2266 u64 objectid, u64 offset, u64 bytenr);
@@ -2196,8 +2311,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
2196 u64 root_objectid, u64 owner, u64 offset); 2311 u64 root_objectid, u64 owner, u64 offset);
2197 2312
2198int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 2313int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
2199int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 2314int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
2200 u64 num_bytes, int reserve, int sinfo); 2315 u64 start, u64 len);
2201int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 2316int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
2202 struct btrfs_root *root); 2317 struct btrfs_root *root);
2203int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 2318int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -2240,25 +2355,23 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
2240struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); 2355struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
2241void btrfs_free_block_rsv(struct btrfs_root *root, 2356void btrfs_free_block_rsv(struct btrfs_root *root,
2242 struct btrfs_block_rsv *rsv); 2357 struct btrfs_block_rsv *rsv);
2243void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, 2358int btrfs_block_rsv_add(struct btrfs_root *root,
2244 struct btrfs_block_rsv *rsv);
2245int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
2246 struct btrfs_root *root,
2247 struct btrfs_block_rsv *block_rsv, 2359 struct btrfs_block_rsv *block_rsv,
2248 u64 num_bytes); 2360 u64 num_bytes);
2249int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 2361int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
2250 struct btrfs_root *root, 2362 struct btrfs_block_rsv *block_rsv,
2363 u64 num_bytes);
2364int btrfs_block_rsv_check(struct btrfs_root *root,
2365 struct btrfs_block_rsv *block_rsv, int min_factor);
2366int btrfs_block_rsv_refill(struct btrfs_root *root,
2251 struct btrfs_block_rsv *block_rsv, 2367 struct btrfs_block_rsv *block_rsv,
2252 u64 min_reserved, int min_factor); 2368 u64 min_reserved);
2253int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 2369int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2254 struct btrfs_block_rsv *dst_rsv, 2370 struct btrfs_block_rsv *dst_rsv,
2255 u64 num_bytes); 2371 u64 num_bytes);
2256void btrfs_block_rsv_release(struct btrfs_root *root, 2372void btrfs_block_rsv_release(struct btrfs_root *root,
2257 struct btrfs_block_rsv *block_rsv, 2373 struct btrfs_block_rsv *block_rsv,
2258 u64 num_bytes); 2374 u64 num_bytes);
2259int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
2260 struct btrfs_root *root,
2261 struct btrfs_block_rsv *rsv);
2262int btrfs_set_block_group_ro(struct btrfs_root *root, 2375int btrfs_set_block_group_ro(struct btrfs_root *root,
2263 struct btrfs_block_group_cache *cache); 2376 struct btrfs_block_group_cache *cache);
2264int btrfs_set_block_group_rw(struct btrfs_root *root, 2377int btrfs_set_block_group_rw(struct btrfs_root *root,
@@ -2379,6 +2492,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
2379 smp_mb(); 2492 smp_mb();
2380 return fs_info->closing; 2493 return fs_info->closing;
2381} 2494}
2495static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2496{
2497 kfree(fs_info->delayed_root);
2498 kfree(fs_info->extent_root);
2499 kfree(fs_info->tree_root);
2500 kfree(fs_info->chunk_root);
2501 kfree(fs_info->dev_root);
2502 kfree(fs_info->csum_root);
2503 kfree(fs_info->super_copy);
2504 kfree(fs_info->super_for_commit);
2505 kfree(fs_info);
2506}
2382 2507
2383/* root-item.c */ 2508/* root-item.c */
2384int btrfs_find_root_ref(struct btrfs_root *tree_root, 2509int btrfs_find_root_ref(struct btrfs_root *tree_root,
@@ -2579,11 +2704,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
2579int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 2704int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2580int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2705int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2581int btrfs_orphan_cleanup(struct btrfs_root *root); 2706int btrfs_orphan_cleanup(struct btrfs_root *root);
2582void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2583 struct btrfs_pending_snapshot *pending,
2584 u64 *bytes_to_reserve);
2585void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2586 struct btrfs_pending_snapshot *pending);
2587void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 2707void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2588 struct btrfs_root *root); 2708 struct btrfs_root *root);
2589int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); 2709int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
@@ -2697,4 +2817,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
2697int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 2817int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2698 struct btrfs_scrub_progress *progress); 2818 struct btrfs_scrub_progress *progress);
2699 2819
2820/* reada.c */
2821struct reada_control {
2822 struct btrfs_root *root; /* tree to prefetch */
2823 struct btrfs_key key_start;
2824 struct btrfs_key key_end; /* exclusive */
2825 atomic_t elems;
2826 struct kref refcnt;
2827 wait_queue_head_t wait;
2828};
2829struct reada_control *btrfs_reada_add(struct btrfs_root *root,
2830 struct btrfs_key *start, struct btrfs_key *end);
2831int btrfs_reada_wait(void *handle);
2832void btrfs_reada_detach(void *handle);
2833int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
2834 u64 start, int err);
2835
2700#endif 2836#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index b52c672f4c1..5b163572e0c 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
591 return 0; 591 return 0;
592 592
593 src_rsv = trans->block_rsv; 593 src_rsv = trans->block_rsv;
594 dst_rsv = &root->fs_info->global_block_rsv; 594 dst_rsv = &root->fs_info->delayed_block_rsv;
595 595
596 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 596 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
@@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
609 if (!item->bytes_reserved) 609 if (!item->bytes_reserved)
610 return; 610 return;
611 611
612 rsv = &root->fs_info->global_block_rsv; 612 rsv = &root->fs_info->delayed_block_rsv;
613 btrfs_block_rsv_release(root, rsv, 613 btrfs_block_rsv_release(root, rsv,
614 item->bytes_reserved); 614 item->bytes_reserved);
615} 615}
@@ -617,24 +617,102 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
617static int btrfs_delayed_inode_reserve_metadata( 617static int btrfs_delayed_inode_reserve_metadata(
618 struct btrfs_trans_handle *trans, 618 struct btrfs_trans_handle *trans,
619 struct btrfs_root *root, 619 struct btrfs_root *root,
620 struct inode *inode,
620 struct btrfs_delayed_node *node) 621 struct btrfs_delayed_node *node)
621{ 622{
622 struct btrfs_block_rsv *src_rsv; 623 struct btrfs_block_rsv *src_rsv;
623 struct btrfs_block_rsv *dst_rsv; 624 struct btrfs_block_rsv *dst_rsv;
624 u64 num_bytes; 625 u64 num_bytes;
625 int ret; 626 int ret;
626 627 int release = false;
627 if (!trans->bytes_reserved)
628 return 0;
629 628
630 src_rsv = trans->block_rsv; 629 src_rsv = trans->block_rsv;
631 dst_rsv = &root->fs_info->global_block_rsv; 630 dst_rsv = &root->fs_info->delayed_block_rsv;
632 631
633 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 632 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
633
634 /*
635 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
636 * which doesn't reserve space for speed. This is a problem since we
637 * still need to reserve space for this update, so try to reserve the
638 * space.
639 *
640 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
641 * we're accounted for.
642 */
643 if (!trans->bytes_reserved &&
644 src_rsv != &root->fs_info->delalloc_block_rsv) {
645 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
646 /*
647 * Since we're under a transaction reserve_metadata_bytes could
648 * try to commit the transaction which will make it return
649 * EAGAIN to make us stop the transaction we have, so return
650 * ENOSPC instead so that btrfs_dirty_inode knows what to do.
651 */
652 if (ret == -EAGAIN)
653 ret = -ENOSPC;
654 if (!ret)
655 node->bytes_reserved = num_bytes;
656 return ret;
657 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
658 spin_lock(&BTRFS_I(inode)->lock);
659 if (BTRFS_I(inode)->delalloc_meta_reserved) {
660 BTRFS_I(inode)->delalloc_meta_reserved = 0;
661 spin_unlock(&BTRFS_I(inode)->lock);
662 release = true;
663 goto migrate;
664 }
665 spin_unlock(&BTRFS_I(inode)->lock);
666
667 /* Ok we didn't have space pre-reserved. This shouldn't happen
668 * too often but it can happen if we do delalloc to an existing
669 * inode which gets dirtied because of the time update, and then
670 * isn't touched again until after the transaction commits and
671 * then we try to write out the data. First try to be nice and
672 * reserve something strictly for us. If not be a pain and try
673 * to steal from the delalloc block rsv.
674 */
675 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
676 if (!ret)
677 goto out;
678
679 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
680 if (!ret)
681 goto out;
682
683 /*
684 * Ok this is a problem, let's just steal from the global rsv
685 * since this really shouldn't happen that often.
686 */
687 WARN_ON(1);
688 ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
689 dst_rsv, num_bytes);
690 goto out;
691 }
692
693migrate:
634 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 694 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
695
696out:
697 /*
698 * Migrate only takes a reservation, it doesn't touch the size of the
699 * block_rsv. This is to simplify people who don't normally have things
700 * migrated from their block rsv. If they go to release their
701 * reservation, that will decrease the size as well, so if migrate
702 * reduced size we'd end up with a negative size. But for the
703 * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
704 * but we could in fact do this reserve/migrate dance several times
705 * between the time we did the original reservation and we'd clean it
706 * up. So to take care of this, release the space for the meta
707 * reservation here. I think it may be time for a documentation page on
708 * how block rsvs. work.
709 */
635 if (!ret) 710 if (!ret)
636 node->bytes_reserved = num_bytes; 711 node->bytes_reserved = num_bytes;
637 712
713 if (release)
714 btrfs_block_rsv_release(root, src_rsv, num_bytes);
715
638 return ret; 716 return ret;
639} 717}
640 718
@@ -646,7 +724,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
646 if (!node->bytes_reserved) 724 if (!node->bytes_reserved)
647 return; 725 return;
648 726
649 rsv = &root->fs_info->global_block_rsv; 727 rsv = &root->fs_info->delayed_block_rsv;
650 btrfs_block_rsv_release(root, rsv, 728 btrfs_block_rsv_release(root, rsv,
651 node->bytes_reserved); 729 node->bytes_reserved);
652 node->bytes_reserved = 0; 730 node->bytes_reserved = 0;
@@ -1026,7 +1104,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1026 path->leave_spinning = 1; 1104 path->leave_spinning = 1;
1027 1105
1028 block_rsv = trans->block_rsv; 1106 block_rsv = trans->block_rsv;
1029 trans->block_rsv = &root->fs_info->global_block_rsv; 1107 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1030 1108
1031 delayed_root = btrfs_get_delayed_root(root); 1109 delayed_root = btrfs_get_delayed_root(root);
1032 1110
@@ -1069,7 +1147,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1069 path->leave_spinning = 1; 1147 path->leave_spinning = 1;
1070 1148
1071 block_rsv = trans->block_rsv; 1149 block_rsv = trans->block_rsv;
1072 trans->block_rsv = &node->root->fs_info->global_block_rsv; 1150 trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
1073 1151
1074 ret = btrfs_insert_delayed_items(trans, path, node->root, node); 1152 ret = btrfs_insert_delayed_items(trans, path, node->root, node);
1075 if (!ret) 1153 if (!ret)
@@ -1149,7 +1227,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1149 goto free_path; 1227 goto free_path;
1150 1228
1151 block_rsv = trans->block_rsv; 1229 block_rsv = trans->block_rsv;
1152 trans->block_rsv = &root->fs_info->global_block_rsv; 1230 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1153 1231
1154 ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); 1232 ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
1155 if (!ret) 1233 if (!ret)
@@ -1641,7 +1719,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
1641 inode->i_gid = btrfs_stack_inode_gid(inode_item); 1719 inode->i_gid = btrfs_stack_inode_gid(inode_item);
1642 btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); 1720 btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
1643 inode->i_mode = btrfs_stack_inode_mode(inode_item); 1721 inode->i_mode = btrfs_stack_inode_mode(inode_item);
1644 inode->i_nlink = btrfs_stack_inode_nlink(inode_item); 1722 set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
1645 inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); 1723 inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
1646 BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); 1724 BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
1647 BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); 1725 BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item);
@@ -1685,12 +1763,10 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
1685 goto release_node; 1763 goto release_node;
1686 } 1764 }
1687 1765
1688 ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); 1766 ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
1689 /* 1767 delayed_node);
1690 * we must reserve enough space when we start a new transaction, 1768 if (ret)
1691 * so reserving metadata failure is impossible 1769 goto release_node;
1692 */
1693 BUG_ON(ret);
1694 1770
1695 fill_stack_inode_item(trans, &delayed_node->inode_item, inode); 1771 fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
1696 delayed_node->inode_dirty = 1; 1772 delayed_node->inode_dirty = 1;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 07b3ac662e1..62afe5c5694 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result)
256static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, 256static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
257 int verify) 257 int verify)
258{ 258{
259 u16 csum_size = 259 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
260 btrfs_super_csum_size(&root->fs_info->super_copy);
261 char *result = NULL; 260 char *result = NULL;
262 unsigned long len; 261 unsigned long len;
263 unsigned long cur_len; 262 unsigned long cur_len;
@@ -367,7 +366,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
367 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); 366 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
368 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 367 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
369 while (1) { 368 while (1) {
370 ret = read_extent_buffer_pages(io_tree, eb, start, 1, 369 ret = read_extent_buffer_pages(io_tree, eb, start,
370 WAIT_COMPLETE,
371 btree_get_extent, mirror_num); 371 btree_get_extent, mirror_num);
372 if (!ret && 372 if (!ret &&
373 !verify_parent_transid(io_tree, eb, parent_transid)) 373 !verify_parent_transid(io_tree, eb, parent_transid))
@@ -608,11 +608,48 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
608 end = min_t(u64, eb->len, PAGE_CACHE_SIZE); 608 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
609 end = eb->start + end - 1; 609 end = eb->start + end - 1;
610err: 610err:
611 if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
612 clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
613 btree_readahead_hook(root, eb, eb->start, ret);
614 }
615
611 free_extent_buffer(eb); 616 free_extent_buffer(eb);
612out: 617out:
613 return ret; 618 return ret;
614} 619}
615 620
621static int btree_io_failed_hook(struct bio *failed_bio,
622 struct page *page, u64 start, u64 end,
623 u64 mirror_num, struct extent_state *state)
624{
625 struct extent_io_tree *tree;
626 unsigned long len;
627 struct extent_buffer *eb;
628 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
629
630 tree = &BTRFS_I(page->mapping->host)->io_tree;
631 if (page->private == EXTENT_PAGE_PRIVATE)
632 goto out;
633 if (!page->private)
634 goto out;
635
636 len = page->private >> 2;
637 WARN_ON(len == 0);
638
639 eb = alloc_extent_buffer(tree, start, len, page);
640 if (eb == NULL)
641 goto out;
642
643 if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
644 clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
645 btree_readahead_hook(root, eb, eb->start, -EIO);
646 }
647 free_extent_buffer(eb);
648
649out:
650 return -EIO; /* we fixed nothing */
651}
652
616static void end_workqueue_bio(struct bio *bio, int err) 653static void end_workqueue_bio(struct bio *bio, int err)
617{ 654{
618 struct end_io_wq *end_io_wq = bio->bi_private; 655 struct end_io_wq *end_io_wq = bio->bi_private;
@@ -908,7 +945,7 @@ static int btree_readpage(struct file *file, struct page *page)
908{ 945{
909 struct extent_io_tree *tree; 946 struct extent_io_tree *tree;
910 tree = &BTRFS_I(page->mapping->host)->io_tree; 947 tree = &BTRFS_I(page->mapping->host)->io_tree;
911 return extent_read_full_page(tree, page, btree_get_extent); 948 return extent_read_full_page(tree, page, btree_get_extent, 0);
912} 949}
913 950
914static int btree_releasepage(struct page *page, gfp_t gfp_flags) 951static int btree_releasepage(struct page *page, gfp_t gfp_flags)
@@ -974,11 +1011,43 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
974 if (!buf) 1011 if (!buf)
975 return 0; 1012 return 0;
976 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, 1013 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
977 buf, 0, 0, btree_get_extent, 0); 1014 buf, 0, WAIT_NONE, btree_get_extent, 0);
978 free_extent_buffer(buf); 1015 free_extent_buffer(buf);
979 return ret; 1016 return ret;
980} 1017}
981 1018
1019int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
1020 int mirror_num, struct extent_buffer **eb)
1021{
1022 struct extent_buffer *buf = NULL;
1023 struct inode *btree_inode = root->fs_info->btree_inode;
1024 struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
1025 int ret;
1026
1027 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
1028 if (!buf)
1029 return 0;
1030
1031 set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
1032
1033 ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
1034 btree_get_extent, mirror_num);
1035 if (ret) {
1036 free_extent_buffer(buf);
1037 return ret;
1038 }
1039
1040 if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
1041 free_extent_buffer(buf);
1042 return -EIO;
1043 } else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
1044 *eb = buf;
1045 } else {
1046 free_extent_buffer(buf);
1047 }
1048 return 0;
1049}
1050
982struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 1051struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
983 u64 bytenr, u32 blocksize) 1052 u64 bytenr, u32 blocksize)
984{ 1053{
@@ -1135,10 +1204,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
1135 1204
1136 generation = btrfs_root_generation(&root->root_item); 1205 generation = btrfs_root_generation(&root->root_item);
1137 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1206 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1207 root->commit_root = NULL;
1138 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1208 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1139 blocksize, generation); 1209 blocksize, generation);
1140 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { 1210 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
1141 free_extent_buffer(root->node); 1211 free_extent_buffer(root->node);
1212 root->node = NULL;
1142 return -EIO; 1213 return -EIO;
1143 } 1214 }
1144 root->commit_root = btrfs_root_node(root); 1215 root->commit_root = btrfs_root_node(root);
@@ -1577,6 +1648,235 @@ sleep:
1577 return 0; 1648 return 0;
1578} 1649}
1579 1650
1651/*
1652 * this will find the highest generation in the array of
1653 * root backups. The index of the highest array is returned,
1654 * or -1 if we can't find anything.
1655 *
1656 * We check to make sure the array is valid by comparing the
1657 * generation of the latest root in the array with the generation
1658 * in the super block. If they don't match we pitch it.
1659 */
1660static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
1661{
1662 u64 cur;
1663 int newest_index = -1;
1664 struct btrfs_root_backup *root_backup;
1665 int i;
1666
1667 for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
1668 root_backup = info->super_copy->super_roots + i;
1669 cur = btrfs_backup_tree_root_gen(root_backup);
1670 if (cur == newest_gen)
1671 newest_index = i;
1672 }
1673
1674 /* check to see if we actually wrapped around */
1675 if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
1676 root_backup = info->super_copy->super_roots;
1677 cur = btrfs_backup_tree_root_gen(root_backup);
1678 if (cur == newest_gen)
1679 newest_index = 0;
1680 }
1681 return newest_index;
1682}
1683
1684
1685/*
1686 * find the oldest backup so we know where to store new entries
1687 * in the backup array. This will set the backup_root_index
1688 * field in the fs_info struct
1689 */
1690static void find_oldest_super_backup(struct btrfs_fs_info *info,
1691 u64 newest_gen)
1692{
1693 int newest_index = -1;
1694
1695 newest_index = find_newest_super_backup(info, newest_gen);
1696 /* if there was garbage in there, just move along */
1697 if (newest_index == -1) {
1698 info->backup_root_index = 0;
1699 } else {
1700 info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
1701 }
1702}
1703
1704/*
1705 * copy all the root pointers into the super backup array.
1706 * this will bump the backup pointer by one when it is
1707 * done
1708 */
1709static void backup_super_roots(struct btrfs_fs_info *info)
1710{
1711 int next_backup;
1712 struct btrfs_root_backup *root_backup;
1713 int last_backup;
1714
1715 next_backup = info->backup_root_index;
1716 last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
1717 BTRFS_NUM_BACKUP_ROOTS;
1718
1719 /*
1720 * just overwrite the last backup if we're at the same generation
1721 * this happens only at umount
1722 */
1723 root_backup = info->super_for_commit->super_roots + last_backup;
1724 if (btrfs_backup_tree_root_gen(root_backup) ==
1725 btrfs_header_generation(info->tree_root->node))
1726 next_backup = last_backup;
1727
1728 root_backup = info->super_for_commit->super_roots + next_backup;
1729
1730 /*
1731 * make sure all of our padding and empty slots get zero filled
1732 * regardless of which ones we use today
1733 */
1734 memset(root_backup, 0, sizeof(*root_backup));
1735
1736 info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
1737
1738 btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
1739 btrfs_set_backup_tree_root_gen(root_backup,
1740 btrfs_header_generation(info->tree_root->node));
1741
1742 btrfs_set_backup_tree_root_level(root_backup,
1743 btrfs_header_level(info->tree_root->node));
1744
1745 btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
1746 btrfs_set_backup_chunk_root_gen(root_backup,
1747 btrfs_header_generation(info->chunk_root->node));
1748 btrfs_set_backup_chunk_root_level(root_backup,
1749 btrfs_header_level(info->chunk_root->node));
1750
1751 btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
1752 btrfs_set_backup_extent_root_gen(root_backup,
1753 btrfs_header_generation(info->extent_root->node));
1754 btrfs_set_backup_extent_root_level(root_backup,
1755 btrfs_header_level(info->extent_root->node));
1756
1757 /*
1758 * we might commit during log recovery, which happens before we set
1759 * the fs_root. Make sure it is valid before we fill it in.
1760 */
1761 if (info->fs_root && info->fs_root->node) {
1762 btrfs_set_backup_fs_root(root_backup,
1763 info->fs_root->node->start);
1764 btrfs_set_backup_fs_root_gen(root_backup,
1765 btrfs_header_generation(info->fs_root->node));
1766 btrfs_set_backup_fs_root_level(root_backup,
1767 btrfs_header_level(info->fs_root->node));
1768 }
1769
1770 btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
1771 btrfs_set_backup_dev_root_gen(root_backup,
1772 btrfs_header_generation(info->dev_root->node));
1773 btrfs_set_backup_dev_root_level(root_backup,
1774 btrfs_header_level(info->dev_root->node));
1775
1776 btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
1777 btrfs_set_backup_csum_root_gen(root_backup,
1778 btrfs_header_generation(info->csum_root->node));
1779 btrfs_set_backup_csum_root_level(root_backup,
1780 btrfs_header_level(info->csum_root->node));
1781
1782 btrfs_set_backup_total_bytes(root_backup,
1783 btrfs_super_total_bytes(info->super_copy));
1784 btrfs_set_backup_bytes_used(root_backup,
1785 btrfs_super_bytes_used(info->super_copy));
1786 btrfs_set_backup_num_devices(root_backup,
1787 btrfs_super_num_devices(info->super_copy));
1788
1789 /*
1790 * if we don't copy this out to the super_copy, it won't get remembered
1791 * for the next commit
1792 */
1793 memcpy(&info->super_copy->super_roots,
1794 &info->super_for_commit->super_roots,
1795 sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
1796}
1797
1798/*
1799 * this copies info out of the root backup array and back into
1800 * the in-memory super block. It is meant to help iterate through
1801 * the array, so you send it the number of backups you've already
1802 * tried and the last backup index you used.
1803 *
1804 * this returns -1 when it has tried all the backups
1805 */
1806static noinline int next_root_backup(struct btrfs_fs_info *info,
1807 struct btrfs_super_block *super,
1808 int *num_backups_tried, int *backup_index)
1809{
1810 struct btrfs_root_backup *root_backup;
1811 int newest = *backup_index;
1812
1813 if (*num_backups_tried == 0) {
1814 u64 gen = btrfs_super_generation(super);
1815
1816 newest = find_newest_super_backup(info, gen);
1817 if (newest == -1)
1818 return -1;
1819
1820 *backup_index = newest;
1821 *num_backups_tried = 1;
1822 } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
1823 /* we've tried all the backups, all done */
1824 return -1;
1825 } else {
1826 /* jump to the next oldest backup */
1827 newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
1828 BTRFS_NUM_BACKUP_ROOTS;
1829 *backup_index = newest;
1830 *num_backups_tried += 1;
1831 }
1832 root_backup = super->super_roots + newest;
1833
1834 btrfs_set_super_generation(super,
1835 btrfs_backup_tree_root_gen(root_backup));
1836 btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
1837 btrfs_set_super_root_level(super,
1838 btrfs_backup_tree_root_level(root_backup));
1839 btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
1840
1841 /*
1842 * fixme: the total bytes and num_devices need to match or we should
1843 * need a fsck
1844 */
1845 btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
1846 btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
1847 return 0;
1848}
1849
1850/* helper to cleanup tree roots */
1851static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
1852{
1853 free_extent_buffer(info->tree_root->node);
1854 free_extent_buffer(info->tree_root->commit_root);
1855 free_extent_buffer(info->dev_root->node);
1856 free_extent_buffer(info->dev_root->commit_root);
1857 free_extent_buffer(info->extent_root->node);
1858 free_extent_buffer(info->extent_root->commit_root);
1859 free_extent_buffer(info->csum_root->node);
1860 free_extent_buffer(info->csum_root->commit_root);
1861
1862 info->tree_root->node = NULL;
1863 info->tree_root->commit_root = NULL;
1864 info->dev_root->node = NULL;
1865 info->dev_root->commit_root = NULL;
1866 info->extent_root->node = NULL;
1867 info->extent_root->commit_root = NULL;
1868 info->csum_root->node = NULL;
1869 info->csum_root->commit_root = NULL;
1870
1871 if (chunk_root) {
1872 free_extent_buffer(info->chunk_root->node);
1873 free_extent_buffer(info->chunk_root->commit_root);
1874 info->chunk_root->node = NULL;
1875 info->chunk_root->commit_root = NULL;
1876 }
1877}
1878
1879
1580struct btrfs_root *open_ctree(struct super_block *sb, 1880struct btrfs_root *open_ctree(struct super_block *sb,
1581 struct btrfs_fs_devices *fs_devices, 1881 struct btrfs_fs_devices *fs_devices,
1582 char *options) 1882 char *options)
@@ -1590,29 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1590 u64 features; 1890 u64 features;
1591 struct btrfs_key location; 1891 struct btrfs_key location;
1592 struct buffer_head *bh; 1892 struct buffer_head *bh;
1593 struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), 1893 struct btrfs_super_block *disk_super;
1594 GFP_NOFS);
1595 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1596 GFP_NOFS);
1597 struct btrfs_root *tree_root = btrfs_sb(sb); 1894 struct btrfs_root *tree_root = btrfs_sb(sb);
1598 struct btrfs_fs_info *fs_info = NULL; 1895 struct btrfs_fs_info *fs_info = tree_root->fs_info;
1599 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), 1896 struct btrfs_root *extent_root;
1600 GFP_NOFS); 1897 struct btrfs_root *csum_root;
1601 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), 1898 struct btrfs_root *chunk_root;
1602 GFP_NOFS); 1899 struct btrfs_root *dev_root;
1603 struct btrfs_root *log_tree_root; 1900 struct btrfs_root *log_tree_root;
1604
1605 int ret; 1901 int ret;
1606 int err = -EINVAL; 1902 int err = -EINVAL;
1607 1903 int num_backups_tried = 0;
1608 struct btrfs_super_block *disk_super; 1904 int backup_index = 0;
1609 1905
1610 if (!extent_root || !tree_root || !tree_root->fs_info || 1906 extent_root = fs_info->extent_root =
1611 !chunk_root || !dev_root || !csum_root) { 1907 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1908 csum_root = fs_info->csum_root =
1909 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1910 chunk_root = fs_info->chunk_root =
1911 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1912 dev_root = fs_info->dev_root =
1913 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1914
1915 if (!extent_root || !csum_root || !chunk_root || !dev_root) {
1612 err = -ENOMEM; 1916 err = -ENOMEM;
1613 goto fail; 1917 goto fail;
1614 } 1918 }
1615 fs_info = tree_root->fs_info;
1616 1919
1617 ret = init_srcu_struct(&fs_info->subvol_srcu); 1920 ret = init_srcu_struct(&fs_info->subvol_srcu);
1618 if (ret) { 1921 if (ret) {
@@ -1648,15 +1951,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1648 spin_lock_init(&fs_info->fs_roots_radix_lock); 1951 spin_lock_init(&fs_info->fs_roots_radix_lock);
1649 spin_lock_init(&fs_info->delayed_iput_lock); 1952 spin_lock_init(&fs_info->delayed_iput_lock);
1650 spin_lock_init(&fs_info->defrag_inodes_lock); 1953 spin_lock_init(&fs_info->defrag_inodes_lock);
1954 spin_lock_init(&fs_info->free_chunk_lock);
1651 mutex_init(&fs_info->reloc_mutex); 1955 mutex_init(&fs_info->reloc_mutex);
1652 1956
1653 init_completion(&fs_info->kobj_unregister); 1957 init_completion(&fs_info->kobj_unregister);
1654 fs_info->tree_root = tree_root;
1655 fs_info->extent_root = extent_root;
1656 fs_info->csum_root = csum_root;
1657 fs_info->chunk_root = chunk_root;
1658 fs_info->dev_root = dev_root;
1659 fs_info->fs_devices = fs_devices;
1660 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 1958 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1661 INIT_LIST_HEAD(&fs_info->space_info); 1959 INIT_LIST_HEAD(&fs_info->space_info);
1662 btrfs_mapping_init(&fs_info->mapping_tree); 1960 btrfs_mapping_init(&fs_info->mapping_tree);
@@ -1665,8 +1963,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1665 btrfs_init_block_rsv(&fs_info->trans_block_rsv); 1963 btrfs_init_block_rsv(&fs_info->trans_block_rsv);
1666 btrfs_init_block_rsv(&fs_info->chunk_block_rsv); 1964 btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
1667 btrfs_init_block_rsv(&fs_info->empty_block_rsv); 1965 btrfs_init_block_rsv(&fs_info->empty_block_rsv);
1668 INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); 1966 btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
1669 mutex_init(&fs_info->durable_block_rsv_mutex);
1670 atomic_set(&fs_info->nr_async_submits, 0); 1967 atomic_set(&fs_info->nr_async_submits, 0);
1671 atomic_set(&fs_info->async_delalloc_pages, 0); 1968 atomic_set(&fs_info->async_delalloc_pages, 0);
1672 atomic_set(&fs_info->async_submit_draining, 0); 1969 atomic_set(&fs_info->async_submit_draining, 0);
@@ -1677,6 +1974,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1677 fs_info->metadata_ratio = 0; 1974 fs_info->metadata_ratio = 0;
1678 fs_info->defrag_inodes = RB_ROOT; 1975 fs_info->defrag_inodes = RB_ROOT;
1679 fs_info->trans_no_join = 0; 1976 fs_info->trans_no_join = 0;
1977 fs_info->free_chunk_space = 0;
1978
1979 /* readahead state */
1980 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
1981 spin_lock_init(&fs_info->reada_lock);
1680 1982
1681 fs_info->thread_pool_size = min_t(unsigned long, 1983 fs_info->thread_pool_size = min_t(unsigned long,
1682 num_online_cpus() + 2, 8); 1984 num_online_cpus() + 2, 8);
@@ -1705,7 +2007,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1705 sb->s_bdi = &fs_info->bdi; 2007 sb->s_bdi = &fs_info->bdi;
1706 2008
1707 fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; 2009 fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
1708 fs_info->btree_inode->i_nlink = 1; 2010 set_nlink(fs_info->btree_inode, 1);
1709 /* 2011 /*
1710 * we set the i_size on the btree inode to the max possible int. 2012 * we set the i_size on the btree inode to the max possible int.
1711 * the real end of the address space is determined by all of 2013 * the real end of the address space is determined by all of
@@ -1766,14 +2068,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1766 goto fail_alloc; 2068 goto fail_alloc;
1767 } 2069 }
1768 2070
1769 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); 2071 memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
1770 memcpy(&fs_info->super_for_commit, &fs_info->super_copy, 2072 memcpy(fs_info->super_for_commit, fs_info->super_copy,
1771 sizeof(fs_info->super_for_commit)); 2073 sizeof(*fs_info->super_for_commit));
1772 brelse(bh); 2074 brelse(bh);
1773 2075
1774 memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); 2076 memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
1775 2077
1776 disk_super = &fs_info->super_copy; 2078 disk_super = fs_info->super_copy;
1777 if (!btrfs_super_root(disk_super)) 2079 if (!btrfs_super_root(disk_super))
1778 goto fail_alloc; 2080 goto fail_alloc;
1779 2081
@@ -1783,6 +2085,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1783 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); 2085 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
1784 2086
1785 /* 2087 /*
2088 * run through our array of backup supers and setup
2089 * our ring pointer to the oldest one
2090 */
2091 generation = btrfs_super_generation(disk_super);
2092 find_oldest_super_backup(fs_info, generation);
2093
2094 /*
1786 * In the long term, we'll store the compression type in the super 2095 * In the long term, we'll store the compression type in the super
1787 * block, and it'll be used for per file compression control. 2096 * block, and it'll be used for per file compression control.
1788 */ 2097 */
@@ -1870,6 +2179,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1870 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", 2179 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
1871 fs_info->thread_pool_size, 2180 fs_info->thread_pool_size,
1872 &fs_info->generic_worker); 2181 &fs_info->generic_worker);
2182 btrfs_init_workers(&fs_info->readahead_workers, "readahead",
2183 fs_info->thread_pool_size,
2184 &fs_info->generic_worker);
1873 2185
1874 /* 2186 /*
1875 * endios are largely parallel and should have a very 2187 * endios are largely parallel and should have a very
@@ -1880,6 +2192,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1880 2192
1881 fs_info->endio_write_workers.idle_thresh = 2; 2193 fs_info->endio_write_workers.idle_thresh = 2;
1882 fs_info->endio_meta_write_workers.idle_thresh = 2; 2194 fs_info->endio_meta_write_workers.idle_thresh = 2;
2195 fs_info->readahead_workers.idle_thresh = 2;
1883 2196
1884 btrfs_start_workers(&fs_info->workers, 1); 2197 btrfs_start_workers(&fs_info->workers, 1);
1885 btrfs_start_workers(&fs_info->generic_worker, 1); 2198 btrfs_start_workers(&fs_info->generic_worker, 1);
@@ -1893,6 +2206,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1893 btrfs_start_workers(&fs_info->endio_freespace_worker, 1); 2206 btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
1894 btrfs_start_workers(&fs_info->delayed_workers, 1); 2207 btrfs_start_workers(&fs_info->delayed_workers, 1);
1895 btrfs_start_workers(&fs_info->caching_workers, 1); 2208 btrfs_start_workers(&fs_info->caching_workers, 1);
2209 btrfs_start_workers(&fs_info->readahead_workers, 1);
1896 2210
1897 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 2211 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1898 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 2212 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1939,7 +2253,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1939 if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { 2253 if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
1940 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", 2254 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
1941 sb->s_id); 2255 sb->s_id);
1942 goto fail_chunk_root; 2256 goto fail_tree_roots;
1943 } 2257 }
1944 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); 2258 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
1945 chunk_root->commit_root = btrfs_root_node(chunk_root); 2259 chunk_root->commit_root = btrfs_root_node(chunk_root);
@@ -1954,11 +2268,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1954 if (ret) { 2268 if (ret) {
1955 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", 2269 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
1956 sb->s_id); 2270 sb->s_id);
1957 goto fail_chunk_root; 2271 goto fail_tree_roots;
1958 } 2272 }
1959 2273
1960 btrfs_close_extra_devices(fs_devices); 2274 btrfs_close_extra_devices(fs_devices);
1961 2275
2276retry_root_backup:
1962 blocksize = btrfs_level_size(tree_root, 2277 blocksize = btrfs_level_size(tree_root,
1963 btrfs_super_root_level(disk_super)); 2278 btrfs_super_root_level(disk_super));
1964 generation = btrfs_super_generation(disk_super); 2279 generation = btrfs_super_generation(disk_super);
@@ -1966,32 +2281,33 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1966 tree_root->node = read_tree_block(tree_root, 2281 tree_root->node = read_tree_block(tree_root,
1967 btrfs_super_root(disk_super), 2282 btrfs_super_root(disk_super),
1968 blocksize, generation); 2283 blocksize, generation);
1969 if (!tree_root->node) 2284 if (!tree_root->node ||
1970 goto fail_chunk_root; 2285 !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1971 if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1972 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", 2286 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
1973 sb->s_id); 2287 sb->s_id);
1974 goto fail_tree_root; 2288
2289 goto recovery_tree_root;
1975 } 2290 }
2291
1976 btrfs_set_root_node(&tree_root->root_item, tree_root->node); 2292 btrfs_set_root_node(&tree_root->root_item, tree_root->node);
1977 tree_root->commit_root = btrfs_root_node(tree_root); 2293 tree_root->commit_root = btrfs_root_node(tree_root);
1978 2294
1979 ret = find_and_setup_root(tree_root, fs_info, 2295 ret = find_and_setup_root(tree_root, fs_info,
1980 BTRFS_EXTENT_TREE_OBJECTID, extent_root); 2296 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
1981 if (ret) 2297 if (ret)
1982 goto fail_tree_root; 2298 goto recovery_tree_root;
1983 extent_root->track_dirty = 1; 2299 extent_root->track_dirty = 1;
1984 2300
1985 ret = find_and_setup_root(tree_root, fs_info, 2301 ret = find_and_setup_root(tree_root, fs_info,
1986 BTRFS_DEV_TREE_OBJECTID, dev_root); 2302 BTRFS_DEV_TREE_OBJECTID, dev_root);
1987 if (ret) 2303 if (ret)
1988 goto fail_extent_root; 2304 goto recovery_tree_root;
1989 dev_root->track_dirty = 1; 2305 dev_root->track_dirty = 1;
1990 2306
1991 ret = find_and_setup_root(tree_root, fs_info, 2307 ret = find_and_setup_root(tree_root, fs_info,
1992 BTRFS_CSUM_TREE_OBJECTID, csum_root); 2308 BTRFS_CSUM_TREE_OBJECTID, csum_root);
1993 if (ret) 2309 if (ret)
1994 goto fail_dev_root; 2310 goto recovery_tree_root;
1995 2311
1996 csum_root->track_dirty = 1; 2312 csum_root->track_dirty = 1;
1997 2313
@@ -2124,22 +2440,13 @@ fail_cleaner:
2124 2440
2125fail_block_groups: 2441fail_block_groups:
2126 btrfs_free_block_groups(fs_info); 2442 btrfs_free_block_groups(fs_info);
2127 free_extent_buffer(csum_root->node); 2443
2128 free_extent_buffer(csum_root->commit_root); 2444fail_tree_roots:
2129fail_dev_root: 2445 free_root_pointers(fs_info, 1);
2130 free_extent_buffer(dev_root->node); 2446
2131 free_extent_buffer(dev_root->commit_root);
2132fail_extent_root:
2133 free_extent_buffer(extent_root->node);
2134 free_extent_buffer(extent_root->commit_root);
2135fail_tree_root:
2136 free_extent_buffer(tree_root->node);
2137 free_extent_buffer(tree_root->commit_root);
2138fail_chunk_root:
2139 free_extent_buffer(chunk_root->node);
2140 free_extent_buffer(chunk_root->commit_root);
2141fail_sb_buffer: 2447fail_sb_buffer:
2142 btrfs_stop_workers(&fs_info->generic_worker); 2448 btrfs_stop_workers(&fs_info->generic_worker);
2449 btrfs_stop_workers(&fs_info->readahead_workers);
2143 btrfs_stop_workers(&fs_info->fixup_workers); 2450 btrfs_stop_workers(&fs_info->fixup_workers);
2144 btrfs_stop_workers(&fs_info->delalloc_workers); 2451 btrfs_stop_workers(&fs_info->delalloc_workers);
2145 btrfs_stop_workers(&fs_info->workers); 2452 btrfs_stop_workers(&fs_info->workers);
@@ -2152,25 +2459,37 @@ fail_sb_buffer:
2152 btrfs_stop_workers(&fs_info->delayed_workers); 2459 btrfs_stop_workers(&fs_info->delayed_workers);
2153 btrfs_stop_workers(&fs_info->caching_workers); 2460 btrfs_stop_workers(&fs_info->caching_workers);
2154fail_alloc: 2461fail_alloc:
2155 kfree(fs_info->delayed_root);
2156fail_iput: 2462fail_iput:
2463 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2464
2157 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2465 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2158 iput(fs_info->btree_inode); 2466 iput(fs_info->btree_inode);
2159
2160 btrfs_close_devices(fs_info->fs_devices);
2161 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2162fail_bdi: 2467fail_bdi:
2163 bdi_destroy(&fs_info->bdi); 2468 bdi_destroy(&fs_info->bdi);
2164fail_srcu: 2469fail_srcu:
2165 cleanup_srcu_struct(&fs_info->subvol_srcu); 2470 cleanup_srcu_struct(&fs_info->subvol_srcu);
2166fail: 2471fail:
2167 kfree(extent_root); 2472 btrfs_close_devices(fs_info->fs_devices);
2168 kfree(tree_root); 2473 free_fs_info(fs_info);
2169 kfree(fs_info);
2170 kfree(chunk_root);
2171 kfree(dev_root);
2172 kfree(csum_root);
2173 return ERR_PTR(err); 2474 return ERR_PTR(err);
2475
2476recovery_tree_root:
2477 if (!btrfs_test_opt(tree_root, RECOVERY))
2478 goto fail_tree_roots;
2479
2480 free_root_pointers(fs_info, 0);
2481
2482 /* don't use the log in recovery mode, it won't be valid */
2483 btrfs_set_super_log_root(disk_super, 0);
2484
2485 /* we can't trust the free space cache either */
2486 btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
2487
2488 ret = next_root_backup(fs_info, fs_info->super_copy,
2489 &num_backups_tried, &backup_index);
2490 if (ret == -1)
2491 goto fail_block_groups;
2492 goto retry_root_backup;
2174} 2493}
2175 2494
2176static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) 2495static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
@@ -2338,10 +2657,11 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2338 int total_errors = 0; 2657 int total_errors = 0;
2339 u64 flags; 2658 u64 flags;
2340 2659
2341 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 2660 max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
2342 do_barriers = !btrfs_test_opt(root, NOBARRIER); 2661 do_barriers = !btrfs_test_opt(root, NOBARRIER);
2662 backup_super_roots(root->fs_info);
2343 2663
2344 sb = &root->fs_info->super_for_commit; 2664 sb = root->fs_info->super_for_commit;
2345 dev_item = &sb->dev_item; 2665 dev_item = &sb->dev_item;
2346 2666
2347 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2667 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
@@ -2545,8 +2865,6 @@ int close_ctree(struct btrfs_root *root)
2545 /* clear out the rbtree of defraggable inodes */ 2865 /* clear out the rbtree of defraggable inodes */
2546 btrfs_run_defrag_inodes(root->fs_info); 2866 btrfs_run_defrag_inodes(root->fs_info);
2547 2867
2548 btrfs_put_block_group_cache(fs_info);
2549
2550 /* 2868 /*
2551 * Here come 2 situations when btrfs is broken to flip readonly: 2869 * Here come 2 situations when btrfs is broken to flip readonly:
2552 * 2870 *
@@ -2572,6 +2890,8 @@ int close_ctree(struct btrfs_root *root)
2572 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 2890 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2573 } 2891 }
2574 2892
2893 btrfs_put_block_group_cache(fs_info);
2894
2575 kthread_stop(root->fs_info->transaction_kthread); 2895 kthread_stop(root->fs_info->transaction_kthread);
2576 kthread_stop(root->fs_info->cleaner_kthread); 2896 kthread_stop(root->fs_info->cleaner_kthread);
2577 2897
@@ -2603,7 +2923,6 @@ int close_ctree(struct btrfs_root *root)
2603 del_fs_roots(fs_info); 2923 del_fs_roots(fs_info);
2604 2924
2605 iput(fs_info->btree_inode); 2925 iput(fs_info->btree_inode);
2606 kfree(fs_info->delayed_root);
2607 2926
2608 btrfs_stop_workers(&fs_info->generic_worker); 2927 btrfs_stop_workers(&fs_info->generic_worker);
2609 btrfs_stop_workers(&fs_info->fixup_workers); 2928 btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2617,6 +2936,7 @@ int close_ctree(struct btrfs_root *root)
2617 btrfs_stop_workers(&fs_info->submit_workers); 2936 btrfs_stop_workers(&fs_info->submit_workers);
2618 btrfs_stop_workers(&fs_info->delayed_workers); 2937 btrfs_stop_workers(&fs_info->delayed_workers);
2619 btrfs_stop_workers(&fs_info->caching_workers); 2938 btrfs_stop_workers(&fs_info->caching_workers);
2939 btrfs_stop_workers(&fs_info->readahead_workers);
2620 2940
2621 btrfs_close_devices(fs_info->fs_devices); 2941 btrfs_close_devices(fs_info->fs_devices);
2622 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2942 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2624,12 +2944,7 @@ int close_ctree(struct btrfs_root *root)
2624 bdi_destroy(&fs_info->bdi); 2944 bdi_destroy(&fs_info->bdi);
2625 cleanup_srcu_struct(&fs_info->subvol_srcu); 2945 cleanup_srcu_struct(&fs_info->subvol_srcu);
2626 2946
2627 kfree(fs_info->extent_root); 2947 free_fs_info(fs_info);
2628 kfree(fs_info->tree_root);
2629 kfree(fs_info->chunk_root);
2630 kfree(fs_info->dev_root);
2631 kfree(fs_info->csum_root);
2632 kfree(fs_info);
2633 2948
2634 return 0; 2949 return 0;
2635} 2950}
@@ -2735,7 +3050,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2735 return ret; 3050 return ret;
2736} 3051}
2737 3052
2738int btree_lock_page_hook(struct page *page) 3053static int btree_lock_page_hook(struct page *page, void *data,
3054 void (*flush_fn)(void *))
2739{ 3055{
2740 struct inode *inode = page->mapping->host; 3056 struct inode *inode = page->mapping->host;
2741 struct btrfs_root *root = BTRFS_I(inode)->root; 3057 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2752,7 +3068,10 @@ int btree_lock_page_hook(struct page *page)
2752 if (!eb) 3068 if (!eb)
2753 goto out; 3069 goto out;
2754 3070
2755 btrfs_tree_lock(eb); 3071 if (!btrfs_try_tree_write_lock(eb)) {
3072 flush_fn(data);
3073 btrfs_tree_lock(eb);
3074 }
2756 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3075 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2757 3076
2758 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3077 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
@@ -2767,7 +3086,10 @@ int btree_lock_page_hook(struct page *page)
2767 btrfs_tree_unlock(eb); 3086 btrfs_tree_unlock(eb);
2768 free_extent_buffer(eb); 3087 free_extent_buffer(eb);
2769out: 3088out:
2770 lock_page(page); 3089 if (!trylock_page(page)) {
3090 flush_fn(data);
3091 lock_page(page);
3092 }
2771 return 0; 3093 return 0;
2772} 3094}
2773 3095
@@ -3123,6 +3445,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3123static struct extent_io_ops btree_extent_io_ops = { 3445static struct extent_io_ops btree_extent_io_ops = {
3124 .write_cache_pages_lock_hook = btree_lock_page_hook, 3446 .write_cache_pages_lock_hook = btree_lock_page_hook,
3125 .readpage_end_io_hook = btree_readpage_end_io_hook, 3447 .readpage_end_io_hook = btree_readpage_end_io_hook,
3448 .readpage_io_failed_hook = btree_io_failed_hook,
3126 .submit_bio_hook = btree_submit_bio_hook, 3449 .submit_bio_hook = btree_submit_bio_hook,
3127 /* note we're sharing with inode.c for the merge bio hook */ 3450 /* note we're sharing with inode.c for the merge bio hook */
3128 .merge_bio_hook = btrfs_merge_bio_hook, 3451 .merge_bio_hook = btrfs_merge_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bec3ea4bd67..c99d0a8f13f 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
40 u32 blocksize, u64 parent_transid); 40 u32 blocksize, u64 parent_transid);
41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
42 u64 parent_transid); 42 u64 parent_transid);
43int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
44 int mirror_num, struct extent_buffer **eb);
43struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 45struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
44 u64 bytenr, u32 blocksize); 46 u64 bytenr, u32 blocksize);
45int clean_tree_block(struct btrfs_trans_handle *trans, 47int clean_tree_block(struct btrfs_trans_handle *trans,
@@ -83,8 +85,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
83 struct btrfs_fs_info *fs_info); 85 struct btrfs_fs_info *fs_info);
84int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 86int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
85 struct btrfs_root *root); 87 struct btrfs_root *root);
86int btree_lock_page_hook(struct page *page);
87
88 88
89#ifdef CONFIG_DEBUG_LOCK_ALLOC 89#ifdef CONFIG_DEBUG_LOCK_ALLOC
90void btrfs_init_lockdep(void); 90void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5be06a2462..b232150b5b6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,6 +23,7 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ratelimit.h>
26#include "compat.h" 27#include "compat.h"
27#include "hash.h" 28#include "hash.h"
28#include "ctree.h" 29#include "ctree.h"
@@ -52,6 +53,21 @@ enum {
52 CHUNK_ALLOC_LIMITED = 2, 53 CHUNK_ALLOC_LIMITED = 2,
53}; 54};
54 55
56/*
57 * Control how reservations are dealt with.
58 *
59 * RESERVE_FREE - freeing a reservation.
60 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
61 * ENOSPC accounting
62 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
63 * bytes_may_use as the ENOSPC accounting is done elsewhere
64 */
65enum {
66 RESERVE_FREE = 0,
67 RESERVE_ALLOC = 1,
68 RESERVE_ALLOC_NO_ACCOUNT = 2,
69};
70
55static int update_block_group(struct btrfs_trans_handle *trans, 71static int update_block_group(struct btrfs_trans_handle *trans,
56 struct btrfs_root *root, 72 struct btrfs_root *root,
57 u64 bytenr, u64 num_bytes, int alloc); 73 u64 bytenr, u64 num_bytes, int alloc);
@@ -81,6 +97,8 @@ static int find_next_key(struct btrfs_path *path, int level,
81 struct btrfs_key *key); 97 struct btrfs_key *key);
82static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 98static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
83 int dump_block_groups); 99 int dump_block_groups);
100static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
101 u64 num_bytes, int reserve);
84 102
85static noinline int 103static noinline int
86block_group_cache_done(struct btrfs_block_group_cache *cache) 104block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -104,7 +122,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
104 if (atomic_dec_and_test(&cache->count)) { 122 if (atomic_dec_and_test(&cache->count)) {
105 WARN_ON(cache->pinned > 0); 123 WARN_ON(cache->pinned > 0);
106 WARN_ON(cache->reserved > 0); 124 WARN_ON(cache->reserved > 0);
107 WARN_ON(cache->reserved_pinned > 0);
108 kfree(cache->free_space_ctl); 125 kfree(cache->free_space_ctl);
109 kfree(cache); 126 kfree(cache);
110 } 127 }
@@ -465,7 +482,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
465 * we likely hold important locks. 482 * we likely hold important locks.
466 */ 483 */
467 if (trans && (!trans->transaction->in_commit) && 484 if (trans && (!trans->transaction->in_commit) &&
468 (root && root != root->fs_info->tree_root)) { 485 (root && root != root->fs_info->tree_root) &&
486 btrfs_test_opt(root, SPACE_CACHE)) {
469 spin_lock(&cache->lock); 487 spin_lock(&cache->lock);
470 if (cache->cached != BTRFS_CACHE_NO) { 488 if (cache->cached != BTRFS_CACHE_NO) {
471 spin_unlock(&cache->lock); 489 spin_unlock(&cache->lock);
@@ -1770,18 +1788,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1770{ 1788{
1771 int ret; 1789 int ret;
1772 u64 discarded_bytes = 0; 1790 u64 discarded_bytes = 0;
1773 struct btrfs_multi_bio *multi = NULL; 1791 struct btrfs_bio *bbio = NULL;
1774 1792
1775 1793
1776 /* Tell the block device(s) that the sectors can be discarded */ 1794 /* Tell the block device(s) that the sectors can be discarded */
1777 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, 1795 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1778 bytenr, &num_bytes, &multi, 0); 1796 bytenr, &num_bytes, &bbio, 0);
1779 if (!ret) { 1797 if (!ret) {
1780 struct btrfs_bio_stripe *stripe = multi->stripes; 1798 struct btrfs_bio_stripe *stripe = bbio->stripes;
1781 int i; 1799 int i;
1782 1800
1783 1801
1784 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1802 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1785 if (!stripe->dev->can_discard) 1803 if (!stripe->dev->can_discard)
1786 continue; 1804 continue;
1787 1805
@@ -1800,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1800 */ 1818 */
1801 ret = 0; 1819 ret = 0;
1802 } 1820 }
1803 kfree(multi); 1821 kfree(bbio);
1804 } 1822 }
1805 1823
1806 if (actual_bytes) 1824 if (actual_bytes)
@@ -2700,6 +2718,13 @@ again:
2700 goto again; 2718 goto again;
2701 } 2719 }
2702 2720
2721 /* We've already setup this transaction, go ahead and exit */
2722 if (block_group->cache_generation == trans->transid &&
2723 i_size_read(inode)) {
2724 dcs = BTRFS_DC_SETUP;
2725 goto out_put;
2726 }
2727
2703 /* 2728 /*
2704 * We want to set the generation to 0, that way if anything goes wrong 2729 * We want to set the generation to 0, that way if anything goes wrong
2705 * from here on out we know not to trust this cache when we load up next 2730 * from here on out we know not to trust this cache when we load up next
@@ -2749,12 +2774,15 @@ again:
2749 if (!ret) 2774 if (!ret)
2750 dcs = BTRFS_DC_SETUP; 2775 dcs = BTRFS_DC_SETUP;
2751 btrfs_free_reserved_data_space(inode, num_pages); 2776 btrfs_free_reserved_data_space(inode, num_pages);
2777
2752out_put: 2778out_put:
2753 iput(inode); 2779 iput(inode);
2754out_free: 2780out_free:
2755 btrfs_release_path(path); 2781 btrfs_release_path(path);
2756out: 2782out:
2757 spin_lock(&block_group->lock); 2783 spin_lock(&block_group->lock);
2784 if (!ret)
2785 block_group->cache_generation = trans->transid;
2758 block_group->disk_cache_state = dcs; 2786 block_group->disk_cache_state = dcs;
2759 spin_unlock(&block_group->lock); 2787 spin_unlock(&block_group->lock);
2760 2788
@@ -3122,16 +3150,13 @@ commit_trans:
3122 return -ENOSPC; 3150 return -ENOSPC;
3123 } 3151 }
3124 data_sinfo->bytes_may_use += bytes; 3152 data_sinfo->bytes_may_use += bytes;
3125 BTRFS_I(inode)->reserved_bytes += bytes;
3126 spin_unlock(&data_sinfo->lock); 3153 spin_unlock(&data_sinfo->lock);
3127 3154
3128 return 0; 3155 return 0;
3129} 3156}
3130 3157
3131/* 3158/*
3132 * called when we are clearing an delalloc extent from the 3159 * Called if we need to clear a data reservation for this inode.
3133 * inode's io_tree or there was an error for whatever reason
3134 * after calling btrfs_check_data_free_space
3135 */ 3160 */
3136void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 3161void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3137{ 3162{
@@ -3144,7 +3169,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3144 data_sinfo = BTRFS_I(inode)->space_info; 3169 data_sinfo = BTRFS_I(inode)->space_info;
3145 spin_lock(&data_sinfo->lock); 3170 spin_lock(&data_sinfo->lock);
3146 data_sinfo->bytes_may_use -= bytes; 3171 data_sinfo->bytes_may_use -= bytes;
3147 BTRFS_I(inode)->reserved_bytes -= bytes;
3148 spin_unlock(&data_sinfo->lock); 3172 spin_unlock(&data_sinfo->lock);
3149} 3173}
3150 3174
@@ -3165,6 +3189,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3165 struct btrfs_space_info *sinfo, u64 alloc_bytes, 3189 struct btrfs_space_info *sinfo, u64 alloc_bytes,
3166 int force) 3190 int force)
3167{ 3191{
3192 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3168 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3193 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3169 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3194 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3170 u64 thresh; 3195 u64 thresh;
@@ -3173,11 +3198,18 @@ static int should_alloc_chunk(struct btrfs_root *root,
3173 return 1; 3198 return 1;
3174 3199
3175 /* 3200 /*
3201 * We need to take into account the global rsv because for all intents
3202 * and purposes it's used space. Don't worry about locking the
3203 * global_rsv, it doesn't change except when the transaction commits.
3204 */
3205 num_allocated += global_rsv->size;
3206
3207 /*
3176 * in limited mode, we want to have some free space up to 3208 * in limited mode, we want to have some free space up to
3177 * about 1% of the FS size. 3209 * about 1% of the FS size.
3178 */ 3210 */
3179 if (force == CHUNK_ALLOC_LIMITED) { 3211 if (force == CHUNK_ALLOC_LIMITED) {
3180 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3212 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3181 thresh = max_t(u64, 64 * 1024 * 1024, 3213 thresh = max_t(u64, 64 * 1024 * 1024,
3182 div_factor_fine(thresh, 1)); 3214 div_factor_fine(thresh, 1));
3183 3215
@@ -3199,7 +3231,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3199 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) 3231 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3200 return 0; 3232 return 0;
3201 3233
3202 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3234 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3203 3235
3204 /* 256MB or 5% of the FS */ 3236 /* 256MB or 5% of the FS */
3205 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3237 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
@@ -3302,24 +3334,26 @@ out:
3302/* 3334/*
3303 * shrink metadata reservation for delalloc 3335 * shrink metadata reservation for delalloc
3304 */ 3336 */
3305static int shrink_delalloc(struct btrfs_trans_handle *trans, 3337static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
3306 struct btrfs_root *root, u64 to_reclaim, int sync) 3338 bool wait_ordered)
3307{ 3339{
3308 struct btrfs_block_rsv *block_rsv; 3340 struct btrfs_block_rsv *block_rsv;
3309 struct btrfs_space_info *space_info; 3341 struct btrfs_space_info *space_info;
3342 struct btrfs_trans_handle *trans;
3310 u64 reserved; 3343 u64 reserved;
3311 u64 max_reclaim; 3344 u64 max_reclaim;
3312 u64 reclaimed = 0; 3345 u64 reclaimed = 0;
3313 long time_left; 3346 long time_left;
3314 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3347 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3315 int loops = 0; 3348 int loops = 0;
3316 unsigned long progress; 3349 unsigned long progress;
3317 3350
3351 trans = (struct btrfs_trans_handle *)current->journal_info;
3318 block_rsv = &root->fs_info->delalloc_block_rsv; 3352 block_rsv = &root->fs_info->delalloc_block_rsv;
3319 space_info = block_rsv->space_info; 3353 space_info = block_rsv->space_info;
3320 3354
3321 smp_mb(); 3355 smp_mb();
3322 reserved = space_info->bytes_reserved; 3356 reserved = space_info->bytes_may_use;
3323 progress = space_info->reservation_progress; 3357 progress = space_info->reservation_progress;
3324 3358
3325 if (reserved == 0) 3359 if (reserved == 0)
@@ -3334,18 +3368,20 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3334 } 3368 }
3335 3369
3336 max_reclaim = min(reserved, to_reclaim); 3370 max_reclaim = min(reserved, to_reclaim);
3337 3371 nr_pages = max_t(unsigned long, nr_pages,
3372 max_reclaim >> PAGE_CACHE_SHIFT);
3338 while (loops < 1024) { 3373 while (loops < 1024) {
3339 /* have the flusher threads jump in and do some IO */ 3374 /* have the flusher threads jump in and do some IO */
3340 smp_mb(); 3375 smp_mb();
3341 nr_pages = min_t(unsigned long, nr_pages, 3376 nr_pages = min_t(unsigned long, nr_pages,
3342 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); 3377 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3343 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); 3378 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3379 WB_REASON_FS_FREE_SPACE);
3344 3380
3345 spin_lock(&space_info->lock); 3381 spin_lock(&space_info->lock);
3346 if (reserved > space_info->bytes_reserved) 3382 if (reserved > space_info->bytes_may_use)
3347 reclaimed += reserved - space_info->bytes_reserved; 3383 reclaimed += reserved - space_info->bytes_may_use;
3348 reserved = space_info->bytes_reserved; 3384 reserved = space_info->bytes_may_use;
3349 spin_unlock(&space_info->lock); 3385 spin_unlock(&space_info->lock);
3350 3386
3351 loops++; 3387 loops++;
@@ -3356,11 +3392,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3356 if (trans && trans->transaction->blocked) 3392 if (trans && trans->transaction->blocked)
3357 return -EAGAIN; 3393 return -EAGAIN;
3358 3394
3359 time_left = schedule_timeout_interruptible(1); 3395 if (wait_ordered && !trans) {
3396 btrfs_wait_ordered_extents(root, 0, 0);
3397 } else {
3398 time_left = schedule_timeout_interruptible(1);
3360 3399
3361 /* We were interrupted, exit */ 3400 /* We were interrupted, exit */
3362 if (time_left) 3401 if (time_left)
3363 break; 3402 break;
3403 }
3364 3404
3365 /* we've kicked the IO a few times, if anything has been freed, 3405 /* we've kicked the IO a few times, if anything has been freed,
3366 * exit. There is no sense in looping here for a long time 3406 * exit. There is no sense in looping here for a long time
@@ -3375,34 +3415,90 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3375 } 3415 }
3376 3416
3377 } 3417 }
3378 if (reclaimed >= to_reclaim && !trans) 3418
3379 btrfs_wait_ordered_extents(root, 0, 0);
3380 return reclaimed >= to_reclaim; 3419 return reclaimed >= to_reclaim;
3381} 3420}
3382 3421
3383/* 3422/**
3384 * Retries tells us how many times we've called reserve_metadata_bytes. The 3423 * maybe_commit_transaction - possibly commit the transaction if its ok to
3385 * idea is if this is the first call (retries == 0) then we will add to our 3424 * @root - the root we're allocating for
3386 * reserved count if we can't make the allocation in order to hold our place 3425 * @bytes - the number of bytes we want to reserve
3387 * while we go and try and free up space. That way for retries > 1 we don't try 3426 * @force - force the commit
3388 * and add space, we just check to see if the amount of unused space is >= the
3389 * total space, meaning that our reservation is valid.
3390 * 3427 *
3391 * However if we don't intend to retry this reservation, pass -1 as retries so 3428 * This will check to make sure that committing the transaction will actually
3392 * that it short circuits this logic. 3429 * get us somewhere and then commit the transaction if it does. Otherwise it
3430 * will return -ENOSPC.
3393 */ 3431 */
3394static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, 3432static int may_commit_transaction(struct btrfs_root *root,
3395 struct btrfs_root *root, 3433 struct btrfs_space_info *space_info,
3434 u64 bytes, int force)
3435{
3436 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3437 struct btrfs_trans_handle *trans;
3438
3439 trans = (struct btrfs_trans_handle *)current->journal_info;
3440 if (trans)
3441 return -EAGAIN;
3442
3443 if (force)
3444 goto commit;
3445
3446 /* See if there is enough pinned space to make this reservation */
3447 spin_lock(&space_info->lock);
3448 if (space_info->bytes_pinned >= bytes) {
3449 spin_unlock(&space_info->lock);
3450 goto commit;
3451 }
3452 spin_unlock(&space_info->lock);
3453
3454 /*
3455 * See if there is some space in the delayed insertion reservation for
3456 * this reservation.
3457 */
3458 if (space_info != delayed_rsv->space_info)
3459 return -ENOSPC;
3460
3461 spin_lock(&delayed_rsv->lock);
3462 if (delayed_rsv->size < bytes) {
3463 spin_unlock(&delayed_rsv->lock);
3464 return -ENOSPC;
3465 }
3466 spin_unlock(&delayed_rsv->lock);
3467
3468commit:
3469 trans = btrfs_join_transaction(root);
3470 if (IS_ERR(trans))
3471 return -ENOSPC;
3472
3473 return btrfs_commit_transaction(trans, root);
3474}
3475
3476/**
3477 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3478 * @root - the root we're allocating for
3479 * @block_rsv - the block_rsv we're allocating for
3480 * @orig_bytes - the number of bytes we want
3481 * @flush - wether or not we can flush to make our reservation
3482 *
3483 * This will reserve orgi_bytes number of bytes from the space info associated
3484 * with the block_rsv. If there is not enough space it will make an attempt to
3485 * flush out space to make room. It will do this by flushing delalloc if
3486 * possible or committing the transaction. If flush is 0 then no attempts to
3487 * regain reservations will be made and this will fail if there is not enough
3488 * space already.
3489 */
3490static int reserve_metadata_bytes(struct btrfs_root *root,
3396 struct btrfs_block_rsv *block_rsv, 3491 struct btrfs_block_rsv *block_rsv,
3397 u64 orig_bytes, int flush) 3492 u64 orig_bytes, int flush)
3398{ 3493{
3399 struct btrfs_space_info *space_info = block_rsv->space_info; 3494 struct btrfs_space_info *space_info = block_rsv->space_info;
3400 u64 unused; 3495 u64 used;
3401 u64 num_bytes = orig_bytes; 3496 u64 num_bytes = orig_bytes;
3402 int retries = 0; 3497 int retries = 0;
3403 int ret = 0; 3498 int ret = 0;
3404 bool committed = false; 3499 bool committed = false;
3405 bool flushing = false; 3500 bool flushing = false;
3501 bool wait_ordered = false;
3406 3502
3407again: 3503again:
3408 ret = 0; 3504 ret = 0;
@@ -3419,7 +3515,7 @@ again:
3419 * deadlock since we are waiting for the flusher to finish, but 3515 * deadlock since we are waiting for the flusher to finish, but
3420 * hold the current transaction open. 3516 * hold the current transaction open.
3421 */ 3517 */
3422 if (trans) 3518 if (current->journal_info)
3423 return -EAGAIN; 3519 return -EAGAIN;
3424 ret = wait_event_interruptible(space_info->wait, 3520 ret = wait_event_interruptible(space_info->wait,
3425 !space_info->flush); 3521 !space_info->flush);
@@ -3431,9 +3527,9 @@ again:
3431 } 3527 }
3432 3528
3433 ret = -ENOSPC; 3529 ret = -ENOSPC;
3434 unused = space_info->bytes_used + space_info->bytes_reserved + 3530 used = space_info->bytes_used + space_info->bytes_reserved +
3435 space_info->bytes_pinned + space_info->bytes_readonly + 3531 space_info->bytes_pinned + space_info->bytes_readonly +
3436 space_info->bytes_may_use; 3532 space_info->bytes_may_use;
3437 3533
3438 /* 3534 /*
3439 * The idea here is that we've not already over-reserved the block group 3535 * The idea here is that we've not already over-reserved the block group
@@ -3442,10 +3538,9 @@ again:
3442 * lets start flushing stuff first and then come back and try to make 3538 * lets start flushing stuff first and then come back and try to make
3443 * our reservation. 3539 * our reservation.
3444 */ 3540 */
3445 if (unused <= space_info->total_bytes) { 3541 if (used <= space_info->total_bytes) {
3446 unused = space_info->total_bytes - unused; 3542 if (used + orig_bytes <= space_info->total_bytes) {
3447 if (unused >= num_bytes) { 3543 space_info->bytes_may_use += orig_bytes;
3448 space_info->bytes_reserved += orig_bytes;
3449 ret = 0; 3544 ret = 0;
3450 } else { 3545 } else {
3451 /* 3546 /*
@@ -3461,10 +3556,64 @@ again:
3461 * amount plus the amount of bytes that we need for this 3556 * amount plus the amount of bytes that we need for this
3462 * reservation. 3557 * reservation.
3463 */ 3558 */
3464 num_bytes = unused - space_info->total_bytes + 3559 wait_ordered = true;
3560 num_bytes = used - space_info->total_bytes +
3465 (orig_bytes * (retries + 1)); 3561 (orig_bytes * (retries + 1));
3466 } 3562 }
3467 3563
3564 if (ret) {
3565 u64 profile = btrfs_get_alloc_profile(root, 0);
3566 u64 avail;
3567
3568 /*
3569 * If we have a lot of space that's pinned, don't bother doing
3570 * the overcommit dance yet and just commit the transaction.
3571 */
3572 avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3573 do_div(avail, 10);
3574 if (space_info->bytes_pinned >= avail && flush && !committed) {
3575 space_info->flush = 1;
3576 flushing = true;
3577 spin_unlock(&space_info->lock);
3578 ret = may_commit_transaction(root, space_info,
3579 orig_bytes, 1);
3580 if (ret)
3581 goto out;
3582 committed = true;
3583 goto again;
3584 }
3585
3586 spin_lock(&root->fs_info->free_chunk_lock);
3587 avail = root->fs_info->free_chunk_space;
3588
3589 /*
3590 * If we have dup, raid1 or raid10 then only half of the free
3591 * space is actually useable.
3592 */
3593 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3594 BTRFS_BLOCK_GROUP_RAID1 |
3595 BTRFS_BLOCK_GROUP_RAID10))
3596 avail >>= 1;
3597
3598 /*
3599 * If we aren't flushing don't let us overcommit too much, say
3600 * 1/8th of the space. If we can flush, let it overcommit up to
3601 * 1/2 of the space.
3602 */
3603 if (flush)
3604 avail >>= 3;
3605 else
3606 avail >>= 1;
3607 spin_unlock(&root->fs_info->free_chunk_lock);
3608
3609 if (used + num_bytes < space_info->total_bytes + avail) {
3610 space_info->bytes_may_use += orig_bytes;
3611 ret = 0;
3612 } else {
3613 wait_ordered = true;
3614 }
3615 }
3616
3468 /* 3617 /*
3469 * Couldn't make our reservation, save our place so while we're trying 3618 * Couldn't make our reservation, save our place so while we're trying
3470 * to reclaim space we can actually use it instead of somebody else 3619 * to reclaim space we can actually use it instead of somebody else
@@ -3484,7 +3633,7 @@ again:
3484 * We do synchronous shrinking since we don't actually unreserve 3633 * We do synchronous shrinking since we don't actually unreserve
3485 * metadata until after the IO is completed. 3634 * metadata until after the IO is completed.
3486 */ 3635 */
3487 ret = shrink_delalloc(trans, root, num_bytes, 1); 3636 ret = shrink_delalloc(root, num_bytes, wait_ordered);
3488 if (ret < 0) 3637 if (ret < 0)
3489 goto out; 3638 goto out;
3490 3639
@@ -3496,35 +3645,17 @@ again:
3496 * so go back around and try again. 3645 * so go back around and try again.
3497 */ 3646 */
3498 if (retries < 2) { 3647 if (retries < 2) {
3648 wait_ordered = true;
3499 retries++; 3649 retries++;
3500 goto again; 3650 goto again;
3501 } 3651 }
3502 3652
3503 /*
3504 * Not enough space to be reclaimed, don't bother committing the
3505 * transaction.
3506 */
3507 spin_lock(&space_info->lock);
3508 if (space_info->bytes_pinned < orig_bytes)
3509 ret = -ENOSPC;
3510 spin_unlock(&space_info->lock);
3511 if (ret)
3512 goto out;
3513
3514 ret = -EAGAIN;
3515 if (trans)
3516 goto out;
3517
3518 ret = -ENOSPC; 3653 ret = -ENOSPC;
3519 if (committed) 3654 if (committed)
3520 goto out; 3655 goto out;
3521 3656
3522 trans = btrfs_join_transaction(root); 3657 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3523 if (IS_ERR(trans))
3524 goto out;
3525 ret = btrfs_commit_transaction(trans, root);
3526 if (!ret) { 3658 if (!ret) {
3527 trans = NULL;
3528 committed = true; 3659 committed = true;
3529 goto again; 3660 goto again;
3530 } 3661 }
@@ -3542,10 +3673,12 @@ out:
3542static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, 3673static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3543 struct btrfs_root *root) 3674 struct btrfs_root *root)
3544{ 3675{
3545 struct btrfs_block_rsv *block_rsv; 3676 struct btrfs_block_rsv *block_rsv = NULL;
3546 if (root->ref_cows) 3677
3678 if (root->ref_cows || root == root->fs_info->csum_root)
3547 block_rsv = trans->block_rsv; 3679 block_rsv = trans->block_rsv;
3548 else 3680
3681 if (!block_rsv)
3549 block_rsv = root->block_rsv; 3682 block_rsv = root->block_rsv;
3550 3683
3551 if (!block_rsv) 3684 if (!block_rsv)
@@ -3616,7 +3749,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3616 } 3749 }
3617 if (num_bytes) { 3750 if (num_bytes) {
3618 spin_lock(&space_info->lock); 3751 spin_lock(&space_info->lock);
3619 space_info->bytes_reserved -= num_bytes; 3752 space_info->bytes_may_use -= num_bytes;
3620 space_info->reservation_progress++; 3753 space_info->reservation_progress++;
3621 spin_unlock(&space_info->lock); 3754 spin_unlock(&space_info->lock);
3622 } 3755 }
@@ -3640,9 +3773,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3640{ 3773{
3641 memset(rsv, 0, sizeof(*rsv)); 3774 memset(rsv, 0, sizeof(*rsv));
3642 spin_lock_init(&rsv->lock); 3775 spin_lock_init(&rsv->lock);
3643 atomic_set(&rsv->usage, 1);
3644 rsv->priority = 6;
3645 INIT_LIST_HEAD(&rsv->list);
3646} 3776}
3647 3777
3648struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) 3778struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
@@ -3663,38 +3793,20 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3663void btrfs_free_block_rsv(struct btrfs_root *root, 3793void btrfs_free_block_rsv(struct btrfs_root *root,
3664 struct btrfs_block_rsv *rsv) 3794 struct btrfs_block_rsv *rsv)
3665{ 3795{
3666 if (rsv && atomic_dec_and_test(&rsv->usage)) { 3796 btrfs_block_rsv_release(root, rsv, (u64)-1);
3667 btrfs_block_rsv_release(root, rsv, (u64)-1); 3797 kfree(rsv);
3668 if (!rsv->durable)
3669 kfree(rsv);
3670 }
3671}
3672
3673/*
3674 * make the block_rsv struct be able to capture freed space.
3675 * the captured space will re-add to the the block_rsv struct
3676 * after transaction commit
3677 */
3678void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3679 struct btrfs_block_rsv *block_rsv)
3680{
3681 block_rsv->durable = 1;
3682 mutex_lock(&fs_info->durable_block_rsv_mutex);
3683 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3684 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3685} 3798}
3686 3799
3687int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, 3800static inline int __block_rsv_add(struct btrfs_root *root,
3688 struct btrfs_root *root, 3801 struct btrfs_block_rsv *block_rsv,
3689 struct btrfs_block_rsv *block_rsv, 3802 u64 num_bytes, int flush)
3690 u64 num_bytes)
3691{ 3803{
3692 int ret; 3804 int ret;
3693 3805
3694 if (num_bytes == 0) 3806 if (num_bytes == 0)
3695 return 0; 3807 return 0;
3696 3808
3697 ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); 3809 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
3698 if (!ret) { 3810 if (!ret) {
3699 block_rsv_add_bytes(block_rsv, num_bytes, 1); 3811 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3700 return 0; 3812 return 0;
@@ -3703,55 +3815,66 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3703 return ret; 3815 return ret;
3704} 3816}
3705 3817
3706int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 3818int btrfs_block_rsv_add(struct btrfs_root *root,
3707 struct btrfs_root *root, 3819 struct btrfs_block_rsv *block_rsv,
3708 struct btrfs_block_rsv *block_rsv, 3820 u64 num_bytes)
3709 u64 min_reserved, int min_factor) 3821{
3822 return __block_rsv_add(root, block_rsv, num_bytes, 1);
3823}
3824
3825int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
3826 struct btrfs_block_rsv *block_rsv,
3827 u64 num_bytes)
3828{
3829 return __block_rsv_add(root, block_rsv, num_bytes, 0);
3830}
3831
3832int btrfs_block_rsv_check(struct btrfs_root *root,
3833 struct btrfs_block_rsv *block_rsv, int min_factor)
3710{ 3834{
3711 u64 num_bytes = 0; 3835 u64 num_bytes = 0;
3712 int commit_trans = 0;
3713 int ret = -ENOSPC; 3836 int ret = -ENOSPC;
3714 3837
3715 if (!block_rsv) 3838 if (!block_rsv)
3716 return 0; 3839 return 0;
3717 3840
3718 spin_lock(&block_rsv->lock); 3841 spin_lock(&block_rsv->lock);
3719 if (min_factor > 0) 3842 num_bytes = div_factor(block_rsv->size, min_factor);
3720 num_bytes = div_factor(block_rsv->size, min_factor); 3843 if (block_rsv->reserved >= num_bytes)
3721 if (min_reserved > num_bytes) 3844 ret = 0;
3722 num_bytes = min_reserved; 3845 spin_unlock(&block_rsv->lock);
3723 3846
3724 if (block_rsv->reserved >= num_bytes) { 3847 return ret;
3848}
3849
3850int btrfs_block_rsv_refill(struct btrfs_root *root,
3851 struct btrfs_block_rsv *block_rsv,
3852 u64 min_reserved)
3853{
3854 u64 num_bytes = 0;
3855 int ret = -ENOSPC;
3856
3857 if (!block_rsv)
3858 return 0;
3859
3860 spin_lock(&block_rsv->lock);
3861 num_bytes = min_reserved;
3862 if (block_rsv->reserved >= num_bytes)
3725 ret = 0; 3863 ret = 0;
3726 } else { 3864 else
3727 num_bytes -= block_rsv->reserved; 3865 num_bytes -= block_rsv->reserved;
3728 if (block_rsv->durable &&
3729 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3730 commit_trans = 1;
3731 }
3732 spin_unlock(&block_rsv->lock); 3866 spin_unlock(&block_rsv->lock);
3867
3733 if (!ret) 3868 if (!ret)
3734 return 0; 3869 return 0;
3735 3870
3736 if (block_rsv->refill_used) { 3871 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
3737 ret = reserve_metadata_bytes(trans, root, block_rsv, 3872 if (!ret) {
3738 num_bytes, 0); 3873 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3739 if (!ret) {
3740 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3741 return 0;
3742 }
3743 }
3744
3745 if (commit_trans) {
3746 if (trans)
3747 return -EAGAIN;
3748 trans = btrfs_join_transaction(root);
3749 BUG_ON(IS_ERR(trans));
3750 ret = btrfs_commit_transaction(trans, root);
3751 return 0; 3874 return 0;
3752 } 3875 }
3753 3876
3754 return -ENOSPC; 3877 return ret;
3755} 3878}
3756 3879
3757int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3880int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
@@ -3783,7 +3906,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3783 u64 num_bytes; 3906 u64 num_bytes;
3784 u64 meta_used; 3907 u64 meta_used;
3785 u64 data_used; 3908 u64 data_used;
3786 int csum_size = btrfs_super_csum_size(&fs_info->super_copy); 3909 int csum_size = btrfs_super_csum_size(fs_info->super_copy);
3787 3910
3788 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 3911 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3789 spin_lock(&sinfo->lock); 3912 spin_lock(&sinfo->lock);
@@ -3827,12 +3950,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3827 if (sinfo->total_bytes > num_bytes) { 3950 if (sinfo->total_bytes > num_bytes) {
3828 num_bytes = sinfo->total_bytes - num_bytes; 3951 num_bytes = sinfo->total_bytes - num_bytes;
3829 block_rsv->reserved += num_bytes; 3952 block_rsv->reserved += num_bytes;
3830 sinfo->bytes_reserved += num_bytes; 3953 sinfo->bytes_may_use += num_bytes;
3831 } 3954 }
3832 3955
3833 if (block_rsv->reserved >= block_rsv->size) { 3956 if (block_rsv->reserved >= block_rsv->size) {
3834 num_bytes = block_rsv->reserved - block_rsv->size; 3957 num_bytes = block_rsv->reserved - block_rsv->size;
3835 sinfo->bytes_reserved -= num_bytes; 3958 sinfo->bytes_may_use -= num_bytes;
3836 sinfo->reservation_progress++; 3959 sinfo->reservation_progress++;
3837 block_rsv->reserved = block_rsv->size; 3960 block_rsv->reserved = block_rsv->size;
3838 block_rsv->full = 1; 3961 block_rsv->full = 1;
@@ -3848,16 +3971,13 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3848 3971
3849 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3972 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3850 fs_info->chunk_block_rsv.space_info = space_info; 3973 fs_info->chunk_block_rsv.space_info = space_info;
3851 fs_info->chunk_block_rsv.priority = 10;
3852 3974
3853 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 3975 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3854 fs_info->global_block_rsv.space_info = space_info; 3976 fs_info->global_block_rsv.space_info = space_info;
3855 fs_info->global_block_rsv.priority = 10;
3856 fs_info->global_block_rsv.refill_used = 1;
3857 fs_info->delalloc_block_rsv.space_info = space_info; 3977 fs_info->delalloc_block_rsv.space_info = space_info;
3858 fs_info->trans_block_rsv.space_info = space_info; 3978 fs_info->trans_block_rsv.space_info = space_info;
3859 fs_info->empty_block_rsv.space_info = space_info; 3979 fs_info->empty_block_rsv.space_info = space_info;
3860 fs_info->empty_block_rsv.priority = 10; 3980 fs_info->delayed_block_rsv.space_info = space_info;
3861 3981
3862 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 3982 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3863 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 3983 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
@@ -3865,10 +3985,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3865 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 3985 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3866 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 3986 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3867 3987
3868 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3869
3870 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3871
3872 update_global_block_rsv(fs_info); 3988 update_global_block_rsv(fs_info);
3873} 3989}
3874 3990
@@ -3881,37 +3997,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3881 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 3997 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3882 WARN_ON(fs_info->chunk_block_rsv.size > 0); 3998 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3883 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 3999 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3884} 4000 WARN_ON(fs_info->delayed_block_rsv.size > 0);
3885 4001 WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
3886int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
3887 struct btrfs_root *root,
3888 struct btrfs_block_rsv *rsv)
3889{
3890 struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
3891 u64 num_bytes;
3892 int ret;
3893
3894 /*
3895 * Truncate should be freeing data, but give us 2 items just in case it
3896 * needs to use some space. We may want to be smarter about this in the
3897 * future.
3898 */
3899 num_bytes = btrfs_calc_trans_metadata_size(root, 2);
3900
3901 /* We already have enough bytes, just return */
3902 if (rsv->reserved >= num_bytes)
3903 return 0;
3904
3905 num_bytes -= rsv->reserved;
3906
3907 /*
3908 * You should have reserved enough space before hand to do this, so this
3909 * should not fail.
3910 */
3911 ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
3912 BUG_ON(ret);
3913
3914 return 0;
3915} 4002}
3916 4003
3917void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4004void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -3920,9 +4007,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3920 if (!trans->bytes_reserved) 4007 if (!trans->bytes_reserved)
3921 return; 4008 return;
3922 4009
3923 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); 4010 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
3924 btrfs_block_rsv_release(root, trans->block_rsv,
3925 trans->bytes_reserved);
3926 trans->bytes_reserved = 0; 4011 trans->bytes_reserved = 0;
3927} 4012}
3928 4013
@@ -3964,33 +4049,99 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3964 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4049 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3965} 4050}
3966 4051
4052/**
4053 * drop_outstanding_extent - drop an outstanding extent
4054 * @inode: the inode we're dropping the extent for
4055 *
4056 * This is called when we are freeing up an outstanding extent, either called
4057 * after an error or after an extent is written. This will return the number of
4058 * reserved extents that need to be freed. This must be called with
4059 * BTRFS_I(inode)->lock held.
4060 */
3967static unsigned drop_outstanding_extent(struct inode *inode) 4061static unsigned drop_outstanding_extent(struct inode *inode)
3968{ 4062{
4063 unsigned drop_inode_space = 0;
3969 unsigned dropped_extents = 0; 4064 unsigned dropped_extents = 0;
3970 4065
3971 spin_lock(&BTRFS_I(inode)->lock);
3972 BUG_ON(!BTRFS_I(inode)->outstanding_extents); 4066 BUG_ON(!BTRFS_I(inode)->outstanding_extents);
3973 BTRFS_I(inode)->outstanding_extents--; 4067 BTRFS_I(inode)->outstanding_extents--;
3974 4068
4069 if (BTRFS_I(inode)->outstanding_extents == 0 &&
4070 BTRFS_I(inode)->delalloc_meta_reserved) {
4071 drop_inode_space = 1;
4072 BTRFS_I(inode)->delalloc_meta_reserved = 0;
4073 }
4074
3975 /* 4075 /*
3976 * If we have more or the same amount of outsanding extents than we have 4076 * If we have more or the same amount of outsanding extents than we have
3977 * reserved then we need to leave the reserved extents count alone. 4077 * reserved then we need to leave the reserved extents count alone.
3978 */ 4078 */
3979 if (BTRFS_I(inode)->outstanding_extents >= 4079 if (BTRFS_I(inode)->outstanding_extents >=
3980 BTRFS_I(inode)->reserved_extents) 4080 BTRFS_I(inode)->reserved_extents)
3981 goto out; 4081 return drop_inode_space;
3982 4082
3983 dropped_extents = BTRFS_I(inode)->reserved_extents - 4083 dropped_extents = BTRFS_I(inode)->reserved_extents -
3984 BTRFS_I(inode)->outstanding_extents; 4084 BTRFS_I(inode)->outstanding_extents;
3985 BTRFS_I(inode)->reserved_extents -= dropped_extents; 4085 BTRFS_I(inode)->reserved_extents -= dropped_extents;
3986out: 4086 return dropped_extents + drop_inode_space;
3987 spin_unlock(&BTRFS_I(inode)->lock);
3988 return dropped_extents;
3989} 4087}
3990 4088
3991static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) 4089/**
4090 * calc_csum_metadata_size - return the amount of metada space that must be
4091 * reserved/free'd for the given bytes.
4092 * @inode: the inode we're manipulating
4093 * @num_bytes: the number of bytes in question
4094 * @reserve: 1 if we are reserving space, 0 if we are freeing space
4095 *
4096 * This adjusts the number of csum_bytes in the inode and then returns the
4097 * correct amount of metadata that must either be reserved or freed. We
4098 * calculate how many checksums we can fit into one leaf and then divide the
4099 * number of bytes that will need to be checksumed by this value to figure out
4100 * how many checksums will be required. If we are adding bytes then the number
4101 * may go up and we will return the number of additional bytes that must be
4102 * reserved. If it is going down we will return the number of bytes that must
4103 * be freed.
4104 *
4105 * This must be called with BTRFS_I(inode)->lock held.
4106 */
4107static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4108 int reserve)
3992{ 4109{
3993 return num_bytes >>= 3; 4110 struct btrfs_root *root = BTRFS_I(inode)->root;
4111 u64 csum_size;
4112 int num_csums_per_leaf;
4113 int num_csums;
4114 int old_csums;
4115
4116 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4117 BTRFS_I(inode)->csum_bytes == 0)
4118 return 0;
4119
4120 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4121 if (reserve)
4122 BTRFS_I(inode)->csum_bytes += num_bytes;
4123 else
4124 BTRFS_I(inode)->csum_bytes -= num_bytes;
4125 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4126 num_csums_per_leaf = (int)div64_u64(csum_size,
4127 sizeof(struct btrfs_csum_item) +
4128 sizeof(struct btrfs_disk_key));
4129 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4130 num_csums = num_csums + num_csums_per_leaf - 1;
4131 num_csums = num_csums / num_csums_per_leaf;
4132
4133 old_csums = old_csums + num_csums_per_leaf - 1;
4134 old_csums = old_csums / num_csums_per_leaf;
4135
4136 /* No change, no need to reserve more */
4137 if (old_csums == num_csums)
4138 return 0;
4139
4140 if (reserve)
4141 return btrfs_calc_trans_metadata_size(root,
4142 num_csums - old_csums);
4143
4144 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
3994} 4145}
3995 4146
3996int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 4147int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
@@ -3999,9 +4150,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3999 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4150 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4000 u64 to_reserve = 0; 4151 u64 to_reserve = 0;
4001 unsigned nr_extents = 0; 4152 unsigned nr_extents = 0;
4153 int flush = 1;
4002 int ret; 4154 int ret;
4003 4155
4004 if (btrfs_transaction_in_commit(root->fs_info)) 4156 if (btrfs_is_free_space_inode(root, inode))
4157 flush = 0;
4158
4159 if (flush && btrfs_transaction_in_commit(root->fs_info))
4005 schedule_timeout(1); 4160 schedule_timeout(1);
4006 4161
4007 num_bytes = ALIGN(num_bytes, root->sectorsize); 4162 num_bytes = ALIGN(num_bytes, root->sectorsize);
@@ -4014,21 +4169,41 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4014 nr_extents = BTRFS_I(inode)->outstanding_extents - 4169 nr_extents = BTRFS_I(inode)->outstanding_extents -
4015 BTRFS_I(inode)->reserved_extents; 4170 BTRFS_I(inode)->reserved_extents;
4016 BTRFS_I(inode)->reserved_extents += nr_extents; 4171 BTRFS_I(inode)->reserved_extents += nr_extents;
4172 }
4017 4173
4018 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4174 /*
4175 * Add an item to reserve for updating the inode when we complete the
4176 * delalloc io.
4177 */
4178 if (!BTRFS_I(inode)->delalloc_meta_reserved) {
4179 nr_extents++;
4180 BTRFS_I(inode)->delalloc_meta_reserved = 1;
4019 } 4181 }
4182
4183 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4184 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4020 spin_unlock(&BTRFS_I(inode)->lock); 4185 spin_unlock(&BTRFS_I(inode)->lock);
4021 4186
4022 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4187 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4023 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
4024 if (ret) { 4188 if (ret) {
4189 u64 to_free = 0;
4025 unsigned dropped; 4190 unsigned dropped;
4191
4192 spin_lock(&BTRFS_I(inode)->lock);
4193 dropped = drop_outstanding_extent(inode);
4194 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4195 spin_unlock(&BTRFS_I(inode)->lock);
4196 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4197
4026 /* 4198 /*
4027 * We don't need the return value since our reservation failed, 4199 * Somebody could have come in and twiddled with the
4028 * we just need to clean up our counter. 4200 * reservation, so if we have to free more than we would have
4201 * reserved from this reservation go ahead and release those
4202 * bytes.
4029 */ 4203 */
4030 dropped = drop_outstanding_extent(inode); 4204 to_free -= to_reserve;
4031 WARN_ON(dropped > 1); 4205 if (to_free)
4206 btrfs_block_rsv_release(root, block_rsv, to_free);
4032 return ret; 4207 return ret;
4033 } 4208 }
4034 4209
@@ -4037,6 +4212,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4037 return 0; 4212 return 0;
4038} 4213}
4039 4214
4215/**
4216 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4217 * @inode: the inode to release the reservation for
4218 * @num_bytes: the number of bytes we're releasing
4219 *
4220 * This will release the metadata reservation for an inode. This can be called
4221 * once we complete IO for a given set of bytes to release their metadata
4222 * reservations.
4223 */
4040void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 4224void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4041{ 4225{
4042 struct btrfs_root *root = BTRFS_I(inode)->root; 4226 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4044,9 +4228,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4044 unsigned dropped; 4228 unsigned dropped;
4045 4229
4046 num_bytes = ALIGN(num_bytes, root->sectorsize); 4230 num_bytes = ALIGN(num_bytes, root->sectorsize);
4231 spin_lock(&BTRFS_I(inode)->lock);
4047 dropped = drop_outstanding_extent(inode); 4232 dropped = drop_outstanding_extent(inode);
4048 4233
4049 to_free = calc_csum_metadata_size(inode, num_bytes); 4234 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4235 spin_unlock(&BTRFS_I(inode)->lock);
4050 if (dropped > 0) 4236 if (dropped > 0)
4051 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4237 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4052 4238
@@ -4054,6 +4240,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4054 to_free); 4240 to_free);
4055} 4241}
4056 4242
4243/**
4244 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4245 * @inode: inode we're writing to
4246 * @num_bytes: the number of bytes we want to allocate
4247 *
4248 * This will do the following things
4249 *
4250 * o reserve space in the data space info for num_bytes
4251 * o reserve space in the metadata space info based on number of outstanding
4252 * extents and how much csums will be needed
4253 * o add to the inodes ->delalloc_bytes
4254 * o add it to the fs_info's delalloc inodes list.
4255 *
4256 * This will return 0 for success and -ENOSPC if there is no space left.
4257 */
4057int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 4258int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4058{ 4259{
4059 int ret; 4260 int ret;
@@ -4071,6 +4272,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4071 return 0; 4272 return 0;
4072} 4273}
4073 4274
4275/**
4276 * btrfs_delalloc_release_space - release data and metadata space for delalloc
4277 * @inode: inode we're releasing space for
4278 * @num_bytes: the number of bytes we want to free up
4279 *
4280 * This must be matched with a call to btrfs_delalloc_reserve_space. This is
4281 * called in the case that we don't need the metadata AND data reservations
4282 * anymore. So if there is an error or we insert an inline extent.
4283 *
4284 * This function will release the metadata space that was not used and will
4285 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4286 * list if there are no delalloc bytes left.
4287 */
4074void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 4288void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4075{ 4289{
4076 btrfs_delalloc_release_metadata(inode, num_bytes); 4290 btrfs_delalloc_release_metadata(inode, num_bytes);
@@ -4090,12 +4304,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4090 4304
4091 /* block accounting for super block */ 4305 /* block accounting for super block */
4092 spin_lock(&info->delalloc_lock); 4306 spin_lock(&info->delalloc_lock);
4093 old_val = btrfs_super_bytes_used(&info->super_copy); 4307 old_val = btrfs_super_bytes_used(info->super_copy);
4094 if (alloc) 4308 if (alloc)
4095 old_val += num_bytes; 4309 old_val += num_bytes;
4096 else 4310 else
4097 old_val -= num_bytes; 4311 old_val -= num_bytes;
4098 btrfs_set_super_bytes_used(&info->super_copy, old_val); 4312 btrfs_set_super_bytes_used(info->super_copy, old_val);
4099 spin_unlock(&info->delalloc_lock); 4313 spin_unlock(&info->delalloc_lock);
4100 4314
4101 while (total) { 4315 while (total) {
@@ -4123,7 +4337,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4123 spin_lock(&cache->space_info->lock); 4337 spin_lock(&cache->space_info->lock);
4124 spin_lock(&cache->lock); 4338 spin_lock(&cache->lock);
4125 4339
4126 if (btrfs_super_cache_generation(&info->super_copy) != 0 && 4340 if (btrfs_test_opt(root, SPACE_CACHE) &&
4127 cache->disk_cache_state < BTRFS_DC_CLEAR) 4341 cache->disk_cache_state < BTRFS_DC_CLEAR)
4128 cache->disk_cache_state = BTRFS_DC_CLEAR; 4342 cache->disk_cache_state = BTRFS_DC_CLEAR;
4129 4343
@@ -4135,7 +4349,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4135 btrfs_set_block_group_used(&cache->item, old_val); 4349 btrfs_set_block_group_used(&cache->item, old_val);
4136 cache->reserved -= num_bytes; 4350 cache->reserved -= num_bytes;
4137 cache->space_info->bytes_reserved -= num_bytes; 4351 cache->space_info->bytes_reserved -= num_bytes;
4138 cache->space_info->reservation_progress++;
4139 cache->space_info->bytes_used += num_bytes; 4352 cache->space_info->bytes_used += num_bytes;
4140 cache->space_info->disk_used += num_bytes * factor; 4353 cache->space_info->disk_used += num_bytes * factor;
4141 spin_unlock(&cache->lock); 4354 spin_unlock(&cache->lock);
@@ -4187,7 +4400,6 @@ static int pin_down_extent(struct btrfs_root *root,
4187 if (reserved) { 4400 if (reserved) {
4188 cache->reserved -= num_bytes; 4401 cache->reserved -= num_bytes;
4189 cache->space_info->bytes_reserved -= num_bytes; 4402 cache->space_info->bytes_reserved -= num_bytes;
4190 cache->space_info->reservation_progress++;
4191 } 4403 }
4192 spin_unlock(&cache->lock); 4404 spin_unlock(&cache->lock);
4193 spin_unlock(&cache->space_info->lock); 4405 spin_unlock(&cache->space_info->lock);
@@ -4215,45 +4427,82 @@ int btrfs_pin_extent(struct btrfs_root *root,
4215} 4427}
4216 4428
4217/* 4429/*
4218 * update size of reserved extents. this function may return -EAGAIN 4430 * this function must be called within transaction
4219 * if 'reserve' is true or 'sinfo' is false. 4431 */
4432int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4433 struct btrfs_root *root,
4434 u64 bytenr, u64 num_bytes)
4435{
4436 struct btrfs_block_group_cache *cache;
4437
4438 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4439 BUG_ON(!cache);
4440
4441 /*
4442 * pull in the free space cache (if any) so that our pin
4443 * removes the free space from the cache. We have load_only set
4444 * to one because the slow code to read in the free extents does check
4445 * the pinned extents.
4446 */
4447 cache_block_group(cache, trans, root, 1);
4448
4449 pin_down_extent(root, cache, bytenr, num_bytes, 0);
4450
4451 /* remove us from the free space cache (if we're there at all) */
4452 btrfs_remove_free_space(cache, bytenr, num_bytes);
4453 btrfs_put_block_group(cache);
4454 return 0;
4455}
4456
4457/**
4458 * btrfs_update_reserved_bytes - update the block_group and space info counters
4459 * @cache: The cache we are manipulating
4460 * @num_bytes: The number of bytes in question
4461 * @reserve: One of the reservation enums
4462 *
4463 * This is called by the allocator when it reserves space, or by somebody who is
4464 * freeing space that was never actually used on disk. For example if you
4465 * reserve some space for a new leaf in transaction A and before transaction A
4466 * commits you free that leaf, you call this with reserve set to 0 in order to
4467 * clear the reservation.
4468 *
4469 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
4470 * ENOSPC accounting. For data we handle the reservation through clearing the
4471 * delalloc bits in the io_tree. We have to do this since we could end up
4472 * allocating less disk space for the amount of data we have reserved in the
4473 * case of compression.
4474 *
4475 * If this is a reservation and the block group has become read only we cannot
4476 * make the reservation and return -EAGAIN, otherwise this function always
4477 * succeeds.
4220 */ 4478 */
4221int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 4479static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4222 u64 num_bytes, int reserve, int sinfo) 4480 u64 num_bytes, int reserve)
4223{ 4481{
4482 struct btrfs_space_info *space_info = cache->space_info;
4224 int ret = 0; 4483 int ret = 0;
4225 if (sinfo) { 4484 spin_lock(&space_info->lock);
4226 struct btrfs_space_info *space_info = cache->space_info; 4485 spin_lock(&cache->lock);
4227 spin_lock(&space_info->lock); 4486 if (reserve != RESERVE_FREE) {
4228 spin_lock(&cache->lock);
4229 if (reserve) {
4230 if (cache->ro) {
4231 ret = -EAGAIN;
4232 } else {
4233 cache->reserved += num_bytes;
4234 space_info->bytes_reserved += num_bytes;
4235 }
4236 } else {
4237 if (cache->ro)
4238 space_info->bytes_readonly += num_bytes;
4239 cache->reserved -= num_bytes;
4240 space_info->bytes_reserved -= num_bytes;
4241 space_info->reservation_progress++;
4242 }
4243 spin_unlock(&cache->lock);
4244 spin_unlock(&space_info->lock);
4245 } else {
4246 spin_lock(&cache->lock);
4247 if (cache->ro) { 4487 if (cache->ro) {
4248 ret = -EAGAIN; 4488 ret = -EAGAIN;
4249 } else { 4489 } else {
4250 if (reserve) 4490 cache->reserved += num_bytes;
4251 cache->reserved += num_bytes; 4491 space_info->bytes_reserved += num_bytes;
4252 else 4492 if (reserve == RESERVE_ALLOC) {
4253 cache->reserved -= num_bytes; 4493 BUG_ON(space_info->bytes_may_use < num_bytes);
4494 space_info->bytes_may_use -= num_bytes;
4495 }
4254 } 4496 }
4255 spin_unlock(&cache->lock); 4497 } else {
4498 if (cache->ro)
4499 space_info->bytes_readonly += num_bytes;
4500 cache->reserved -= num_bytes;
4501 space_info->bytes_reserved -= num_bytes;
4502 space_info->reservation_progress++;
4256 } 4503 }
4504 spin_unlock(&cache->lock);
4505 spin_unlock(&space_info->lock);
4257 return ret; 4506 return ret;
4258} 4507}
4259 4508
@@ -4319,13 +4568,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4319 spin_lock(&cache->lock); 4568 spin_lock(&cache->lock);
4320 cache->pinned -= len; 4569 cache->pinned -= len;
4321 cache->space_info->bytes_pinned -= len; 4570 cache->space_info->bytes_pinned -= len;
4322 if (cache->ro) { 4571 if (cache->ro)
4323 cache->space_info->bytes_readonly += len; 4572 cache->space_info->bytes_readonly += len;
4324 } else if (cache->reserved_pinned > 0) {
4325 len = min(len, cache->reserved_pinned);
4326 cache->reserved_pinned -= len;
4327 cache->space_info->bytes_reserved += len;
4328 }
4329 spin_unlock(&cache->lock); 4573 spin_unlock(&cache->lock);
4330 spin_unlock(&cache->space_info->lock); 4574 spin_unlock(&cache->space_info->lock);
4331 } 4575 }
@@ -4340,11 +4584,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4340{ 4584{
4341 struct btrfs_fs_info *fs_info = root->fs_info; 4585 struct btrfs_fs_info *fs_info = root->fs_info;
4342 struct extent_io_tree *unpin; 4586 struct extent_io_tree *unpin;
4343 struct btrfs_block_rsv *block_rsv;
4344 struct btrfs_block_rsv *next_rsv;
4345 u64 start; 4587 u64 start;
4346 u64 end; 4588 u64 end;
4347 int idx;
4348 int ret; 4589 int ret;
4349 4590
4350 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4591 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -4367,30 +4608,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4367 cond_resched(); 4608 cond_resched();
4368 } 4609 }
4369 4610
4370 mutex_lock(&fs_info->durable_block_rsv_mutex);
4371 list_for_each_entry_safe(block_rsv, next_rsv,
4372 &fs_info->durable_block_rsv_list, list) {
4373
4374 idx = trans->transid & 0x1;
4375 if (block_rsv->freed[idx] > 0) {
4376 block_rsv_add_bytes(block_rsv,
4377 block_rsv->freed[idx], 0);
4378 block_rsv->freed[idx] = 0;
4379 }
4380 if (atomic_read(&block_rsv->usage) == 0) {
4381 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
4382
4383 if (block_rsv->freed[0] == 0 &&
4384 block_rsv->freed[1] == 0) {
4385 list_del_init(&block_rsv->list);
4386 kfree(block_rsv);
4387 }
4388 } else {
4389 btrfs_block_rsv_release(root, block_rsv, 0);
4390 }
4391 }
4392 mutex_unlock(&fs_info->durable_block_rsv_mutex);
4393
4394 return 0; 4611 return 0;
4395} 4612}
4396 4613
@@ -4668,7 +4885,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4668 struct extent_buffer *buf, 4885 struct extent_buffer *buf,
4669 u64 parent, int last_ref) 4886 u64 parent, int last_ref)
4670{ 4887{
4671 struct btrfs_block_rsv *block_rsv;
4672 struct btrfs_block_group_cache *cache = NULL; 4888 struct btrfs_block_group_cache *cache = NULL;
4673 int ret; 4889 int ret;
4674 4890
@@ -4683,64 +4899,24 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4683 if (!last_ref) 4899 if (!last_ref)
4684 return; 4900 return;
4685 4901
4686 block_rsv = get_block_rsv(trans, root);
4687 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 4902 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4688 if (block_rsv->space_info != cache->space_info)
4689 goto out;
4690 4903
4691 if (btrfs_header_generation(buf) == trans->transid) { 4904 if (btrfs_header_generation(buf) == trans->transid) {
4692 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4905 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4693 ret = check_ref_cleanup(trans, root, buf->start); 4906 ret = check_ref_cleanup(trans, root, buf->start);
4694 if (!ret) 4907 if (!ret)
4695 goto pin; 4908 goto out;
4696 } 4909 }
4697 4910
4698 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 4911 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4699 pin_down_extent(root, cache, buf->start, buf->len, 1); 4912 pin_down_extent(root, cache, buf->start, buf->len, 1);
4700 goto pin; 4913 goto out;
4701 } 4914 }
4702 4915
4703 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 4916 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4704 4917
4705 btrfs_add_free_space(cache, buf->start, buf->len); 4918 btrfs_add_free_space(cache, buf->start, buf->len);
4706 ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); 4919 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
4707 if (ret == -EAGAIN) {
4708 /* block group became read-only */
4709 btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
4710 goto out;
4711 }
4712
4713 ret = 1;
4714 spin_lock(&block_rsv->lock);
4715 if (block_rsv->reserved < block_rsv->size) {
4716 block_rsv->reserved += buf->len;
4717 ret = 0;
4718 }
4719 spin_unlock(&block_rsv->lock);
4720
4721 if (ret) {
4722 spin_lock(&cache->space_info->lock);
4723 cache->space_info->bytes_reserved -= buf->len;
4724 cache->space_info->reservation_progress++;
4725 spin_unlock(&cache->space_info->lock);
4726 }
4727 goto out;
4728 }
4729pin:
4730 if (block_rsv->durable && !cache->ro) {
4731 ret = 0;
4732 spin_lock(&cache->lock);
4733 if (!cache->ro) {
4734 cache->reserved_pinned += buf->len;
4735 ret = 1;
4736 }
4737 spin_unlock(&cache->lock);
4738
4739 if (ret) {
4740 spin_lock(&block_rsv->lock);
4741 block_rsv->freed[trans->transid & 0x1] += buf->len;
4742 spin_unlock(&block_rsv->lock);
4743 }
4744 } 4920 }
4745out: 4921out:
4746 /* 4922 /*
@@ -4883,10 +5059,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4883 int last_ptr_loop = 0; 5059 int last_ptr_loop = 0;
4884 int loop = 0; 5060 int loop = 0;
4885 int index = 0; 5061 int index = 0;
5062 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5063 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
4886 bool found_uncached_bg = false; 5064 bool found_uncached_bg = false;
4887 bool failed_cluster_refill = false; 5065 bool failed_cluster_refill = false;
4888 bool failed_alloc = false; 5066 bool failed_alloc = false;
4889 bool use_cluster = true; 5067 bool use_cluster = true;
5068 bool have_caching_bg = false;
4890 u64 ideal_cache_percent = 0; 5069 u64 ideal_cache_percent = 0;
4891 u64 ideal_cache_offset = 0; 5070 u64 ideal_cache_offset = 0;
4892 5071
@@ -4969,6 +5148,7 @@ ideal_cache:
4969 } 5148 }
4970 } 5149 }
4971search: 5150search:
5151 have_caching_bg = false;
4972 down_read(&space_info->groups_sem); 5152 down_read(&space_info->groups_sem);
4973 list_for_each_entry(block_group, &space_info->block_groups[index], 5153 list_for_each_entry(block_group, &space_info->block_groups[index],
4974 list) { 5154 list) {
@@ -5177,6 +5357,8 @@ refill_cluster:
5177 failed_alloc = true; 5357 failed_alloc = true;
5178 goto have_block_group; 5358 goto have_block_group;
5179 } else if (!offset) { 5359 } else if (!offset) {
5360 if (!cached)
5361 have_caching_bg = true;
5180 goto loop; 5362 goto loop;
5181 } 5363 }
5182checks: 5364checks:
@@ -5202,8 +5384,8 @@ checks:
5202 search_start - offset); 5384 search_start - offset);
5203 BUG_ON(offset > search_start); 5385 BUG_ON(offset > search_start);
5204 5386
5205 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, 5387 ret = btrfs_update_reserved_bytes(block_group, num_bytes,
5206 (data & BTRFS_BLOCK_GROUP_DATA)); 5388 alloc_type);
5207 if (ret == -EAGAIN) { 5389 if (ret == -EAGAIN) {
5208 btrfs_add_free_space(block_group, offset, num_bytes); 5390 btrfs_add_free_space(block_group, offset, num_bytes);
5209 goto loop; 5391 goto loop;
@@ -5227,6 +5409,9 @@ loop:
5227 } 5409 }
5228 up_read(&space_info->groups_sem); 5410 up_read(&space_info->groups_sem);
5229 5411
5412 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
5413 goto search;
5414
5230 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 5415 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5231 goto search; 5416 goto search;
5232 5417
@@ -5325,7 +5510,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5325 int index = 0; 5510 int index = 0;
5326 5511
5327 spin_lock(&info->lock); 5512 spin_lock(&info->lock);
5328 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 5513 printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5514 (unsigned long long)info->flags,
5329 (unsigned long long)(info->total_bytes - info->bytes_used - 5515 (unsigned long long)(info->total_bytes - info->bytes_used -
5330 info->bytes_pinned - info->bytes_reserved - 5516 info->bytes_pinned - info->bytes_reserved -
5331 info->bytes_readonly), 5517 info->bytes_readonly),
@@ -5411,7 +5597,8 @@ again:
5411 return ret; 5597 return ret;
5412} 5598}
5413 5599
5414int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) 5600static int __btrfs_free_reserved_extent(struct btrfs_root *root,
5601 u64 start, u64 len, int pin)
5415{ 5602{
5416 struct btrfs_block_group_cache *cache; 5603 struct btrfs_block_group_cache *cache;
5417 int ret = 0; 5604 int ret = 0;
@@ -5426,8 +5613,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5426 if (btrfs_test_opt(root, DISCARD)) 5613 if (btrfs_test_opt(root, DISCARD))
5427 ret = btrfs_discard_extent(root, start, len, NULL); 5614 ret = btrfs_discard_extent(root, start, len, NULL);
5428 5615
5429 btrfs_add_free_space(cache, start, len); 5616 if (pin)
5430 btrfs_update_reserved_bytes(cache, len, 0, 1); 5617 pin_down_extent(root, cache, start, len, 1);
5618 else {
5619 btrfs_add_free_space(cache, start, len);
5620 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
5621 }
5431 btrfs_put_block_group(cache); 5622 btrfs_put_block_group(cache);
5432 5623
5433 trace_btrfs_reserved_extent_free(root, start, len); 5624 trace_btrfs_reserved_extent_free(root, start, len);
@@ -5435,6 +5626,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5435 return ret; 5626 return ret;
5436} 5627}
5437 5628
5629int btrfs_free_reserved_extent(struct btrfs_root *root,
5630 u64 start, u64 len)
5631{
5632 return __btrfs_free_reserved_extent(root, start, len, 0);
5633}
5634
5635int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
5636 u64 start, u64 len)
5637{
5638 return __btrfs_free_reserved_extent(root, start, len, 1);
5639}
5640
5438static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 5641static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5439 struct btrfs_root *root, 5642 struct btrfs_root *root,
5440 u64 parent, u64 root_objectid, 5643 u64 parent, u64 root_objectid,
@@ -5630,7 +5833,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5630 put_caching_control(caching_ctl); 5833 put_caching_control(caching_ctl);
5631 } 5834 }
5632 5835
5633 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); 5836 ret = btrfs_update_reserved_bytes(block_group, ins->offset,
5837 RESERVE_ALLOC_NO_ACCOUNT);
5634 BUG_ON(ret); 5838 BUG_ON(ret);
5635 btrfs_put_block_group(block_group); 5839 btrfs_put_block_group(block_group);
5636 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5840 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -5687,8 +5891,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5687 block_rsv = get_block_rsv(trans, root); 5891 block_rsv = get_block_rsv(trans, root);
5688 5892
5689 if (block_rsv->size == 0) { 5893 if (block_rsv->size == 0) {
5690 ret = reserve_metadata_bytes(trans, root, block_rsv, 5894 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5691 blocksize, 0);
5692 /* 5895 /*
5693 * If we couldn't reserve metadata bytes try and use some from 5896 * If we couldn't reserve metadata bytes try and use some from
5694 * the global reserve. 5897 * the global reserve.
@@ -5708,13 +5911,15 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5708 if (!ret) 5911 if (!ret)
5709 return block_rsv; 5912 return block_rsv;
5710 if (ret) { 5913 if (ret) {
5711 WARN_ON(1); 5914 static DEFINE_RATELIMIT_STATE(_rs,
5712 ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, 5915 DEFAULT_RATELIMIT_INTERVAL,
5713 0); 5916 /*DEFAULT_RATELIMIT_BURST*/ 2);
5917 if (__ratelimit(&_rs)) {
5918 printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
5919 WARN_ON(1);
5920 }
5921 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5714 if (!ret) { 5922 if (!ret) {
5715 spin_lock(&block_rsv->lock);
5716 block_rsv->size += blocksize;
5717 spin_unlock(&block_rsv->lock);
5718 return block_rsv; 5923 return block_rsv;
5719 } else if (ret && block_rsv != global_rsv) { 5924 } else if (ret && block_rsv != global_rsv) {
5720 ret = block_rsv_use_bytes(global_rsv, blocksize); 5925 ret = block_rsv_use_bytes(global_rsv, blocksize);
@@ -6592,12 +6797,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
6592 cache->bytes_super - btrfs_block_group_used(&cache->item); 6797 cache->bytes_super - btrfs_block_group_used(&cache->item);
6593 6798
6594 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 6799 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
6595 sinfo->bytes_may_use + sinfo->bytes_readonly + 6800 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
6596 cache->reserved_pinned + num_bytes + min_allocable_bytes <= 6801 min_allocable_bytes <= sinfo->total_bytes) {
6597 sinfo->total_bytes) {
6598 sinfo->bytes_readonly += num_bytes; 6802 sinfo->bytes_readonly += num_bytes;
6599 sinfo->bytes_reserved += cache->reserved_pinned;
6600 cache->reserved_pinned = 0;
6601 cache->ro = 1; 6803 cache->ro = 1;
6602 ret = 0; 6804 ret = 0;
6603 } 6805 }
@@ -6964,7 +7166,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
6964 struct btrfs_space_info, 7166 struct btrfs_space_info,
6965 list); 7167 list);
6966 if (space_info->bytes_pinned > 0 || 7168 if (space_info->bytes_pinned > 0 ||
6967 space_info->bytes_reserved > 0) { 7169 space_info->bytes_reserved > 0 ||
7170 space_info->bytes_may_use > 0) {
6968 WARN_ON(1); 7171 WARN_ON(1);
6969 dump_space_info(space_info, 0, 0); 7172 dump_space_info(space_info, 0, 0);
6970 } 7173 }
@@ -7006,14 +7209,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7006 return -ENOMEM; 7209 return -ENOMEM;
7007 path->reada = 1; 7210 path->reada = 1;
7008 7211
7009 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); 7212 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7010 if (cache_gen != 0 && 7213 if (btrfs_test_opt(root, SPACE_CACHE) &&
7011 btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) 7214 btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7012 need_clear = 1; 7215 need_clear = 1;
7013 if (btrfs_test_opt(root, CLEAR_CACHE)) 7216 if (btrfs_test_opt(root, CLEAR_CACHE))
7014 need_clear = 1; 7217 need_clear = 1;
7015 if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
7016 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
7017 7218
7018 while (1) { 7219 while (1) {
7019 ret = find_first_block_group(root, path, &key); 7220 ret = find_first_block_group(root, path, &key);
@@ -7252,7 +7453,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7252 goto out; 7453 goto out;
7253 } 7454 }
7254 7455
7255 inode = lookup_free_space_inode(root, block_group, path); 7456 inode = lookup_free_space_inode(tree_root, block_group, path);
7256 if (!IS_ERR(inode)) { 7457 if (!IS_ERR(inode)) {
7257 ret = btrfs_orphan_add(trans, inode); 7458 ret = btrfs_orphan_add(trans, inode);
7258 BUG_ON(ret); 7459 BUG_ON(ret);
@@ -7268,7 +7469,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7268 spin_unlock(&block_group->lock); 7469 spin_unlock(&block_group->lock);
7269 } 7470 }
7270 /* One for our lookup ref */ 7471 /* One for our lookup ref */
7271 iput(inode); 7472 btrfs_add_delayed_iput(inode);
7272 } 7473 }
7273 7474
7274 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 7475 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -7339,7 +7540,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
7339 int mixed = 0; 7540 int mixed = 0;
7340 int ret; 7541 int ret;
7341 7542
7342 disk_super = &fs_info->super_copy; 7543 disk_super = fs_info->super_copy;
7343 if (!btrfs_super_root(disk_super)) 7544 if (!btrfs_super_root(disk_super))
7344 return 1; 7545 return 1;
7345 7546
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d418164a35f..1f87c4d0e7a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
17#include "compat.h" 17#include "compat.h"
18#include "ctree.h" 18#include "ctree.h"
19#include "btrfs_inode.h" 19#include "btrfs_inode.h"
20#include "volumes.h"
20 21
21static struct kmem_cache *extent_state_cache; 22static struct kmem_cache *extent_state_cache;
22static struct kmem_cache *extent_buffer_cache; 23static struct kmem_cache *extent_buffer_cache;
@@ -894,6 +895,194 @@ search_again:
894 goto again; 895 goto again;
895} 896}
896 897
898/**
899 * convert_extent - convert all bits in a given range from one bit to another
900 * @tree: the io tree to search
901 * @start: the start offset in bytes
902 * @end: the end offset in bytes (inclusive)
903 * @bits: the bits to set in this range
904 * @clear_bits: the bits to clear in this range
905 * @mask: the allocation mask
906 *
907 * This will go through and set bits for the given range. If any states exist
908 * already in this range they are set with the given bit and cleared of the
909 * clear_bits. This is only meant to be used by things that are mergeable, ie
910 * converting from say DELALLOC to DIRTY. This is not meant to be used with
911 * boundary bits like LOCK.
912 */
913int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
914 int bits, int clear_bits, gfp_t mask)
915{
916 struct extent_state *state;
917 struct extent_state *prealloc = NULL;
918 struct rb_node *node;
919 int err = 0;
920 u64 last_start;
921 u64 last_end;
922
923again:
924 if (!prealloc && (mask & __GFP_WAIT)) {
925 prealloc = alloc_extent_state(mask);
926 if (!prealloc)
927 return -ENOMEM;
928 }
929
930 spin_lock(&tree->lock);
931 /*
932 * this search will find all the extents that end after
933 * our range starts.
934 */
935 node = tree_search(tree, start);
936 if (!node) {
937 prealloc = alloc_extent_state_atomic(prealloc);
938 if (!prealloc)
939 return -ENOMEM;
940 err = insert_state(tree, prealloc, start, end, &bits);
941 prealloc = NULL;
942 BUG_ON(err == -EEXIST);
943 goto out;
944 }
945 state = rb_entry(node, struct extent_state, rb_node);
946hit_next:
947 last_start = state->start;
948 last_end = state->end;
949
950 /*
951 * | ---- desired range ---- |
952 * | state |
953 *
954 * Just lock what we found and keep going
955 */
956 if (state->start == start && state->end <= end) {
957 struct rb_node *next_node;
958
959 set_state_bits(tree, state, &bits);
960 clear_state_bit(tree, state, &clear_bits, 0);
961
962 merge_state(tree, state);
963 if (last_end == (u64)-1)
964 goto out;
965
966 start = last_end + 1;
967 next_node = rb_next(&state->rb_node);
968 if (next_node && start < end && prealloc && !need_resched()) {
969 state = rb_entry(next_node, struct extent_state,
970 rb_node);
971 if (state->start == start)
972 goto hit_next;
973 }
974 goto search_again;
975 }
976
977 /*
978 * | ---- desired range ---- |
979 * | state |
980 * or
981 * | ------------- state -------------- |
982 *
983 * We need to split the extent we found, and may flip bits on
984 * second half.
985 *
986 * If the extent we found extends past our
987 * range, we just split and search again. It'll get split
988 * again the next time though.
989 *
990 * If the extent we found is inside our range, we set the
991 * desired bit on it.
992 */
993 if (state->start < start) {
994 prealloc = alloc_extent_state_atomic(prealloc);
995 if (!prealloc)
996 return -ENOMEM;
997 err = split_state(tree, state, prealloc, start);
998 BUG_ON(err == -EEXIST);
999 prealloc = NULL;
1000 if (err)
1001 goto out;
1002 if (state->end <= end) {
1003 set_state_bits(tree, state, &bits);
1004 clear_state_bit(tree, state, &clear_bits, 0);
1005 merge_state(tree, state);
1006 if (last_end == (u64)-1)
1007 goto out;
1008 start = last_end + 1;
1009 }
1010 goto search_again;
1011 }
1012 /*
1013 * | ---- desired range ---- |
1014 * | state | or | state |
1015 *
1016 * There's a hole, we need to insert something in it and
1017 * ignore the extent we found.
1018 */
1019 if (state->start > start) {
1020 u64 this_end;
1021 if (end < last_start)
1022 this_end = end;
1023 else
1024 this_end = last_start - 1;
1025
1026 prealloc = alloc_extent_state_atomic(prealloc);
1027 if (!prealloc)
1028 return -ENOMEM;
1029
1030 /*
1031 * Avoid to free 'prealloc' if it can be merged with
1032 * the later extent.
1033 */
1034 err = insert_state(tree, prealloc, start, this_end,
1035 &bits);
1036 BUG_ON(err == -EEXIST);
1037 if (err) {
1038 free_extent_state(prealloc);
1039 prealloc = NULL;
1040 goto out;
1041 }
1042 prealloc = NULL;
1043 start = this_end + 1;
1044 goto search_again;
1045 }
1046 /*
1047 * | ---- desired range ---- |
1048 * | state |
1049 * We need to split the extent, and set the bit
1050 * on the first half
1051 */
1052 if (state->start <= end && state->end > end) {
1053 prealloc = alloc_extent_state_atomic(prealloc);
1054 if (!prealloc)
1055 return -ENOMEM;
1056
1057 err = split_state(tree, state, prealloc, end + 1);
1058 BUG_ON(err == -EEXIST);
1059
1060 set_state_bits(tree, prealloc, &bits);
1061 clear_state_bit(tree, prealloc, &clear_bits, 0);
1062
1063 merge_state(tree, prealloc);
1064 prealloc = NULL;
1065 goto out;
1066 }
1067
1068 goto search_again;
1069
1070out:
1071 spin_unlock(&tree->lock);
1072 if (prealloc)
1073 free_extent_state(prealloc);
1074
1075 return err;
1076
1077search_again:
1078 if (start > end)
1079 goto out;
1080 spin_unlock(&tree->lock);
1081 if (mask & __GFP_WAIT)
1082 cond_resched();
1083 goto again;
1084}
1085
897/* wrappers around set/clear extent bit */ 1086/* wrappers around set/clear extent bit */
898int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1087int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
899 gfp_t mask) 1088 gfp_t mask)
@@ -919,7 +1108,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
919 struct extent_state **cached_state, gfp_t mask) 1108 struct extent_state **cached_state, gfp_t mask)
920{ 1109{
921 return set_extent_bit(tree, start, end, 1110 return set_extent_bit(tree, start, end,
922 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 1111 EXTENT_DELALLOC | EXTENT_UPTODATE,
923 0, NULL, cached_state, mask); 1112 0, NULL, cached_state, mask);
924} 1113}
925 1114
@@ -1599,6 +1788,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
1599 return 0; 1788 return 0;
1600} 1789}
1601 1790
1791/*
1792 * When IO fails, either with EIO or csum verification fails, we
1793 * try other mirrors that might have a good copy of the data. This
1794 * io_failure_record is used to record state as we go through all the
1795 * mirrors. If another mirror has good data, the page is set up to date
1796 * and things continue. If a good mirror can't be found, the original
1797 * bio end_io callback is called to indicate things have failed.
1798 */
1799struct io_failure_record {
1800 struct page *page;
1801 u64 start;
1802 u64 len;
1803 u64 logical;
1804 unsigned long bio_flags;
1805 int this_mirror;
1806 int failed_mirror;
1807 int in_validation;
1808};
1809
1810static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1811 int did_repair)
1812{
1813 int ret;
1814 int err = 0;
1815 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1816
1817 set_state_private(failure_tree, rec->start, 0);
1818 ret = clear_extent_bits(failure_tree, rec->start,
1819 rec->start + rec->len - 1,
1820 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1821 if (ret)
1822 err = ret;
1823
1824 if (did_repair) {
1825 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1826 rec->start + rec->len - 1,
1827 EXTENT_DAMAGED, GFP_NOFS);
1828 if (ret && !err)
1829 err = ret;
1830 }
1831
1832 kfree(rec);
1833 return err;
1834}
1835
1836static void repair_io_failure_callback(struct bio *bio, int err)
1837{
1838 complete(bio->bi_private);
1839}
1840
1841/*
1842 * this bypasses the standard btrfs submit functions deliberately, as
1843 * the standard behavior is to write all copies in a raid setup. here we only
1844 * want to write the one bad copy. so we do the mapping for ourselves and issue
1845 * submit_bio directly.
1846 * to avoid any synchonization issues, wait for the data after writing, which
1847 * actually prevents the read that triggered the error from finishing.
1848 * currently, there can be no more than two copies of every data bit. thus,
1849 * exactly one rewrite is required.
1850 */
1851int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1852 u64 length, u64 logical, struct page *page,
1853 int mirror_num)
1854{
1855 struct bio *bio;
1856 struct btrfs_device *dev;
1857 DECLARE_COMPLETION_ONSTACK(compl);
1858 u64 map_length = 0;
1859 u64 sector;
1860 struct btrfs_bio *bbio = NULL;
1861 int ret;
1862
1863 BUG_ON(!mirror_num);
1864
1865 bio = bio_alloc(GFP_NOFS, 1);
1866 if (!bio)
1867 return -EIO;
1868 bio->bi_private = &compl;
1869 bio->bi_end_io = repair_io_failure_callback;
1870 bio->bi_size = 0;
1871 map_length = length;
1872
1873 ret = btrfs_map_block(map_tree, WRITE, logical,
1874 &map_length, &bbio, mirror_num);
1875 if (ret) {
1876 bio_put(bio);
1877 return -EIO;
1878 }
1879 BUG_ON(mirror_num != bbio->mirror_num);
1880 sector = bbio->stripes[mirror_num-1].physical >> 9;
1881 bio->bi_sector = sector;
1882 dev = bbio->stripes[mirror_num-1].dev;
1883 kfree(bbio);
1884 if (!dev || !dev->bdev || !dev->writeable) {
1885 bio_put(bio);
1886 return -EIO;
1887 }
1888 bio->bi_bdev = dev->bdev;
1889 bio_add_page(bio, page, length, start-page_offset(page));
1890 submit_bio(WRITE_SYNC, bio);
1891 wait_for_completion(&compl);
1892
1893 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1894 /* try to remap that extent elsewhere? */
1895 bio_put(bio);
1896 return -EIO;
1897 }
1898
1899 printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
1900 "sector %llu)\n", page->mapping->host->i_ino, start,
1901 dev->name, sector);
1902
1903 bio_put(bio);
1904 return 0;
1905}
1906
1907/*
1908 * each time an IO finishes, we do a fast check in the IO failure tree
1909 * to see if we need to process or clean up an io_failure_record
1910 */
1911static int clean_io_failure(u64 start, struct page *page)
1912{
1913 u64 private;
1914 u64 private_failure;
1915 struct io_failure_record *failrec;
1916 struct btrfs_mapping_tree *map_tree;
1917 struct extent_state *state;
1918 int num_copies;
1919 int did_repair = 0;
1920 int ret;
1921 struct inode *inode = page->mapping->host;
1922
1923 private = 0;
1924 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1925 (u64)-1, 1, EXTENT_DIRTY, 0);
1926 if (!ret)
1927 return 0;
1928
1929 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
1930 &private_failure);
1931 if (ret)
1932 return 0;
1933
1934 failrec = (struct io_failure_record *)(unsigned long) private_failure;
1935 BUG_ON(!failrec->this_mirror);
1936
1937 if (failrec->in_validation) {
1938 /* there was no real error, just free the record */
1939 pr_debug("clean_io_failure: freeing dummy error at %llu\n",
1940 failrec->start);
1941 did_repair = 1;
1942 goto out;
1943 }
1944
1945 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1946 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1947 failrec->start,
1948 EXTENT_LOCKED);
1949 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1950
1951 if (state && state->start == failrec->start) {
1952 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
1953 num_copies = btrfs_num_copies(map_tree, failrec->logical,
1954 failrec->len);
1955 if (num_copies > 1) {
1956 ret = repair_io_failure(map_tree, start, failrec->len,
1957 failrec->logical, page,
1958 failrec->failed_mirror);
1959 did_repair = !ret;
1960 }
1961 }
1962
1963out:
1964 if (!ret)
1965 ret = free_io_failure(inode, failrec, did_repair);
1966
1967 return ret;
1968}
1969
1970/*
1971 * this is a generic handler for readpage errors (default
1972 * readpage_io_failed_hook). if other copies exist, read those and write back
1973 * good data to the failed position. does not investigate in remapping the
1974 * failed extent elsewhere, hoping the device will be smart enough to do this as
1975 * needed
1976 */
1977
1978static int bio_readpage_error(struct bio *failed_bio, struct page *page,
1979 u64 start, u64 end, int failed_mirror,
1980 struct extent_state *state)
1981{
1982 struct io_failure_record *failrec = NULL;
1983 u64 private;
1984 struct extent_map *em;
1985 struct inode *inode = page->mapping->host;
1986 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1987 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1988 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1989 struct bio *bio;
1990 int num_copies;
1991 int ret;
1992 int read_mode;
1993 u64 logical;
1994
1995 BUG_ON(failed_bio->bi_rw & REQ_WRITE);
1996
1997 ret = get_state_private(failure_tree, start, &private);
1998 if (ret) {
1999 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2000 if (!failrec)
2001 return -ENOMEM;
2002 failrec->start = start;
2003 failrec->len = end - start + 1;
2004 failrec->this_mirror = 0;
2005 failrec->bio_flags = 0;
2006 failrec->in_validation = 0;
2007
2008 read_lock(&em_tree->lock);
2009 em = lookup_extent_mapping(em_tree, start, failrec->len);
2010 if (!em) {
2011 read_unlock(&em_tree->lock);
2012 kfree(failrec);
2013 return -EIO;
2014 }
2015
2016 if (em->start > start || em->start + em->len < start) {
2017 free_extent_map(em);
2018 em = NULL;
2019 }
2020 read_unlock(&em_tree->lock);
2021
2022 if (!em || IS_ERR(em)) {
2023 kfree(failrec);
2024 return -EIO;
2025 }
2026 logical = start - em->start;
2027 logical = em->block_start + logical;
2028 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2029 logical = em->block_start;
2030 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2031 extent_set_compress_type(&failrec->bio_flags,
2032 em->compress_type);
2033 }
2034 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2035 "len=%llu\n", logical, start, failrec->len);
2036 failrec->logical = logical;
2037 free_extent_map(em);
2038
2039 /* set the bits in the private failure tree */
2040 ret = set_extent_bits(failure_tree, start, end,
2041 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2042 if (ret >= 0)
2043 ret = set_state_private(failure_tree, start,
2044 (u64)(unsigned long)failrec);
2045 /* set the bits in the inode's tree */
2046 if (ret >= 0)
2047 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2048 GFP_NOFS);
2049 if (ret < 0) {
2050 kfree(failrec);
2051 return ret;
2052 }
2053 } else {
2054 failrec = (struct io_failure_record *)(unsigned long)private;
2055 pr_debug("bio_readpage_error: (found) logical=%llu, "
2056 "start=%llu, len=%llu, validation=%d\n",
2057 failrec->logical, failrec->start, failrec->len,
2058 failrec->in_validation);
2059 /*
2060 * when data can be on disk more than twice, add to failrec here
2061 * (e.g. with a list for failed_mirror) to make
2062 * clean_io_failure() clean all those errors at once.
2063 */
2064 }
2065 num_copies = btrfs_num_copies(
2066 &BTRFS_I(inode)->root->fs_info->mapping_tree,
2067 failrec->logical, failrec->len);
2068 if (num_copies == 1) {
2069 /*
2070 * we only have a single copy of the data, so don't bother with
2071 * all the retry and error correction code that follows. no
2072 * matter what the error is, it is very likely to persist.
2073 */
2074 pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2075 "state=%p, num_copies=%d, next_mirror %d, "
2076 "failed_mirror %d\n", state, num_copies,
2077 failrec->this_mirror, failed_mirror);
2078 free_io_failure(inode, failrec, 0);
2079 return -EIO;
2080 }
2081
2082 if (!state) {
2083 spin_lock(&tree->lock);
2084 state = find_first_extent_bit_state(tree, failrec->start,
2085 EXTENT_LOCKED);
2086 if (state && state->start != failrec->start)
2087 state = NULL;
2088 spin_unlock(&tree->lock);
2089 }
2090
2091 /*
2092 * there are two premises:
2093 * a) deliver good data to the caller
2094 * b) correct the bad sectors on disk
2095 */
2096 if (failed_bio->bi_vcnt > 1) {
2097 /*
2098 * to fulfill b), we need to know the exact failing sectors, as
2099 * we don't want to rewrite any more than the failed ones. thus,
2100 * we need separate read requests for the failed bio
2101 *
2102 * if the following BUG_ON triggers, our validation request got
2103 * merged. we need separate requests for our algorithm to work.
2104 */
2105 BUG_ON(failrec->in_validation);
2106 failrec->in_validation = 1;
2107 failrec->this_mirror = failed_mirror;
2108 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2109 } else {
2110 /*
2111 * we're ready to fulfill a) and b) alongside. get a good copy
2112 * of the failed sector and if we succeed, we have setup
2113 * everything for repair_io_failure to do the rest for us.
2114 */
2115 if (failrec->in_validation) {
2116 BUG_ON(failrec->this_mirror != failed_mirror);
2117 failrec->in_validation = 0;
2118 failrec->this_mirror = 0;
2119 }
2120 failrec->failed_mirror = failed_mirror;
2121 failrec->this_mirror++;
2122 if (failrec->this_mirror == failed_mirror)
2123 failrec->this_mirror++;
2124 read_mode = READ_SYNC;
2125 }
2126
2127 if (!state || failrec->this_mirror > num_copies) {
2128 pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2129 "next_mirror %d, failed_mirror %d\n", state,
2130 num_copies, failrec->this_mirror, failed_mirror);
2131 free_io_failure(inode, failrec, 0);
2132 return -EIO;
2133 }
2134
2135 bio = bio_alloc(GFP_NOFS, 1);
2136 bio->bi_private = state;
2137 bio->bi_end_io = failed_bio->bi_end_io;
2138 bio->bi_sector = failrec->logical >> 9;
2139 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2140 bio->bi_size = 0;
2141
2142 bio_add_page(bio, page, failrec->len, start - page_offset(page));
2143
2144 pr_debug("bio_readpage_error: submitting new read[%#x] to "
2145 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2146 failrec->this_mirror, num_copies, failrec->in_validation);
2147
2148 tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
2149 failrec->bio_flags, 0);
2150 return 0;
2151}
2152
1602/* lots and lots of room for performance fixes in the end_bio funcs */ 2153/* lots and lots of room for performance fixes in the end_bio funcs */
1603 2154
1604/* 2155/*
@@ -1697,6 +2248,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1697 struct extent_state *cached = NULL; 2248 struct extent_state *cached = NULL;
1698 struct extent_state *state; 2249 struct extent_state *state;
1699 2250
2251 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2252 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2253 (long int)bio->bi_bdev);
1700 tree = &BTRFS_I(page->mapping->host)->io_tree; 2254 tree = &BTRFS_I(page->mapping->host)->io_tree;
1701 2255
1702 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2256 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1727,11 +2281,19 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1727 state); 2281 state);
1728 if (ret) 2282 if (ret)
1729 uptodate = 0; 2283 uptodate = 0;
2284 else
2285 clean_io_failure(start, page);
1730 } 2286 }
1731 if (!uptodate && tree->ops && 2287 if (!uptodate) {
1732 tree->ops->readpage_io_failed_hook) { 2288 u64 failed_mirror;
1733 ret = tree->ops->readpage_io_failed_hook(bio, page, 2289 failed_mirror = (u64)bio->bi_bdev;
1734 start, end, NULL); 2290 if (tree->ops && tree->ops->readpage_io_failed_hook)
2291 ret = tree->ops->readpage_io_failed_hook(
2292 bio, page, start, end,
2293 failed_mirror, state);
2294 else
2295 ret = bio_readpage_error(bio, page, start, end,
2296 failed_mirror, NULL);
1735 if (ret == 0) { 2297 if (ret == 0) {
1736 uptodate = 2298 uptodate =
1737 test_bit(BIO_UPTODATE, &bio->bi_flags); 2299 test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -1811,6 +2373,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1811 mirror_num, bio_flags, start); 2373 mirror_num, bio_flags, start);
1812 else 2374 else
1813 submit_bio(rw, bio); 2375 submit_bio(rw, bio);
2376
1814 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2377 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1815 ret = -EOPNOTSUPP; 2378 ret = -EOPNOTSUPP;
1816 bio_put(bio); 2379 bio_put(bio);
@@ -2076,16 +2639,16 @@ out:
2076} 2639}
2077 2640
2078int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2641int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2079 get_extent_t *get_extent) 2642 get_extent_t *get_extent, int mirror_num)
2080{ 2643{
2081 struct bio *bio = NULL; 2644 struct bio *bio = NULL;
2082 unsigned long bio_flags = 0; 2645 unsigned long bio_flags = 0;
2083 int ret; 2646 int ret;
2084 2647
2085 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2648 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2086 &bio_flags); 2649 &bio_flags);
2087 if (bio) 2650 if (bio)
2088 ret = submit_one_bio(READ, bio, 0, bio_flags); 2651 ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2089 return ret; 2652 return ret;
2090} 2653}
2091 2654
@@ -2136,6 +2699,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2136 int compressed; 2699 int compressed;
2137 int write_flags; 2700 int write_flags;
2138 unsigned long nr_written = 0; 2701 unsigned long nr_written = 0;
2702 bool fill_delalloc = true;
2139 2703
2140 if (wbc->sync_mode == WB_SYNC_ALL) 2704 if (wbc->sync_mode == WB_SYNC_ALL)
2141 write_flags = WRITE_SYNC; 2705 write_flags = WRITE_SYNC;
@@ -2145,6 +2709,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2145 trace___extent_writepage(page, inode, wbc); 2709 trace___extent_writepage(page, inode, wbc);
2146 2710
2147 WARN_ON(!PageLocked(page)); 2711 WARN_ON(!PageLocked(page));
2712
2713 ClearPageError(page);
2714
2148 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2715 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2149 if (page->index > end_index || 2716 if (page->index > end_index ||
2150 (page->index == end_index && !pg_offset)) { 2717 (page->index == end_index && !pg_offset)) {
@@ -2166,10 +2733,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2166 2733
2167 set_page_extent_mapped(page); 2734 set_page_extent_mapped(page);
2168 2735
2736 if (!tree->ops || !tree->ops->fill_delalloc)
2737 fill_delalloc = false;
2738
2169 delalloc_start = start; 2739 delalloc_start = start;
2170 delalloc_end = 0; 2740 delalloc_end = 0;
2171 page_started = 0; 2741 page_started = 0;
2172 if (!epd->extent_locked) { 2742 if (!epd->extent_locked && fill_delalloc) {
2173 u64 delalloc_to_write = 0; 2743 u64 delalloc_to_write = 0;
2174 /* 2744 /*
2175 * make sure the wbc mapping index is at least updated 2745 * make sure the wbc mapping index is at least updated
@@ -2421,10 +2991,16 @@ retry:
2421 * swizzled back from swapper_space to tmpfs file 2991 * swizzled back from swapper_space to tmpfs file
2422 * mapping 2992 * mapping
2423 */ 2993 */
2424 if (tree->ops && tree->ops->write_cache_pages_lock_hook) 2994 if (tree->ops &&
2425 tree->ops->write_cache_pages_lock_hook(page); 2995 tree->ops->write_cache_pages_lock_hook) {
2426 else 2996 tree->ops->write_cache_pages_lock_hook(page,
2427 lock_page(page); 2997 data, flush_fn);
2998 } else {
2999 if (!trylock_page(page)) {
3000 flush_fn(data);
3001 lock_page(page);
3002 }
3003 }
2428 3004
2429 if (unlikely(page->mapping != mapping)) { 3005 if (unlikely(page->mapping != mapping)) {
2430 unlock_page(page); 3006 unlock_page(page);
@@ -2926,7 +3502,7 @@ out:
2926 return ret; 3502 return ret;
2927} 3503}
2928 3504
2929static inline struct page *extent_buffer_page(struct extent_buffer *eb, 3505inline struct page *extent_buffer_page(struct extent_buffer *eb,
2930 unsigned long i) 3506 unsigned long i)
2931{ 3507{
2932 struct page *p; 3508 struct page *p;
@@ -2951,7 +3527,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2951 return p; 3527 return p;
2952} 3528}
2953 3529
2954static inline unsigned long num_extent_pages(u64 start, u64 len) 3530inline unsigned long num_extent_pages(u64 start, u64 len)
2955{ 3531{
2956 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 3532 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2957 (start >> PAGE_CACHE_SHIFT); 3533 (start >> PAGE_CACHE_SHIFT);
@@ -3204,6 +3780,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3204 PAGECACHE_TAG_DIRTY); 3780 PAGECACHE_TAG_DIRTY);
3205 } 3781 }
3206 spin_unlock_irq(&page->mapping->tree_lock); 3782 spin_unlock_irq(&page->mapping->tree_lock);
3783 ClearPageError(page);
3207 unlock_page(page); 3784 unlock_page(page);
3208 } 3785 }
3209 return 0; 3786 return 0;
@@ -3349,8 +3926,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3349} 3926}
3350 3927
3351int read_extent_buffer_pages(struct extent_io_tree *tree, 3928int read_extent_buffer_pages(struct extent_io_tree *tree,
3352 struct extent_buffer *eb, 3929 struct extent_buffer *eb, u64 start, int wait,
3353 u64 start, int wait,
3354 get_extent_t *get_extent, int mirror_num) 3930 get_extent_t *get_extent, int mirror_num)
3355{ 3931{
3356 unsigned long i; 3932 unsigned long i;
@@ -3386,7 +3962,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3386 num_pages = num_extent_pages(eb->start, eb->len); 3962 num_pages = num_extent_pages(eb->start, eb->len);
3387 for (i = start_i; i < num_pages; i++) { 3963 for (i = start_i; i < num_pages; i++) {
3388 page = extent_buffer_page(eb, i); 3964 page = extent_buffer_page(eb, i);
3389 if (!wait) { 3965 if (wait == WAIT_NONE) {
3390 if (!trylock_page(page)) 3966 if (!trylock_page(page))
3391 goto unlock_exit; 3967 goto unlock_exit;
3392 } else { 3968 } else {
@@ -3430,7 +4006,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3430 if (bio) 4006 if (bio)
3431 submit_one_bio(READ, bio, mirror_num, bio_flags); 4007 submit_one_bio(READ, bio, mirror_num, bio_flags);
3432 4008
3433 if (ret || !wait) 4009 if (ret || wait != WAIT_COMPLETE)
3434 return ret; 4010 return ret;
3435 4011
3436 for (i = start_i; i < num_pages; i++) { 4012 for (i = start_i; i < num_pages; i++) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7b2f0c3e792..feb9be0e23b 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -17,6 +17,8 @@
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12) 19#define EXTENT_FIRST_DELALLOC (1 << 12)
20#define EXTENT_NEED_WAIT (1 << 13)
21#define EXTENT_DAMAGED (1 << 14)
20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 22#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 23#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
22 24
@@ -32,6 +34,7 @@
32#define EXTENT_BUFFER_BLOCKING 1 34#define EXTENT_BUFFER_BLOCKING 1
33#define EXTENT_BUFFER_DIRTY 2 35#define EXTENT_BUFFER_DIRTY 2
34#define EXTENT_BUFFER_CORRUPT 3 36#define EXTENT_BUFFER_CORRUPT 3
37#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
35 38
36/* these are flags for extent_clear_unlock_delalloc */ 39/* these are flags for extent_clear_unlock_delalloc */
37#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 40#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -67,7 +70,7 @@ struct extent_io_ops {
67 unsigned long bio_flags); 70 unsigned long bio_flags);
68 int (*readpage_io_hook)(struct page *page, u64 start, u64 end); 71 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
69 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, 72 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
70 u64 start, u64 end, 73 u64 start, u64 end, u64 failed_mirror,
71 struct extent_state *state); 74 struct extent_state *state);
72 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, 75 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
73 u64 start, u64 end, 76 u64 start, u64 end,
@@ -85,7 +88,8 @@ struct extent_io_ops {
85 struct extent_state *other); 88 struct extent_state *other);
86 void (*split_extent_hook)(struct inode *inode, 89 void (*split_extent_hook)(struct inode *inode,
87 struct extent_state *orig, u64 split); 90 struct extent_state *orig, u64 split);
88 int (*write_cache_pages_lock_hook)(struct page *page); 91 int (*write_cache_pages_lock_hook)(struct page *page, void *data,
92 void (*flush_fn)(void *));
89}; 93};
90 94
91struct extent_io_tree { 95struct extent_io_tree {
@@ -185,7 +189,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
185int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 189int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
186 gfp_t mask); 190 gfp_t mask);
187int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 191int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
188 get_extent_t *get_extent); 192 get_extent_t *get_extent, int mirror_num);
189int __init extent_io_init(void); 193int __init extent_io_init(void);
190void extent_io_exit(void); 194void extent_io_exit(void);
191 195
@@ -214,6 +218,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
214 gfp_t mask); 218 gfp_t mask);
215int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 219int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
216 gfp_t mask); 220 gfp_t mask);
221int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
222 int bits, int clear_bits, gfp_t mask);
217int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 223int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
218 struct extent_state **cached_state, gfp_t mask); 224 struct extent_state **cached_state, gfp_t mask);
219int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 225int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -248,9 +254,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
248struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 254struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
249 u64 start, unsigned long len); 255 u64 start, unsigned long len);
250void free_extent_buffer(struct extent_buffer *eb); 256void free_extent_buffer(struct extent_buffer *eb);
257#define WAIT_NONE 0
258#define WAIT_COMPLETE 1
259#define WAIT_PAGE_LOCK 2
251int read_extent_buffer_pages(struct extent_io_tree *tree, 260int read_extent_buffer_pages(struct extent_io_tree *tree,
252 struct extent_buffer *eb, u64 start, int wait, 261 struct extent_buffer *eb, u64 start, int wait,
253 get_extent_t *get_extent, int mirror_num); 262 get_extent_t *get_extent, int mirror_num);
263unsigned long num_extent_pages(u64 start, u64 len);
264struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
254 265
255static inline void extent_buffer_get(struct extent_buffer *eb) 266static inline void extent_buffer_get(struct extent_buffer *eb)
256{ 267{
@@ -300,4 +311,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
300struct bio * 311struct bio *
301btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 312btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
302 gfp_t gfp_flags); 313 gfp_t gfp_flags);
314
315struct btrfs_mapping_tree;
316
317int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
318 u64 length, u64 logical, struct page *page,
319 int mirror_num);
303#endif 320#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a1cb7821bec..c7fb3a4247d 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
91 struct btrfs_csum_item *item; 91 struct btrfs_csum_item *item;
92 struct extent_buffer *leaf; 92 struct extent_buffer *leaf;
93 u64 csum_offset = 0; 93 u64 csum_offset = 0;
94 u16 csum_size = 94 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
95 btrfs_super_csum_size(&root->fs_info->super_copy);
96 int csums_in_item; 95 int csums_in_item;
97 96
98 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 97 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
162 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
163 u64 disk_bytenr; 162 u64 disk_bytenr;
164 u32 diff; 163 u32 diff;
165 u16 csum_size = 164 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
166 btrfs_super_csum_size(&root->fs_info->super_copy);
167 int ret; 165 int ret;
168 struct btrfs_path *path; 166 struct btrfs_path *path;
169 struct btrfs_csum_item *item = NULL; 167 struct btrfs_csum_item *item = NULL;
@@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
290 int ret; 288 int ret;
291 size_t size; 289 size_t size;
292 u64 csum_end; 290 u64 csum_end;
293 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); 291 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
294 292
295 path = btrfs_alloc_path(); 293 path = btrfs_alloc_path();
296 if (!path) 294 if (!path)
@@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
492 u64 bytenr, u64 len) 490 u64 bytenr, u64 len)
493{ 491{
494 struct extent_buffer *leaf; 492 struct extent_buffer *leaf;
495 u16 csum_size = 493 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
496 btrfs_super_csum_size(&root->fs_info->super_copy);
497 u64 csum_end; 494 u64 csum_end;
498 u64 end_byte = bytenr + len; 495 u64 end_byte = bytenr + len;
499 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; 496 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
@@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
549 u64 csum_end; 546 u64 csum_end;
550 struct extent_buffer *leaf; 547 struct extent_buffer *leaf;
551 int ret; 548 int ret;
552 u16 csum_size = 549 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
553 btrfs_super_csum_size(&root->fs_info->super_copy);
554 int blocksize_bits = root->fs_info->sb->s_blocksize_bits; 550 int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
555 551
556 root = root->fs_info->csum_root; 552 root = root->fs_info->csum_root;
@@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
676 struct btrfs_sector_sum *sector_sum; 672 struct btrfs_sector_sum *sector_sum;
677 u32 nritems; 673 u32 nritems;
678 u32 ins_size; 674 u32 ins_size;
679 u16 csum_size = 675 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
680 btrfs_super_csum_size(&root->fs_info->super_copy);
681 676
682 path = btrfs_alloc_path(); 677 path = btrfs_alloc_path();
683 if (!path) 678 if (!path)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e4e57d59edb..dafdfa059bf 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1069,6 +1069,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1069 int i; 1069 int i;
1070 unsigned long index = pos >> PAGE_CACHE_SHIFT; 1070 unsigned long index = pos >> PAGE_CACHE_SHIFT;
1071 struct inode *inode = fdentry(file)->d_inode; 1071 struct inode *inode = fdentry(file)->d_inode;
1072 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1072 int err = 0; 1073 int err = 0;
1073 int faili = 0; 1074 int faili = 0;
1074 u64 start_pos; 1075 u64 start_pos;
@@ -1080,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1080again: 1081again:
1081 for (i = 0; i < num_pages; i++) { 1082 for (i = 0; i < num_pages; i++) {
1082 pages[i] = find_or_create_page(inode->i_mapping, index + i, 1083 pages[i] = find_or_create_page(inode->i_mapping, index + i,
1083 GFP_NOFS); 1084 mask);
1084 if (!pages[i]) { 1085 if (!pages[i]) {
1085 faili = i - 1; 1086 faili = i - 1;
1086 err = -ENOMEM; 1087 err = -ENOMEM;
@@ -1615,10 +1616,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1615 goto out; 1616 goto out;
1616 } 1617 }
1617 1618
1618 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1619 if (ret)
1620 goto out;
1621
1622 locked_end = alloc_end - 1; 1619 locked_end = alloc_end - 1;
1623 while (1) { 1620 while (1) {
1624 struct btrfs_ordered_extent *ordered; 1621 struct btrfs_ordered_extent *ordered;
@@ -1664,11 +1661,27 @@ static long btrfs_fallocate(struct file *file, int mode,
1664 if (em->block_start == EXTENT_MAP_HOLE || 1661 if (em->block_start == EXTENT_MAP_HOLE ||
1665 (cur_offset >= inode->i_size && 1662 (cur_offset >= inode->i_size &&
1666 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 1663 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1664
1665 /*
1666 * Make sure we have enough space before we do the
1667 * allocation.
1668 */
1669 ret = btrfs_check_data_free_space(inode, last_byte -
1670 cur_offset);
1671 if (ret) {
1672 free_extent_map(em);
1673 break;
1674 }
1675
1667 ret = btrfs_prealloc_file_range(inode, mode, cur_offset, 1676 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1668 last_byte - cur_offset, 1677 last_byte - cur_offset,
1669 1 << inode->i_blkbits, 1678 1 << inode->i_blkbits,
1670 offset + len, 1679 offset + len,
1671 &alloc_hint); 1680 &alloc_hint);
1681
1682 /* Let go of our reservation. */
1683 btrfs_free_reserved_data_space(inode, last_byte -
1684 cur_offset);
1672 if (ret < 0) { 1685 if (ret < 0) {
1673 free_extent_map(em); 1686 free_extent_map(em);
1674 break; 1687 break;
@@ -1694,8 +1707,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1694 } 1707 }
1695 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 1708 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1696 &cached_state, GFP_NOFS); 1709 &cached_state, GFP_NOFS);
1697
1698 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1699out: 1710out:
1700 mutex_unlock(&inode->i_mutex); 1711 mutex_unlock(&inode->i_mutex);
1701 return ret; 1712 return ret;
@@ -1821,7 +1832,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
1821 switch (origin) { 1832 switch (origin) {
1822 case SEEK_END: 1833 case SEEK_END:
1823 case SEEK_CUR: 1834 case SEEK_CUR:
1824 offset = generic_file_llseek_unlocked(file, offset, origin); 1835 offset = generic_file_llseek(file, offset, origin);
1825 goto out; 1836 goto out;
1826 case SEEK_DATA: 1837 case SEEK_DATA:
1827 case SEEK_HOLE: 1838 case SEEK_HOLE:
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 41ac927401d..181760f9d2a 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -20,6 +20,7 @@
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/math64.h> 22#include <linux/math64.h>
23#include <linux/ratelimit.h>
23#include "ctree.h" 24#include "ctree.h"
24#include "free-space-cache.h" 25#include "free-space-cache.h"
25#include "transaction.h" 26#include "transaction.h"
@@ -84,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
84 *block_group, struct btrfs_path *path) 85 *block_group, struct btrfs_path *path)
85{ 86{
86 struct inode *inode = NULL; 87 struct inode *inode = NULL;
88 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
87 89
88 spin_lock(&block_group->lock); 90 spin_lock(&block_group->lock);
89 if (block_group->inode) 91 if (block_group->inode)
@@ -98,13 +100,14 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
98 return inode; 100 return inode;
99 101
100 spin_lock(&block_group->lock); 102 spin_lock(&block_group->lock);
101 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { 103 if (!((BTRFS_I(inode)->flags & flags) == flags)) {
102 printk(KERN_INFO "Old style space inode found, converting.\n"); 104 printk(KERN_INFO "Old style space inode found, converting.\n");
103 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; 105 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
106 BTRFS_INODE_NODATACOW;
104 block_group->disk_cache_state = BTRFS_DC_CLEAR; 107 block_group->disk_cache_state = BTRFS_DC_CLEAR;
105 } 108 }
106 109
107 if (!btrfs_fs_closing(root->fs_info)) { 110 if (!block_group->iref) {
108 block_group->inode = igrab(inode); 111 block_group->inode = igrab(inode);
109 block_group->iref = 1; 112 block_group->iref = 1;
110 } 113 }
@@ -122,12 +125,17 @@ int __create_free_space_inode(struct btrfs_root *root,
122 struct btrfs_free_space_header *header; 125 struct btrfs_free_space_header *header;
123 struct btrfs_inode_item *inode_item; 126 struct btrfs_inode_item *inode_item;
124 struct extent_buffer *leaf; 127 struct extent_buffer *leaf;
128 u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
125 int ret; 129 int ret;
126 130
127 ret = btrfs_insert_empty_inode(trans, root, path, ino); 131 ret = btrfs_insert_empty_inode(trans, root, path, ino);
128 if (ret) 132 if (ret)
129 return ret; 133 return ret;
130 134
135 /* We inline crc's for the free disk space cache */
136 if (ino != BTRFS_FREE_INO_OBJECTID)
137 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
138
131 leaf = path->nodes[0]; 139 leaf = path->nodes[0];
132 inode_item = btrfs_item_ptr(leaf, path->slots[0], 140 inode_item = btrfs_item_ptr(leaf, path->slots[0],
133 struct btrfs_inode_item); 141 struct btrfs_inode_item);
@@ -140,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root,
140 btrfs_set_inode_uid(leaf, inode_item, 0); 148 btrfs_set_inode_uid(leaf, inode_item, 0);
141 btrfs_set_inode_gid(leaf, inode_item, 0); 149 btrfs_set_inode_gid(leaf, inode_item, 0);
142 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 150 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
143 btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | 151 btrfs_set_inode_flags(leaf, inode_item, flags);
144 BTRFS_INODE_PREALLOC);
145 btrfs_set_inode_nlink(leaf, inode_item, 1); 152 btrfs_set_inode_nlink(leaf, inode_item, 1);
146 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 153 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
147 btrfs_set_inode_block_group(leaf, inode_item, offset); 154 btrfs_set_inode_block_group(leaf, inode_item, offset);
@@ -191,16 +198,24 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
191 struct inode *inode) 198 struct inode *inode)
192{ 199{
193 struct btrfs_block_rsv *rsv; 200 struct btrfs_block_rsv *rsv;
201 u64 needed_bytes;
194 loff_t oldsize; 202 loff_t oldsize;
195 int ret = 0; 203 int ret = 0;
196 204
197 rsv = trans->block_rsv; 205 rsv = trans->block_rsv;
198 trans->block_rsv = root->orphan_block_rsv; 206 trans->block_rsv = &root->fs_info->global_block_rsv;
199 ret = btrfs_block_rsv_check(trans, root, 207
200 root->orphan_block_rsv, 208 /* 1 for slack space, 1 for updating the inode */
201 0, 5); 209 needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
202 if (ret) 210 btrfs_calc_trans_metadata_size(root, 1);
203 return ret; 211
212 spin_lock(&trans->block_rsv->lock);
213 if (trans->block_rsv->reserved < needed_bytes) {
214 spin_unlock(&trans->block_rsv->lock);
215 trans->block_rsv = rsv;
216 return -ENOSPC;
217 }
218 spin_unlock(&trans->block_rsv->lock);
204 219
205 oldsize = i_size_read(inode); 220 oldsize = i_size_read(inode);
206 btrfs_i_size_write(inode, 0); 221 btrfs_i_size_write(inode, 0);
@@ -213,13 +228,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
213 ret = btrfs_truncate_inode_items(trans, root, inode, 228 ret = btrfs_truncate_inode_items(trans, root, inode,
214 0, BTRFS_EXTENT_DATA_KEY); 229 0, BTRFS_EXTENT_DATA_KEY);
215 230
216 trans->block_rsv = rsv;
217 if (ret) { 231 if (ret) {
232 trans->block_rsv = rsv;
218 WARN_ON(1); 233 WARN_ON(1);
219 return ret; 234 return ret;
220 } 235 }
221 236
222 ret = btrfs_update_inode(trans, root, inode); 237 ret = btrfs_update_inode(trans, root, inode);
238 trans->block_rsv = rsv;
239
223 return ret; 240 return ret;
224} 241}
225 242
@@ -242,26 +259,343 @@ static int readahead_cache(struct inode *inode)
242 return 0; 259 return 0;
243} 260}
244 261
262struct io_ctl {
263 void *cur, *orig;
264 struct page *page;
265 struct page **pages;
266 struct btrfs_root *root;
267 unsigned long size;
268 int index;
269 int num_pages;
270 unsigned check_crcs:1;
271};
272
273static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
274 struct btrfs_root *root)
275{
276 memset(io_ctl, 0, sizeof(struct io_ctl));
277 io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
278 PAGE_CACHE_SHIFT;
279 io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
280 GFP_NOFS);
281 if (!io_ctl->pages)
282 return -ENOMEM;
283 io_ctl->root = root;
284 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
285 io_ctl->check_crcs = 1;
286 return 0;
287}
288
289static void io_ctl_free(struct io_ctl *io_ctl)
290{
291 kfree(io_ctl->pages);
292}
293
294static void io_ctl_unmap_page(struct io_ctl *io_ctl)
295{
296 if (io_ctl->cur) {
297 kunmap(io_ctl->page);
298 io_ctl->cur = NULL;
299 io_ctl->orig = NULL;
300 }
301}
302
303static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
304{
305 WARN_ON(io_ctl->cur);
306 BUG_ON(io_ctl->index >= io_ctl->num_pages);
307 io_ctl->page = io_ctl->pages[io_ctl->index++];
308 io_ctl->cur = kmap(io_ctl->page);
309 io_ctl->orig = io_ctl->cur;
310 io_ctl->size = PAGE_CACHE_SIZE;
311 if (clear)
312 memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
313}
314
315static void io_ctl_drop_pages(struct io_ctl *io_ctl)
316{
317 int i;
318
319 io_ctl_unmap_page(io_ctl);
320
321 for (i = 0; i < io_ctl->num_pages; i++) {
322 ClearPageChecked(io_ctl->pages[i]);
323 unlock_page(io_ctl->pages[i]);
324 page_cache_release(io_ctl->pages[i]);
325 }
326}
327
328static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
329 int uptodate)
330{
331 struct page *page;
332 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
333 int i;
334
335 for (i = 0; i < io_ctl->num_pages; i++) {
336 page = find_or_create_page(inode->i_mapping, i, mask);
337 if (!page) {
338 io_ctl_drop_pages(io_ctl);
339 return -ENOMEM;
340 }
341 io_ctl->pages[i] = page;
342 if (uptodate && !PageUptodate(page)) {
343 btrfs_readpage(NULL, page);
344 lock_page(page);
345 if (!PageUptodate(page)) {
346 printk(KERN_ERR "btrfs: error reading free "
347 "space cache\n");
348 io_ctl_drop_pages(io_ctl);
349 return -EIO;
350 }
351 }
352 }
353
354 return 0;
355}
356
357static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
358{
359 u64 *val;
360
361 io_ctl_map_page(io_ctl, 1);
362
363 /*
364 * Skip the csum areas. If we don't check crcs then we just have a
365 * 64bit chunk at the front of the first page.
366 */
367 if (io_ctl->check_crcs) {
368 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
369 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
370 } else {
371 io_ctl->cur += sizeof(u64);
372 io_ctl->size -= sizeof(u64) * 2;
373 }
374
375 val = io_ctl->cur;
376 *val = cpu_to_le64(generation);
377 io_ctl->cur += sizeof(u64);
378}
379
380static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
381{
382 u64 *gen;
383
384 /*
385 * Skip the crc area. If we don't check crcs then we just have a 64bit
386 * chunk at the front of the first page.
387 */
388 if (io_ctl->check_crcs) {
389 io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
390 io_ctl->size -= sizeof(u64) +
391 (sizeof(u32) * io_ctl->num_pages);
392 } else {
393 io_ctl->cur += sizeof(u64);
394 io_ctl->size -= sizeof(u64) * 2;
395 }
396
397 gen = io_ctl->cur;
398 if (le64_to_cpu(*gen) != generation) {
399 printk_ratelimited(KERN_ERR "btrfs: space cache generation "
400 "(%Lu) does not match inode (%Lu)\n", *gen,
401 generation);
402 io_ctl_unmap_page(io_ctl);
403 return -EIO;
404 }
405 io_ctl->cur += sizeof(u64);
406 return 0;
407}
408
409static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
410{
411 u32 *tmp;
412 u32 crc = ~(u32)0;
413 unsigned offset = 0;
414
415 if (!io_ctl->check_crcs) {
416 io_ctl_unmap_page(io_ctl);
417 return;
418 }
419
420 if (index == 0)
421 offset = sizeof(u32) * io_ctl->num_pages;;
422
423 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
424 PAGE_CACHE_SIZE - offset);
425 btrfs_csum_final(crc, (char *)&crc);
426 io_ctl_unmap_page(io_ctl);
427 tmp = kmap(io_ctl->pages[0]);
428 tmp += index;
429 *tmp = crc;
430 kunmap(io_ctl->pages[0]);
431}
432
433static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
434{
435 u32 *tmp, val;
436 u32 crc = ~(u32)0;
437 unsigned offset = 0;
438
439 if (!io_ctl->check_crcs) {
440 io_ctl_map_page(io_ctl, 0);
441 return 0;
442 }
443
444 if (index == 0)
445 offset = sizeof(u32) * io_ctl->num_pages;
446
447 tmp = kmap(io_ctl->pages[0]);
448 tmp += index;
449 val = *tmp;
450 kunmap(io_ctl->pages[0]);
451
452 io_ctl_map_page(io_ctl, 0);
453 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
454 PAGE_CACHE_SIZE - offset);
455 btrfs_csum_final(crc, (char *)&crc);
456 if (val != crc) {
457 printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free "
458 "space cache\n");
459 io_ctl_unmap_page(io_ctl);
460 return -EIO;
461 }
462
463 return 0;
464}
465
466static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
467 void *bitmap)
468{
469 struct btrfs_free_space_entry *entry;
470
471 if (!io_ctl->cur)
472 return -ENOSPC;
473
474 entry = io_ctl->cur;
475 entry->offset = cpu_to_le64(offset);
476 entry->bytes = cpu_to_le64(bytes);
477 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
478 BTRFS_FREE_SPACE_EXTENT;
479 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
480 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
481
482 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
483 return 0;
484
485 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
486
487 /* No more pages to map */
488 if (io_ctl->index >= io_ctl->num_pages)
489 return 0;
490
491 /* map the next page */
492 io_ctl_map_page(io_ctl, 1);
493 return 0;
494}
495
496static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
497{
498 if (!io_ctl->cur)
499 return -ENOSPC;
500
501 /*
502 * If we aren't at the start of the current page, unmap this one and
503 * map the next one if there is any left.
504 */
505 if (io_ctl->cur != io_ctl->orig) {
506 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
507 if (io_ctl->index >= io_ctl->num_pages)
508 return -ENOSPC;
509 io_ctl_map_page(io_ctl, 0);
510 }
511
512 memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
513 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
514 if (io_ctl->index < io_ctl->num_pages)
515 io_ctl_map_page(io_ctl, 0);
516 return 0;
517}
518
519static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
520{
521 /*
522 * If we're not on the boundary we know we've modified the page and we
523 * need to crc the page.
524 */
525 if (io_ctl->cur != io_ctl->orig)
526 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
527 else
528 io_ctl_unmap_page(io_ctl);
529
530 while (io_ctl->index < io_ctl->num_pages) {
531 io_ctl_map_page(io_ctl, 1);
532 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
533 }
534}
535
536static int io_ctl_read_entry(struct io_ctl *io_ctl,
537 struct btrfs_free_space *entry, u8 *type)
538{
539 struct btrfs_free_space_entry *e;
540 int ret;
541
542 if (!io_ctl->cur) {
543 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
544 if (ret)
545 return ret;
546 }
547
548 e = io_ctl->cur;
549 entry->offset = le64_to_cpu(e->offset);
550 entry->bytes = le64_to_cpu(e->bytes);
551 *type = e->type;
552 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
553 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
554
555 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
556 return 0;
557
558 io_ctl_unmap_page(io_ctl);
559
560 return 0;
561}
562
563static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
564 struct btrfs_free_space *entry)
565{
566 int ret;
567
568 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
569 if (ret)
570 return ret;
571
572 memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
573 io_ctl_unmap_page(io_ctl);
574
575 return 0;
576}
577
245int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 578int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
246 struct btrfs_free_space_ctl *ctl, 579 struct btrfs_free_space_ctl *ctl,
247 struct btrfs_path *path, u64 offset) 580 struct btrfs_path *path, u64 offset)
248{ 581{
249 struct btrfs_free_space_header *header; 582 struct btrfs_free_space_header *header;
250 struct extent_buffer *leaf; 583 struct extent_buffer *leaf;
251 struct page *page; 584 struct io_ctl io_ctl;
252 struct btrfs_key key; 585 struct btrfs_key key;
586 struct btrfs_free_space *e, *n;
253 struct list_head bitmaps; 587 struct list_head bitmaps;
254 u64 num_entries; 588 u64 num_entries;
255 u64 num_bitmaps; 589 u64 num_bitmaps;
256 u64 generation; 590 u64 generation;
257 pgoff_t index = 0; 591 u8 type;
258 int ret = 0; 592 int ret = 0;
259 593
260 INIT_LIST_HEAD(&bitmaps); 594 INIT_LIST_HEAD(&bitmaps);
261 595
262 /* Nothing in the space cache, goodbye */ 596 /* Nothing in the space cache, goodbye */
263 if (!i_size_read(inode)) 597 if (!i_size_read(inode))
264 goto out; 598 return 0;
265 599
266 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 600 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
267 key.offset = offset; 601 key.offset = offset;
@@ -269,11 +603,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
269 603
270 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 604 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
271 if (ret < 0) 605 if (ret < 0)
272 goto out; 606 return 0;
273 else if (ret > 0) { 607 else if (ret > 0) {
274 btrfs_release_path(path); 608 btrfs_release_path(path);
275 ret = 0; 609 return 0;
276 goto out;
277 } 610 }
278 611
279 ret = -1; 612 ret = -1;
@@ -291,169 +624,102 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
291 " not match free space cache generation (%llu)\n", 624 " not match free space cache generation (%llu)\n",
292 (unsigned long long)BTRFS_I(inode)->generation, 625 (unsigned long long)BTRFS_I(inode)->generation,
293 (unsigned long long)generation); 626 (unsigned long long)generation);
294 goto out; 627 return 0;
295 } 628 }
296 629
297 if (!num_entries) 630 if (!num_entries)
298 goto out; 631 return 0;
299 632
633 io_ctl_init(&io_ctl, inode, root);
300 ret = readahead_cache(inode); 634 ret = readahead_cache(inode);
301 if (ret) 635 if (ret)
302 goto out; 636 goto out;
303 637
304 while (1) { 638 ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
305 struct btrfs_free_space_entry *entry; 639 if (ret)
306 struct btrfs_free_space *e; 640 goto out;
307 void *addr;
308 unsigned long offset = 0;
309 int need_loop = 0;
310 641
311 if (!num_entries && !num_bitmaps) 642 ret = io_ctl_check_crc(&io_ctl, 0);
312 break; 643 if (ret)
644 goto free_cache;
313 645
314 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 646 ret = io_ctl_check_generation(&io_ctl, generation);
315 if (!page) 647 if (ret)
648 goto free_cache;
649
650 while (num_entries) {
651 e = kmem_cache_zalloc(btrfs_free_space_cachep,
652 GFP_NOFS);
653 if (!e)
316 goto free_cache; 654 goto free_cache;
317 655
318 if (!PageUptodate(page)) { 656 ret = io_ctl_read_entry(&io_ctl, e, &type);
319 btrfs_readpage(NULL, page); 657 if (ret) {
320 lock_page(page); 658 kmem_cache_free(btrfs_free_space_cachep, e);
321 if (!PageUptodate(page)) { 659 goto free_cache;
322 unlock_page(page);
323 page_cache_release(page);
324 printk(KERN_ERR "btrfs: error reading free "
325 "space cache\n");
326 goto free_cache;
327 }
328 } 660 }
329 addr = kmap(page);
330 661
331 if (index == 0) { 662 if (!e->bytes) {
332 u64 *gen; 663 kmem_cache_free(btrfs_free_space_cachep, e);
664 goto free_cache;
665 }
333 666
334 /* 667 if (type == BTRFS_FREE_SPACE_EXTENT) {
335 * We put a bogus crc in the front of the first page in 668 spin_lock(&ctl->tree_lock);
336 * case old kernels try to mount a fs with the new 669 ret = link_free_space(ctl, e);
337 * format to make sure they discard the cache. 670 spin_unlock(&ctl->tree_lock);
338 */ 671 if (ret) {
339 addr += sizeof(u64); 672 printk(KERN_ERR "Duplicate entries in "
340 offset += sizeof(u64); 673 "free space cache, dumping\n");
341 674 kmem_cache_free(btrfs_free_space_cachep, e);
342 gen = addr;
343 if (*gen != BTRFS_I(inode)->generation) {
344 printk(KERN_ERR "btrfs: space cache generation"
345 " (%llu) does not match inode (%llu)\n",
346 (unsigned long long)*gen,
347 (unsigned long long)
348 BTRFS_I(inode)->generation);
349 kunmap(page);
350 unlock_page(page);
351 page_cache_release(page);
352 goto free_cache; 675 goto free_cache;
353 } 676 }
354 addr += sizeof(u64); 677 } else {
355 offset += sizeof(u64); 678 BUG_ON(!num_bitmaps);
356 } 679 num_bitmaps--;
357 entry = addr; 680 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
358 681 if (!e->bitmap) {
359 while (1) { 682 kmem_cache_free(
360 if (!num_entries) 683 btrfs_free_space_cachep, e);
361 break;
362
363 need_loop = 1;
364 e = kmem_cache_zalloc(btrfs_free_space_cachep,
365 GFP_NOFS);
366 if (!e) {
367 kunmap(page);
368 unlock_page(page);
369 page_cache_release(page);
370 goto free_cache; 684 goto free_cache;
371 } 685 }
372 686 spin_lock(&ctl->tree_lock);
373 e->offset = le64_to_cpu(entry->offset); 687 ret = link_free_space(ctl, e);
374 e->bytes = le64_to_cpu(entry->bytes); 688 ctl->total_bitmaps++;
375 if (!e->bytes) { 689 ctl->op->recalc_thresholds(ctl);
376 kunmap(page); 690 spin_unlock(&ctl->tree_lock);
691 if (ret) {
692 printk(KERN_ERR "Duplicate entries in "
693 "free space cache, dumping\n");
377 kmem_cache_free(btrfs_free_space_cachep, e); 694 kmem_cache_free(btrfs_free_space_cachep, e);
378 unlock_page(page);
379 page_cache_release(page);
380 goto free_cache; 695 goto free_cache;
381 } 696 }
382 697 list_add_tail(&e->list, &bitmaps);
383 if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
384 spin_lock(&ctl->tree_lock);
385 ret = link_free_space(ctl, e);
386 spin_unlock(&ctl->tree_lock);
387 if (ret) {
388 printk(KERN_ERR "Duplicate entries in "
389 "free space cache, dumping\n");
390 kunmap(page);
391 unlock_page(page);
392 page_cache_release(page);
393 goto free_cache;
394 }
395 } else {
396 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
397 if (!e->bitmap) {
398 kunmap(page);
399 kmem_cache_free(
400 btrfs_free_space_cachep, e);
401 unlock_page(page);
402 page_cache_release(page);
403 goto free_cache;
404 }
405 spin_lock(&ctl->tree_lock);
406 ret = link_free_space(ctl, e);
407 ctl->total_bitmaps++;
408 ctl->op->recalc_thresholds(ctl);
409 spin_unlock(&ctl->tree_lock);
410 if (ret) {
411 printk(KERN_ERR "Duplicate entries in "
412 "free space cache, dumping\n");
413 kunmap(page);
414 unlock_page(page);
415 page_cache_release(page);
416 goto free_cache;
417 }
418 list_add_tail(&e->list, &bitmaps);
419 }
420
421 num_entries--;
422 offset += sizeof(struct btrfs_free_space_entry);
423 if (offset + sizeof(struct btrfs_free_space_entry) >=
424 PAGE_CACHE_SIZE)
425 break;
426 entry++;
427 } 698 }
428 699
429 /* 700 num_entries--;
430 * We read an entry out of this page, we need to move on to the 701 }
431 * next page.
432 */
433 if (need_loop) {
434 kunmap(page);
435 goto next;
436 }
437 702
438 /* 703 io_ctl_unmap_page(&io_ctl);
439 * We add the bitmaps at the end of the entries in order that 704
440 * the bitmap entries are added to the cache. 705 /*
441 */ 706 * We add the bitmaps at the end of the entries in order that
442 e = list_entry(bitmaps.next, struct btrfs_free_space, list); 707 * the bitmap entries are added to the cache.
708 */
709 list_for_each_entry_safe(e, n, &bitmaps, list) {
443 list_del_init(&e->list); 710 list_del_init(&e->list);
444 memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); 711 ret = io_ctl_read_bitmap(&io_ctl, e);
445 kunmap(page); 712 if (ret)
446 num_bitmaps--; 713 goto free_cache;
447next:
448 unlock_page(page);
449 page_cache_release(page);
450 index++;
451 } 714 }
452 715
716 io_ctl_drop_pages(&io_ctl);
453 ret = 1; 717 ret = 1;
454out: 718out:
719 io_ctl_free(&io_ctl);
455 return ret; 720 return ret;
456free_cache: 721free_cache:
722 io_ctl_drop_pages(&io_ctl);
457 __btrfs_remove_free_space_cache(ctl); 723 __btrfs_remove_free_space_cache(ctl);
458 goto out; 724 goto out;
459} 725}
@@ -465,7 +731,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
465 struct btrfs_root *root = fs_info->tree_root; 731 struct btrfs_root *root = fs_info->tree_root;
466 struct inode *inode; 732 struct inode *inode;
467 struct btrfs_path *path; 733 struct btrfs_path *path;
468 int ret; 734 int ret = 0;
469 bool matched; 735 bool matched;
470 u64 used = btrfs_block_group_used(&block_group->item); 736 u64 used = btrfs_block_group_used(&block_group->item);
471 737
@@ -497,6 +763,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
497 return 0; 763 return 0;
498 } 764 }
499 765
766 /* We may have converted the inode and made the cache invalid. */
767 spin_lock(&block_group->lock);
768 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
769 spin_unlock(&block_group->lock);
770 goto out;
771 }
772 spin_unlock(&block_group->lock);
773
500 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, 774 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
501 path, block_group->key.objectid); 775 path, block_group->key.objectid);
502 btrfs_free_path(path); 776 btrfs_free_path(path);
@@ -530,6 +804,19 @@ out:
530 return ret; 804 return ret;
531} 805}
532 806
807/**
808 * __btrfs_write_out_cache - write out cached info to an inode
809 * @root - the root the inode belongs to
810 * @ctl - the free space cache we are going to write out
811 * @block_group - the block_group for this cache if it belongs to a block_group
812 * @trans - the trans handle
813 * @path - the path to use
814 * @offset - the offset for the key we'll insert
815 *
816 * This function writes out a free space cache struct to disk for quick recovery
817 * on mount. This will return 0 if it was successfull in writing the cache out,
818 * and -1 if it was not.
819 */
533int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 820int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
534 struct btrfs_free_space_ctl *ctl, 821 struct btrfs_free_space_ctl *ctl,
535 struct btrfs_block_group_cache *block_group, 822 struct btrfs_block_group_cache *block_group,
@@ -540,42 +827,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
540 struct extent_buffer *leaf; 827 struct extent_buffer *leaf;
541 struct rb_node *node; 828 struct rb_node *node;
542 struct list_head *pos, *n; 829 struct list_head *pos, *n;
543 struct page **pages;
544 struct page *page;
545 struct extent_state *cached_state = NULL; 830 struct extent_state *cached_state = NULL;
546 struct btrfs_free_cluster *cluster = NULL; 831 struct btrfs_free_cluster *cluster = NULL;
547 struct extent_io_tree *unpin = NULL; 832 struct extent_io_tree *unpin = NULL;
833 struct io_ctl io_ctl;
548 struct list_head bitmap_list; 834 struct list_head bitmap_list;
549 struct btrfs_key key; 835 struct btrfs_key key;
550 u64 start, end, len; 836 u64 start, end, len;
551 u64 bytes = 0;
552 u32 crc = ~(u32)0;
553 int index = 0, num_pages = 0;
554 int entries = 0; 837 int entries = 0;
555 int bitmaps = 0; 838 int bitmaps = 0;
556 int ret = -1; 839 int ret;
557 bool next_page = false; 840 int err = -1;
558 bool out_of_space = false;
559 841
560 INIT_LIST_HEAD(&bitmap_list); 842 INIT_LIST_HEAD(&bitmap_list);
561 843
562 node = rb_first(&ctl->free_space_offset);
563 if (!node)
564 return 0;
565
566 if (!i_size_read(inode)) 844 if (!i_size_read(inode))
567 return -1; 845 return -1;
568 846
569 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 847 io_ctl_init(&io_ctl, inode, root);
570 PAGE_CACHE_SHIFT;
571
572 filemap_write_and_wait(inode->i_mapping);
573 btrfs_wait_ordered_range(inode, inode->i_size &
574 ~(root->sectorsize - 1), (u64)-1);
575
576 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
577 if (!pages)
578 return -1;
579 848
580 /* Get the cluster for this block_group if it exists */ 849 /* Get the cluster for this block_group if it exists */
581 if (block_group && !list_empty(&block_group->cluster_list)) 850 if (block_group && !list_empty(&block_group->cluster_list))
@@ -589,30 +858,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
589 */ 858 */
590 unpin = root->fs_info->pinned_extents; 859 unpin = root->fs_info->pinned_extents;
591 860
592 /* 861 /* Lock all pages first so we can lock the extent safely. */
593 * Lock all pages first so we can lock the extent safely. 862 io_ctl_prepare_pages(&io_ctl, inode, 0);
594 *
595 * NOTE: Because we hold the ref the entire time we're going to write to
596 * the page find_get_page should never fail, so we don't do a check
597 * after find_get_page at this point. Just putting this here so people
598 * know and don't freak out.
599 */
600 while (index < num_pages) {
601 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
602 if (!page) {
603 int i;
604 863
605 for (i = 0; i < num_pages; i++) {
606 unlock_page(pages[i]);
607 page_cache_release(pages[i]);
608 }
609 goto out;
610 }
611 pages[index] = page;
612 index++;
613 }
614
615 index = 0;
616 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 864 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
617 0, &cached_state, GFP_NOFS); 865 0, &cached_state, GFP_NOFS);
618 866
@@ -623,189 +871,111 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
623 if (block_group) 871 if (block_group)
624 start = block_group->key.objectid; 872 start = block_group->key.objectid;
625 873
626 /* Write out the extent entries */ 874 node = rb_first(&ctl->free_space_offset);
627 do { 875 if (!node && cluster) {
628 struct btrfs_free_space_entry *entry; 876 node = rb_first(&cluster->root);
629 void *addr, *orig; 877 cluster = NULL;
630 unsigned long offset = 0; 878 }
631 879
632 next_page = false; 880 /* Make sure we can fit our crcs into the first page */
881 if (io_ctl.check_crcs &&
882 (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) {
883 WARN_ON(1);
884 goto out_nospc;
885 }
633 886
634 if (index >= num_pages) { 887 io_ctl_set_generation(&io_ctl, trans->transid);
635 out_of_space = true;
636 break;
637 }
638 888
639 page = pages[index]; 889 /* Write out the extent entries */
890 while (node) {
891 struct btrfs_free_space *e;
640 892
641 orig = addr = kmap(page); 893 e = rb_entry(node, struct btrfs_free_space, offset_index);
642 if (index == 0) { 894 entries++;
643 u64 *gen;
644 895
645 /* 896 ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes,
646 * We're going to put in a bogus crc for this page to 897 e->bitmap);
647 * make sure that old kernels who aren't aware of this 898 if (ret)
648 * format will be sure to discard the cache. 899 goto out_nospc;
649 */
650 addr += sizeof(u64);
651 offset += sizeof(u64);
652 900
653 gen = addr; 901 if (e->bitmap) {
654 *gen = trans->transid; 902 list_add_tail(&e->list, &bitmap_list);
655 addr += sizeof(u64); 903 bitmaps++;
656 offset += sizeof(u64);
657 } 904 }
658 entry = addr; 905 node = rb_next(node);
659 906 if (!node && cluster) {
660 memset(addr, 0, PAGE_CACHE_SIZE - offset); 907 node = rb_first(&cluster->root);
661 while (node && !next_page) { 908 cluster = NULL;
662 struct btrfs_free_space *e;
663
664 e = rb_entry(node, struct btrfs_free_space, offset_index);
665 entries++;
666
667 entry->offset = cpu_to_le64(e->offset);
668 entry->bytes = cpu_to_le64(e->bytes);
669 if (e->bitmap) {
670 entry->type = BTRFS_FREE_SPACE_BITMAP;
671 list_add_tail(&e->list, &bitmap_list);
672 bitmaps++;
673 } else {
674 entry->type = BTRFS_FREE_SPACE_EXTENT;
675 }
676 node = rb_next(node);
677 if (!node && cluster) {
678 node = rb_first(&cluster->root);
679 cluster = NULL;
680 }
681 offset += sizeof(struct btrfs_free_space_entry);
682 if (offset + sizeof(struct btrfs_free_space_entry) >=
683 PAGE_CACHE_SIZE)
684 next_page = true;
685 entry++;
686 } 909 }
910 }
687 911
688 /* 912 /*
689 * We want to add any pinned extents to our free space cache 913 * We want to add any pinned extents to our free space cache
690 * so we don't leak the space 914 * so we don't leak the space
691 */ 915 */
692 while (block_group && !next_page && 916 while (block_group && (start < block_group->key.objectid +
693 (start < block_group->key.objectid + 917 block_group->key.offset)) {
694 block_group->key.offset)) { 918 ret = find_first_extent_bit(unpin, start, &start, &end,
695 ret = find_first_extent_bit(unpin, start, &start, &end, 919 EXTENT_DIRTY);
696 EXTENT_DIRTY); 920 if (ret) {
697 if (ret) { 921 ret = 0;
698 ret = 0; 922 break;
699 break;
700 }
701
702 /* This pinned extent is out of our range */
703 if (start >= block_group->key.objectid +
704 block_group->key.offset)
705 break;
706
707 len = block_group->key.objectid +
708 block_group->key.offset - start;
709 len = min(len, end + 1 - start);
710
711 entries++;
712 entry->offset = cpu_to_le64(start);
713 entry->bytes = cpu_to_le64(len);
714 entry->type = BTRFS_FREE_SPACE_EXTENT;
715
716 start = end + 1;
717 offset += sizeof(struct btrfs_free_space_entry);
718 if (offset + sizeof(struct btrfs_free_space_entry) >=
719 PAGE_CACHE_SIZE)
720 next_page = true;
721 entry++;
722 } 923 }
723 924
724 /* Generate bogus crc value */ 925 /* This pinned extent is out of our range */
725 if (index == 0) { 926 if (start >= block_group->key.objectid +
726 u32 *tmp; 927 block_group->key.offset)
727 crc = btrfs_csum_data(root, orig + sizeof(u64), crc, 928 break;
728 PAGE_CACHE_SIZE - sizeof(u64));
729 btrfs_csum_final(crc, (char *)&crc);
730 crc++;
731 tmp = orig;
732 *tmp = crc;
733 }
734 929
735 kunmap(page); 930 len = block_group->key.objectid +
931 block_group->key.offset - start;
932 len = min(len, end + 1 - start);
736 933
737 bytes += PAGE_CACHE_SIZE; 934 entries++;
935 ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
936 if (ret)
937 goto out_nospc;
738 938
739 index++; 939 start = end + 1;
740 } while (node || next_page); 940 }
741 941
742 /* Write out the bitmaps */ 942 /* Write out the bitmaps */
743 list_for_each_safe(pos, n, &bitmap_list) { 943 list_for_each_safe(pos, n, &bitmap_list) {
744 void *addr;
745 struct btrfs_free_space *entry = 944 struct btrfs_free_space *entry =
746 list_entry(pos, struct btrfs_free_space, list); 945 list_entry(pos, struct btrfs_free_space, list);
747 946
748 if (index >= num_pages) { 947 ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap);
749 out_of_space = true; 948 if (ret)
750 break; 949 goto out_nospc;
751 }
752 page = pages[index];
753
754 addr = kmap(page);
755 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
756 kunmap(page);
757 bytes += PAGE_CACHE_SIZE;
758
759 list_del_init(&entry->list); 950 list_del_init(&entry->list);
760 index++;
761 }
762
763 if (out_of_space) {
764 btrfs_drop_pages(pages, num_pages);
765 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
766 i_size_read(inode) - 1, &cached_state,
767 GFP_NOFS);
768 ret = 0;
769 goto out;
770 } 951 }
771 952
772 /* Zero out the rest of the pages just to make sure */ 953 /* Zero out the rest of the pages just to make sure */
773 while (index < num_pages) { 954 io_ctl_zero_remaining_pages(&io_ctl);
774 void *addr;
775
776 page = pages[index];
777 addr = kmap(page);
778 memset(addr, 0, PAGE_CACHE_SIZE);
779 kunmap(page);
780 bytes += PAGE_CACHE_SIZE;
781 index++;
782 }
783 955
784 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, 956 ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
785 bytes, &cached_state); 957 0, i_size_read(inode), &cached_state);
786 btrfs_drop_pages(pages, num_pages); 958 io_ctl_drop_pages(&io_ctl);
787 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 959 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
788 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 960 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
789 961
790 if (ret) { 962 if (ret)
791 ret = 0;
792 goto out; 963 goto out;
793 }
794 964
795 BTRFS_I(inode)->generation = trans->transid;
796 965
797 filemap_write_and_wait(inode->i_mapping); 966 ret = filemap_write_and_wait(inode->i_mapping);
967 if (ret)
968 goto out;
798 969
799 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 970 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
800 key.offset = offset; 971 key.offset = offset;
801 key.type = 0; 972 key.type = 0;
802 973
803 ret = btrfs_search_slot(trans, root, &key, path, 1, 1); 974 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
804 if (ret < 0) { 975 if (ret < 0) {
805 ret = -1; 976 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
806 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 977 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
807 EXTENT_DIRTY | EXTENT_DELALLOC | 978 GFP_NOFS);
808 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
809 goto out; 979 goto out;
810 } 980 }
811 leaf = path->nodes[0]; 981 leaf = path->nodes[0];
@@ -816,15 +986,16 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
816 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 986 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
817 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 987 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
818 found_key.offset != offset) { 988 found_key.offset != offset) {
819 ret = -1; 989 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
820 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 990 inode->i_size - 1,
821 EXTENT_DIRTY | EXTENT_DELALLOC | 991 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
822 EXTENT_DO_ACCOUNTING, 0, 0, NULL, 992 NULL, GFP_NOFS);
823 GFP_NOFS);
824 btrfs_release_path(path); 993 btrfs_release_path(path);
825 goto out; 994 goto out;
826 } 995 }
827 } 996 }
997
998 BTRFS_I(inode)->generation = trans->transid;
828 header = btrfs_item_ptr(leaf, path->slots[0], 999 header = btrfs_item_ptr(leaf, path->slots[0],
829 struct btrfs_free_space_header); 1000 struct btrfs_free_space_header);
830 btrfs_set_free_space_entries(leaf, header, entries); 1001 btrfs_set_free_space_entries(leaf, header, entries);
@@ -833,16 +1004,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
833 btrfs_mark_buffer_dirty(leaf); 1004 btrfs_mark_buffer_dirty(leaf);
834 btrfs_release_path(path); 1005 btrfs_release_path(path);
835 1006
836 ret = 1; 1007 err = 0;
837
838out: 1008out:
839 kfree(pages); 1009 io_ctl_free(&io_ctl);
840 if (ret != 1) { 1010 if (err) {
841 invalidate_inode_pages2_range(inode->i_mapping, 0, index); 1011 invalidate_inode_pages2(inode->i_mapping);
842 BTRFS_I(inode)->generation = 0; 1012 BTRFS_I(inode)->generation = 0;
843 } 1013 }
844 btrfs_update_inode(trans, root, inode); 1014 btrfs_update_inode(trans, root, inode);
845 return ret; 1015 return err;
1016
1017out_nospc:
1018 list_for_each_safe(pos, n, &bitmap_list) {
1019 struct btrfs_free_space *entry =
1020 list_entry(pos, struct btrfs_free_space, list);
1021 list_del_init(&entry->list);
1022 }
1023 io_ctl_drop_pages(&io_ctl);
1024 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1025 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
1026 goto out;
846} 1027}
847 1028
848int btrfs_write_out_cache(struct btrfs_root *root, 1029int btrfs_write_out_cache(struct btrfs_root *root,
@@ -869,14 +1050,15 @@ int btrfs_write_out_cache(struct btrfs_root *root,
869 1050
870 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, 1051 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
871 path, block_group->key.objectid); 1052 path, block_group->key.objectid);
872 if (ret < 0) { 1053 if (ret) {
873 spin_lock(&block_group->lock); 1054 spin_lock(&block_group->lock);
874 block_group->disk_cache_state = BTRFS_DC_ERROR; 1055 block_group->disk_cache_state = BTRFS_DC_ERROR;
875 spin_unlock(&block_group->lock); 1056 spin_unlock(&block_group->lock);
876 ret = 0; 1057 ret = 0;
877 1058#ifdef DEBUG
878 printk(KERN_ERR "btrfs: failed to write free space cace " 1059 printk(KERN_ERR "btrfs: failed to write free space cace "
879 "for block group %llu\n", block_group->key.objectid); 1060 "for block group %llu\n", block_group->key.objectid);
1061#endif
880 } 1062 }
881 1063
882 iput(inode); 1064 iput(inode);
@@ -1701,6 +1883,7 @@ again:
1701 ctl->total_bitmaps--; 1883 ctl->total_bitmaps--;
1702 } 1884 }
1703 kmem_cache_free(btrfs_free_space_cachep, info); 1885 kmem_cache_free(btrfs_free_space_cachep, info);
1886 ret = 0;
1704 goto out_lock; 1887 goto out_lock;
1705 } 1888 }
1706 1889
@@ -1708,7 +1891,8 @@ again:
1708 unlink_free_space(ctl, info); 1891 unlink_free_space(ctl, info);
1709 info->offset += bytes; 1892 info->offset += bytes;
1710 info->bytes -= bytes; 1893 info->bytes -= bytes;
1711 link_free_space(ctl, info); 1894 ret = link_free_space(ctl, info);
1895 WARN_ON(ret);
1712 goto out_lock; 1896 goto out_lock;
1713 } 1897 }
1714 1898
@@ -2472,9 +2656,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2472 spin_unlock(&ctl->tree_lock); 2656 spin_unlock(&ctl->tree_lock);
2473 2657
2474 if (bytes >= minlen) { 2658 if (bytes >= minlen) {
2475 int update_ret; 2659 struct btrfs_space_info *space_info;
2476 update_ret = btrfs_update_reserved_bytes(block_group, 2660 int update = 0;
2477 bytes, 1, 1); 2661
2662 space_info = block_group->space_info;
2663 spin_lock(&space_info->lock);
2664 spin_lock(&block_group->lock);
2665 if (!block_group->ro) {
2666 block_group->reserved += bytes;
2667 space_info->bytes_reserved += bytes;
2668 update = 1;
2669 }
2670 spin_unlock(&block_group->lock);
2671 spin_unlock(&space_info->lock);
2478 2672
2479 ret = btrfs_error_discard_extent(fs_info->extent_root, 2673 ret = btrfs_error_discard_extent(fs_info->extent_root,
2480 start, 2674 start,
@@ -2482,9 +2676,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2482 &actually_trimmed); 2676 &actually_trimmed);
2483 2677
2484 btrfs_add_free_space(block_group, start, bytes); 2678 btrfs_add_free_space(block_group, start, bytes);
2485 if (!update_ret) 2679 if (update) {
2486 btrfs_update_reserved_bytes(block_group, 2680 spin_lock(&space_info->lock);
2487 bytes, 0, 1); 2681 spin_lock(&block_group->lock);
2682 if (block_group->ro)
2683 space_info->bytes_readonly += bytes;
2684 block_group->reserved -= bytes;
2685 space_info->bytes_reserved -= bytes;
2686 spin_unlock(&space_info->lock);
2687 spin_unlock(&block_group->lock);
2688 }
2488 2689
2489 if (ret) 2690 if (ret)
2490 break; 2691 break;
@@ -2643,9 +2844,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
2643 return 0; 2844 return 0;
2644 2845
2645 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); 2846 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
2646 if (ret < 0) 2847 if (ret) {
2848 btrfs_delalloc_release_metadata(inode, inode->i_size);
2849#ifdef DEBUG
2647 printk(KERN_ERR "btrfs: failed to write free ino cache " 2850 printk(KERN_ERR "btrfs: failed to write free ino cache "
2648 "for root %llu\n", root->root_key.objectid); 2851 "for root %llu\n", root->root_key.objectid);
2852#endif
2853 }
2649 2854
2650 iput(inode); 2855 iput(inode);
2651 return ret; 2856 return ret;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b4087e0fa87..f8962a957d6 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
398 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; 398 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
399 struct btrfs_path *path; 399 struct btrfs_path *path;
400 struct inode *inode; 400 struct inode *inode;
401 struct btrfs_block_rsv *rsv;
402 u64 num_bytes;
401 u64 alloc_hint = 0; 403 u64 alloc_hint = 0;
402 int ret; 404 int ret;
403 int prealloc; 405 int prealloc;
@@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
421 if (!path) 423 if (!path)
422 return -ENOMEM; 424 return -ENOMEM;
423 425
426 rsv = trans->block_rsv;
427 trans->block_rsv = &root->fs_info->trans_block_rsv;
428
429 num_bytes = trans->bytes_reserved;
430 /*
431 * 1 item for inode item insertion if need
432 * 3 items for inode item update (in the worst case)
433 * 1 item for free space object
434 * 3 items for pre-allocation
435 */
436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
437 ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv,
438 trans->bytes_reserved);
439 if (ret)
440 goto out;
424again: 441again:
425 inode = lookup_free_ino_inode(root, path); 442 inode = lookup_free_ino_inode(root, path);
426 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 443 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
427 ret = PTR_ERR(inode); 444 ret = PTR_ERR(inode);
428 goto out; 445 goto out_release;
429 } 446 }
430 447
431 if (IS_ERR(inode)) { 448 if (IS_ERR(inode)) {
@@ -434,7 +451,7 @@ again:
434 451
435 ret = create_free_ino_inode(root, trans, path); 452 ret = create_free_ino_inode(root, trans, path);
436 if (ret) 453 if (ret)
437 goto out; 454 goto out_release;
438 goto again; 455 goto again;
439 } 456 }
440 457
@@ -465,21 +482,26 @@ again:
465 /* Just to make sure we have enough space */ 482 /* Just to make sure we have enough space */
466 prealloc += 8 * PAGE_CACHE_SIZE; 483 prealloc += 8 * PAGE_CACHE_SIZE;
467 484
468 ret = btrfs_check_data_free_space(inode, prealloc); 485 ret = btrfs_delalloc_reserve_space(inode, prealloc);
469 if (ret) 486 if (ret)
470 goto out_put; 487 goto out_put;
471 488
472 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, 489 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
473 prealloc, prealloc, &alloc_hint); 490 prealloc, prealloc, &alloc_hint);
474 if (ret) 491 if (ret) {
492 btrfs_delalloc_release_space(inode, prealloc);
475 goto out_put; 493 goto out_put;
494 }
476 btrfs_free_reserved_data_space(inode, prealloc); 495 btrfs_free_reserved_data_space(inode, prealloc);
477 496
497 ret = btrfs_write_out_ino_cache(root, trans, path);
478out_put: 498out_put:
479 iput(inode); 499 iput(inode);
500out_release:
501 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
480out: 502out:
481 if (ret == 0) 503 trans->block_rsv = rsv;
482 ret = btrfs_write_out_ino_cache(root, trans, path); 504 trans->bytes_reserved = num_bytes;
483 505
484 btrfs_free_path(path); 506 btrfs_free_path(path);
485 return ret; 507 return ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b2d004ad66a..116ab67a06d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -45,10 +45,10 @@
45#include "btrfs_inode.h" 45#include "btrfs_inode.h"
46#include "ioctl.h" 46#include "ioctl.h"
47#include "print-tree.h" 47#include "print-tree.h"
48#include "volumes.h"
49#include "ordered-data.h" 48#include "ordered-data.h"
50#include "xattr.h" 49#include "xattr.h"
51#include "tree-log.h" 50#include "tree-log.h"
51#include "volumes.h"
52#include "compression.h" 52#include "compression.h"
53#include "locking.h" 53#include "locking.h"
54#include "free-space-cache.h" 54#include "free-space-cache.h"
@@ -93,6 +93,8 @@ static noinline int cow_file_range(struct inode *inode,
93 struct page *locked_page, 93 struct page *locked_page,
94 u64 start, u64 end, int *page_started, 94 u64 start, u64 end, int *page_started,
95 unsigned long *nr_written, int unlock); 95 unsigned long *nr_written, int unlock);
96static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
97 struct btrfs_root *root, struct inode *inode);
96 98
97static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 99static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
98 struct inode *inode, struct inode *dir, 100 struct inode *inode, struct inode *dir,
@@ -393,7 +395,10 @@ again:
393 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { 395 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
394 WARN_ON(pages); 396 WARN_ON(pages);
395 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 397 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
396 BUG_ON(!pages); 398 if (!pages) {
399 /* just bail out to the uncompressed code */
400 goto cont;
401 }
397 402
398 if (BTRFS_I(inode)->force_compress) 403 if (BTRFS_I(inode)->force_compress)
399 compress_type = BTRFS_I(inode)->force_compress; 404 compress_type = BTRFS_I(inode)->force_compress;
@@ -424,6 +429,7 @@ again:
424 will_compress = 1; 429 will_compress = 1;
425 } 430 }
426 } 431 }
432cont:
427 if (start == 0) { 433 if (start == 0) {
428 trans = btrfs_join_transaction(root); 434 trans = btrfs_join_transaction(root);
429 BUG_ON(IS_ERR(trans)); 435 BUG_ON(IS_ERR(trans));
@@ -820,7 +826,7 @@ static noinline int cow_file_range(struct inode *inode,
820 } 826 }
821 827
822 BUG_ON(disk_num_bytes > 828 BUG_ON(disk_num_bytes >
823 btrfs_super_total_bytes(&root->fs_info->super_copy)); 829 btrfs_super_total_bytes(root->fs_info->super_copy));
824 830
825 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); 831 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
826 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 832 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
@@ -1737,7 +1743,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1737 trans = btrfs_join_transaction(root); 1743 trans = btrfs_join_transaction(root);
1738 BUG_ON(IS_ERR(trans)); 1744 BUG_ON(IS_ERR(trans));
1739 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1745 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1740 ret = btrfs_update_inode(trans, root, inode); 1746 ret = btrfs_update_inode_fallback(trans, root, inode);
1741 BUG_ON(ret); 1747 BUG_ON(ret);
1742 } 1748 }
1743 goto out; 1749 goto out;
@@ -1787,17 +1793,17 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1787 1793
1788 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1794 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1789 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1795 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1790 ret = btrfs_update_inode(trans, root, inode); 1796 ret = btrfs_update_inode_fallback(trans, root, inode);
1791 BUG_ON(ret); 1797 BUG_ON(ret);
1792 } 1798 }
1793 ret = 0; 1799 ret = 0;
1794out: 1800out:
1795 if (nolock) { 1801 if (root != root->fs_info->tree_root)
1796 if (trans)
1797 btrfs_end_transaction_nolock(trans, root);
1798 } else {
1799 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1802 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1800 if (trans) 1803 if (trans) {
1804 if (nolock)
1805 btrfs_end_transaction_nolock(trans, root);
1806 else
1801 btrfs_end_transaction(trans, root); 1807 btrfs_end_transaction(trans, root);
1802 } 1808 }
1803 1809
@@ -1819,153 +1825,9 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1819} 1825}
1820 1826
1821/* 1827/*
1822 * When IO fails, either with EIO or csum verification fails, we
1823 * try other mirrors that might have a good copy of the data. This
1824 * io_failure_record is used to record state as we go through all the
1825 * mirrors. If another mirror has good data, the page is set up to date
1826 * and things continue. If a good mirror can't be found, the original
1827 * bio end_io callback is called to indicate things have failed.
1828 */
1829struct io_failure_record {
1830 struct page *page;
1831 u64 start;
1832 u64 len;
1833 u64 logical;
1834 unsigned long bio_flags;
1835 int last_mirror;
1836};
1837
1838static int btrfs_io_failed_hook(struct bio *failed_bio,
1839 struct page *page, u64 start, u64 end,
1840 struct extent_state *state)
1841{
1842 struct io_failure_record *failrec = NULL;
1843 u64 private;
1844 struct extent_map *em;
1845 struct inode *inode = page->mapping->host;
1846 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1847 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1848 struct bio *bio;
1849 int num_copies;
1850 int ret;
1851 int rw;
1852 u64 logical;
1853
1854 ret = get_state_private(failure_tree, start, &private);
1855 if (ret) {
1856 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
1857 if (!failrec)
1858 return -ENOMEM;
1859 failrec->start = start;
1860 failrec->len = end - start + 1;
1861 failrec->last_mirror = 0;
1862 failrec->bio_flags = 0;
1863
1864 read_lock(&em_tree->lock);
1865 em = lookup_extent_mapping(em_tree, start, failrec->len);
1866 if (em->start > start || em->start + em->len < start) {
1867 free_extent_map(em);
1868 em = NULL;
1869 }
1870 read_unlock(&em_tree->lock);
1871
1872 if (IS_ERR_OR_NULL(em)) {
1873 kfree(failrec);
1874 return -EIO;
1875 }
1876 logical = start - em->start;
1877 logical = em->block_start + logical;
1878 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1879 logical = em->block_start;
1880 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1881 extent_set_compress_type(&failrec->bio_flags,
1882 em->compress_type);
1883 }
1884 failrec->logical = logical;
1885 free_extent_map(em);
1886 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
1887 EXTENT_DIRTY, GFP_NOFS);
1888 set_state_private(failure_tree, start,
1889 (u64)(unsigned long)failrec);
1890 } else {
1891 failrec = (struct io_failure_record *)(unsigned long)private;
1892 }
1893 num_copies = btrfs_num_copies(
1894 &BTRFS_I(inode)->root->fs_info->mapping_tree,
1895 failrec->logical, failrec->len);
1896 failrec->last_mirror++;
1897 if (!state) {
1898 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1899 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1900 failrec->start,
1901 EXTENT_LOCKED);
1902 if (state && state->start != failrec->start)
1903 state = NULL;
1904 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1905 }
1906 if (!state || failrec->last_mirror > num_copies) {
1907 set_state_private(failure_tree, failrec->start, 0);
1908 clear_extent_bits(failure_tree, failrec->start,
1909 failrec->start + failrec->len - 1,
1910 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1911 kfree(failrec);
1912 return -EIO;
1913 }
1914 bio = bio_alloc(GFP_NOFS, 1);
1915 bio->bi_private = state;
1916 bio->bi_end_io = failed_bio->bi_end_io;
1917 bio->bi_sector = failrec->logical >> 9;
1918 bio->bi_bdev = failed_bio->bi_bdev;
1919 bio->bi_size = 0;
1920
1921 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1922 if (failed_bio->bi_rw & REQ_WRITE)
1923 rw = WRITE;
1924 else
1925 rw = READ;
1926
1927 ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1928 failrec->last_mirror,
1929 failrec->bio_flags, 0);
1930 return ret;
1931}
1932
1933/*
1934 * each time an IO finishes, we do a fast check in the IO failure tree
1935 * to see if we need to process or clean up an io_failure_record
1936 */
1937static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1938{
1939 u64 private;
1940 u64 private_failure;
1941 struct io_failure_record *failure;
1942 int ret;
1943
1944 private = 0;
1945 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1946 (u64)-1, 1, EXTENT_DIRTY, 0)) {
1947 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1948 start, &private_failure);
1949 if (ret == 0) {
1950 failure = (struct io_failure_record *)(unsigned long)
1951 private_failure;
1952 set_state_private(&BTRFS_I(inode)->io_failure_tree,
1953 failure->start, 0);
1954 clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
1955 failure->start,
1956 failure->start + failure->len - 1,
1957 EXTENT_DIRTY | EXTENT_LOCKED,
1958 GFP_NOFS);
1959 kfree(failure);
1960 }
1961 }
1962 return 0;
1963}
1964
1965/*
1966 * when reads are done, we need to check csums to verify the data is correct 1828 * when reads are done, we need to check csums to verify the data is correct
1967 * if there's a match, we allow the bio to finish. If not, we go through 1829 * if there's a match, we allow the bio to finish. If not, the code in
1968 * the io_failure_record routines to find good copies 1830 * extent_io.c will try to find good copies for us.
1969 */ 1831 */
1970static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 1832static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1971 struct extent_state *state) 1833 struct extent_state *state)
@@ -2011,10 +1873,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
2011 1873
2012 kunmap_atomic(kaddr, KM_USER0); 1874 kunmap_atomic(kaddr, KM_USER0);
2013good: 1875good:
2014 /* if the io failure tree for this inode is non-empty,
2015 * check to see if we've recovered from a failed IO
2016 */
2017 btrfs_clean_io_failures(inode, start);
2018 return 0; 1876 return 0;
2019 1877
2020zeroit: 1878zeroit:
@@ -2079,89 +1937,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2079 up_read(&root->fs_info->cleanup_work_sem); 1937 up_read(&root->fs_info->cleanup_work_sem);
2080} 1938}
2081 1939
2082/*
2083 * calculate extra metadata reservation when snapshotting a subvolume
2084 * contains orphan files.
2085 */
2086void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2087 struct btrfs_pending_snapshot *pending,
2088 u64 *bytes_to_reserve)
2089{
2090 struct btrfs_root *root;
2091 struct btrfs_block_rsv *block_rsv;
2092 u64 num_bytes;
2093 int index;
2094
2095 root = pending->root;
2096 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2097 return;
2098
2099 block_rsv = root->orphan_block_rsv;
2100
2101 /* orphan block reservation for the snapshot */
2102 num_bytes = block_rsv->size;
2103
2104 /*
2105 * after the snapshot is created, COWing tree blocks may use more
2106 * space than it frees. So we should make sure there is enough
2107 * reserved space.
2108 */
2109 index = trans->transid & 0x1;
2110 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2111 num_bytes += block_rsv->size -
2112 (block_rsv->reserved + block_rsv->freed[index]);
2113 }
2114
2115 *bytes_to_reserve += num_bytes;
2116}
2117
2118void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2119 struct btrfs_pending_snapshot *pending)
2120{
2121 struct btrfs_root *root = pending->root;
2122 struct btrfs_root *snap = pending->snap;
2123 struct btrfs_block_rsv *block_rsv;
2124 u64 num_bytes;
2125 int index;
2126 int ret;
2127
2128 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2129 return;
2130
2131 /* refill source subvolume's orphan block reservation */
2132 block_rsv = root->orphan_block_rsv;
2133 index = trans->transid & 0x1;
2134 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2135 num_bytes = block_rsv->size -
2136 (block_rsv->reserved + block_rsv->freed[index]);
2137 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2138 root->orphan_block_rsv,
2139 num_bytes);
2140 BUG_ON(ret);
2141 }
2142
2143 /* setup orphan block reservation for the snapshot */
2144 block_rsv = btrfs_alloc_block_rsv(snap);
2145 BUG_ON(!block_rsv);
2146
2147 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2148 snap->orphan_block_rsv = block_rsv;
2149
2150 num_bytes = root->orphan_block_rsv->size;
2151 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2152 block_rsv, num_bytes);
2153 BUG_ON(ret);
2154
2155#if 0
2156 /* insert orphan item for the snapshot */
2157 WARN_ON(!root->orphan_item_inserted);
2158 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2159 snap->root_key.objectid);
2160 BUG_ON(ret);
2161 snap->orphan_item_inserted = 1;
2162#endif
2163}
2164
2165enum btrfs_orphan_cleanup_state { 1940enum btrfs_orphan_cleanup_state {
2166 ORPHAN_CLEANUP_STARTED = 1, 1941 ORPHAN_CLEANUP_STARTED = 1,
2167 ORPHAN_CLEANUP_DONE = 2, 1942 ORPHAN_CLEANUP_DONE = 2,
@@ -2247,9 +2022,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2247 } 2022 }
2248 spin_unlock(&root->orphan_lock); 2023 spin_unlock(&root->orphan_lock);
2249 2024
2250 if (block_rsv)
2251 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2252
2253 /* grab metadata reservation from transaction handle */ 2025 /* grab metadata reservation from transaction handle */
2254 if (reserve) { 2026 if (reserve) {
2255 ret = btrfs_orphan_reserve_metadata(trans, inode); 2027 ret = btrfs_orphan_reserve_metadata(trans, inode);
@@ -2316,6 +2088,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2316 struct btrfs_key key, found_key; 2088 struct btrfs_key key, found_key;
2317 struct btrfs_trans_handle *trans; 2089 struct btrfs_trans_handle *trans;
2318 struct inode *inode; 2090 struct inode *inode;
2091 u64 last_objectid = 0;
2319 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2092 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2320 2093
2321 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2094 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
@@ -2367,41 +2140,49 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2367 * crossing root thing. we store the inode number in the 2140 * crossing root thing. we store the inode number in the
2368 * offset of the orphan item. 2141 * offset of the orphan item.
2369 */ 2142 */
2143
2144 if (found_key.offset == last_objectid) {
2145 printk(KERN_ERR "btrfs: Error removing orphan entry, "
2146 "stopping orphan cleanup\n");
2147 ret = -EINVAL;
2148 goto out;
2149 }
2150
2151 last_objectid = found_key.offset;
2152
2370 found_key.objectid = found_key.offset; 2153 found_key.objectid = found_key.offset;
2371 found_key.type = BTRFS_INODE_ITEM_KEY; 2154 found_key.type = BTRFS_INODE_ITEM_KEY;
2372 found_key.offset = 0; 2155 found_key.offset = 0;
2373 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2156 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2374 if (IS_ERR(inode)) { 2157 ret = PTR_RET(inode);
2375 ret = PTR_ERR(inode); 2158 if (ret && ret != -ESTALE)
2376 goto out; 2159 goto out;
2377 }
2378 2160
2379 /* 2161 /*
2380 * add this inode to the orphan list so btrfs_orphan_del does 2162 * Inode is already gone but the orphan item is still there,
2381 * the proper thing when we hit it 2163 * kill the orphan item.
2382 */ 2164 */
2383 spin_lock(&root->orphan_lock); 2165 if (ret == -ESTALE) {
2384 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2166 trans = btrfs_start_transaction(root, 1);
2385 spin_unlock(&root->orphan_lock);
2386
2387 /*
2388 * if this is a bad inode, means we actually succeeded in
2389 * removing the inode, but not the orphan record, which means
2390 * we need to manually delete the orphan since iput will just
2391 * do a destroy_inode
2392 */
2393 if (is_bad_inode(inode)) {
2394 trans = btrfs_start_transaction(root, 0);
2395 if (IS_ERR(trans)) { 2167 if (IS_ERR(trans)) {
2396 ret = PTR_ERR(trans); 2168 ret = PTR_ERR(trans);
2397 goto out; 2169 goto out;
2398 } 2170 }
2399 btrfs_orphan_del(trans, inode); 2171 ret = btrfs_del_orphan_item(trans, root,
2172 found_key.objectid);
2173 BUG_ON(ret);
2400 btrfs_end_transaction(trans, root); 2174 btrfs_end_transaction(trans, root);
2401 iput(inode);
2402 continue; 2175 continue;
2403 } 2176 }
2404 2177
2178 /*
2179 * add this inode to the orphan list so btrfs_orphan_del does
2180 * the proper thing when we hit it
2181 */
2182 spin_lock(&root->orphan_lock);
2183 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2184 spin_unlock(&root->orphan_lock);
2185
2405 /* if we have links, this was a truncate, lets do that */ 2186 /* if we have links, this was a truncate, lets do that */
2406 if (inode->i_nlink) { 2187 if (inode->i_nlink) {
2407 if (!S_ISREG(inode->i_mode)) { 2188 if (!S_ISREG(inode->i_mode)) {
@@ -2420,6 +2201,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2420 if (ret) 2201 if (ret)
2421 goto out; 2202 goto out;
2422 } 2203 }
2204 /* release the path since we're done with it */
2205 btrfs_release_path(path);
2206
2423 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 2207 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2424 2208
2425 if (root->orphan_block_rsv) 2209 if (root->orphan_block_rsv)
@@ -2534,7 +2318,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
2534 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2318 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2535 struct btrfs_inode_item); 2319 struct btrfs_inode_item);
2536 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 2320 inode->i_mode = btrfs_inode_mode(leaf, inode_item);
2537 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); 2321 set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
2538 inode->i_uid = btrfs_inode_uid(leaf, inode_item); 2322 inode->i_uid = btrfs_inode_uid(leaf, inode_item);
2539 inode->i_gid = btrfs_inode_gid(leaf, inode_item); 2323 inode->i_gid = btrfs_inode_gid(leaf, inode_item);
2540 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); 2324 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
@@ -2647,7 +2431,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2647/* 2431/*
2648 * copy everything in the in-memory inode into the btree. 2432 * copy everything in the in-memory inode into the btree.
2649 */ 2433 */
2650noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 2434static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
2651 struct btrfs_root *root, struct inode *inode) 2435 struct btrfs_root *root, struct inode *inode)
2652{ 2436{
2653 struct btrfs_inode_item *inode_item; 2437 struct btrfs_inode_item *inode_item;
@@ -2655,21 +2439,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2655 struct extent_buffer *leaf; 2439 struct extent_buffer *leaf;
2656 int ret; 2440 int ret;
2657 2441
2658 /*
2659 * If the inode is a free space inode, we can deadlock during commit
2660 * if we put it into the delayed code.
2661 *
2662 * The data relocation inode should also be directly updated
2663 * without delay
2664 */
2665 if (!btrfs_is_free_space_inode(root, inode)
2666 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2667 ret = btrfs_delayed_update_inode(trans, root, inode);
2668 if (!ret)
2669 btrfs_set_inode_last_trans(trans, inode);
2670 return ret;
2671 }
2672
2673 path = btrfs_alloc_path(); 2442 path = btrfs_alloc_path();
2674 if (!path) 2443 if (!path)
2675 return -ENOMEM; 2444 return -ENOMEM;
@@ -2698,6 +2467,43 @@ failed:
2698} 2467}
2699 2468
2700/* 2469/*
2470 * copy everything in the in-memory inode into the btree.
2471 */
2472noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2473 struct btrfs_root *root, struct inode *inode)
2474{
2475 int ret;
2476
2477 /*
2478 * If the inode is a free space inode, we can deadlock during commit
2479 * if we put it into the delayed code.
2480 *
2481 * The data relocation inode should also be directly updated
2482 * without delay
2483 */
2484 if (!btrfs_is_free_space_inode(root, inode)
2485 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2486 ret = btrfs_delayed_update_inode(trans, root, inode);
2487 if (!ret)
2488 btrfs_set_inode_last_trans(trans, inode);
2489 return ret;
2490 }
2491
2492 return btrfs_update_inode_item(trans, root, inode);
2493}
2494
2495static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
2496 struct btrfs_root *root, struct inode *inode)
2497{
2498 int ret;
2499
2500 ret = btrfs_update_inode(trans, root, inode);
2501 if (ret == -ENOSPC)
2502 return btrfs_update_inode_item(trans, root, inode);
2503 return ret;
2504}
2505
2506/*
2701 * unlink helper that gets used here in inode.c and in the tree logging 2507 * unlink helper that gets used here in inode.c and in the tree logging
2702 * recovery code. It remove a link in a directory with a given name, and 2508 * recovery code. It remove a link in a directory with a given name, and
2703 * also drops the back refs in the inode to the directory 2509 * also drops the back refs in the inode to the directory
@@ -2835,7 +2641,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2835 u64 ino = btrfs_ino(inode); 2641 u64 ino = btrfs_ino(inode);
2836 u64 dir_ino = btrfs_ino(dir); 2642 u64 dir_ino = btrfs_ino(dir);
2837 2643
2838 trans = btrfs_start_transaction(root, 10); 2644 /*
2645 * 1 for the possible orphan item
2646 * 1 for the dir item
2647 * 1 for the dir index
2648 * 1 for the inode ref
2649 * 1 for the inode ref in the tree log
2650 * 2 for the dir entries in the log
2651 * 1 for the inode
2652 */
2653 trans = btrfs_start_transaction(root, 8);
2839 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 2654 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2840 return trans; 2655 return trans;
2841 2656
@@ -2858,7 +2673,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2858 return ERR_PTR(-ENOMEM); 2673 return ERR_PTR(-ENOMEM);
2859 } 2674 }
2860 2675
2861 trans = btrfs_start_transaction(root, 0); 2676 /* 1 for the orphan item */
2677 trans = btrfs_start_transaction(root, 1);
2862 if (IS_ERR(trans)) { 2678 if (IS_ERR(trans)) {
2863 btrfs_free_path(path); 2679 btrfs_free_path(path);
2864 root->fs_info->enospc_unlink = 0; 2680 root->fs_info->enospc_unlink = 0;
@@ -2963,6 +2779,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2963 err = 0; 2779 err = 0;
2964out: 2780out:
2965 btrfs_free_path(path); 2781 btrfs_free_path(path);
2782 /* Migrate the orphan reservation over */
2783 if (!err)
2784 err = btrfs_block_rsv_migrate(trans->block_rsv,
2785 &root->fs_info->global_block_rsv,
2786 trans->bytes_reserved);
2787
2966 if (err) { 2788 if (err) {
2967 btrfs_end_transaction(trans, root); 2789 btrfs_end_transaction(trans, root);
2968 root->fs_info->enospc_unlink = 0; 2790 root->fs_info->enospc_unlink = 0;
@@ -2977,6 +2799,9 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2977 struct btrfs_root *root) 2799 struct btrfs_root *root)
2978{ 2800{
2979 if (trans->block_rsv == &root->fs_info->global_block_rsv) { 2801 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2802 btrfs_block_rsv_release(root, trans->block_rsv,
2803 trans->bytes_reserved);
2804 trans->block_rsv = &root->fs_info->trans_block_rsv;
2980 BUG_ON(!root->fs_info->enospc_unlink); 2805 BUG_ON(!root->fs_info->enospc_unlink);
2981 root->fs_info->enospc_unlink = 0; 2806 root->fs_info->enospc_unlink = 0;
2982 } 2807 }
@@ -3368,6 +3193,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3368 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3193 pgoff_t index = from >> PAGE_CACHE_SHIFT;
3369 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3194 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3370 struct page *page; 3195 struct page *page;
3196 gfp_t mask = btrfs_alloc_write_mask(mapping);
3371 int ret = 0; 3197 int ret = 0;
3372 u64 page_start; 3198 u64 page_start;
3373 u64 page_end; 3199 u64 page_end;
@@ -3380,7 +3206,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3380 3206
3381 ret = -ENOMEM; 3207 ret = -ENOMEM;
3382again: 3208again:
3383 page = find_or_create_page(mapping, index, GFP_NOFS); 3209 page = find_or_create_page(mapping, index, mask);
3384 if (!page) { 3210 if (!page) {
3385 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3211 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3386 goto out; 3212 goto out;
@@ -3613,6 +3439,8 @@ void btrfs_evict_inode(struct inode *inode)
3613{ 3439{
3614 struct btrfs_trans_handle *trans; 3440 struct btrfs_trans_handle *trans;
3615 struct btrfs_root *root = BTRFS_I(inode)->root; 3441 struct btrfs_root *root = BTRFS_I(inode)->root;
3442 struct btrfs_block_rsv *rsv, *global_rsv;
3443 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
3616 unsigned long nr; 3444 unsigned long nr;
3617 int ret; 3445 int ret;
3618 3446
@@ -3640,22 +3468,55 @@ void btrfs_evict_inode(struct inode *inode)
3640 goto no_delete; 3468 goto no_delete;
3641 } 3469 }
3642 3470
3471 rsv = btrfs_alloc_block_rsv(root);
3472 if (!rsv) {
3473 btrfs_orphan_del(NULL, inode);
3474 goto no_delete;
3475 }
3476 rsv->size = min_size;
3477 global_rsv = &root->fs_info->global_block_rsv;
3478
3643 btrfs_i_size_write(inode, 0); 3479 btrfs_i_size_write(inode, 0);
3644 3480
3481 /*
3482 * This is a bit simpler than btrfs_truncate since
3483 *
3484 * 1) We've already reserved our space for our orphan item in the
3485 * unlink.
3486 * 2) We're going to delete the inode item, so we don't need to update
3487 * it at all.
3488 *
3489 * So we just need to reserve some slack space in case we add bytes when
3490 * doing the truncate.
3491 */
3645 while (1) { 3492 while (1) {
3646 trans = btrfs_join_transaction(root); 3493 ret = btrfs_block_rsv_refill(root, rsv, min_size);
3647 BUG_ON(IS_ERR(trans)); 3494
3648 trans->block_rsv = root->orphan_block_rsv; 3495 /*
3496 * Try and steal from the global reserve since we will
3497 * likely not use this space anyway, we want to try as
3498 * hard as possible to get this to work.
3499 */
3500 if (ret)
3501 ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
3649 3502
3650 ret = btrfs_block_rsv_check(trans, root,
3651 root->orphan_block_rsv, 0, 5);
3652 if (ret) { 3503 if (ret) {
3653 BUG_ON(ret != -EAGAIN); 3504 printk(KERN_WARNING "Could not get space for a "
3654 ret = btrfs_commit_transaction(trans, root); 3505 "delete, will truncate on mount %d\n", ret);
3655 BUG_ON(ret); 3506 btrfs_orphan_del(NULL, inode);
3656 continue; 3507 btrfs_free_block_rsv(root, rsv);
3508 goto no_delete;
3657 } 3509 }
3658 3510
3511 trans = btrfs_start_transaction(root, 0);
3512 if (IS_ERR(trans)) {
3513 btrfs_orphan_del(NULL, inode);
3514 btrfs_free_block_rsv(root, rsv);
3515 goto no_delete;
3516 }
3517
3518 trans->block_rsv = rsv;
3519
3659 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3520 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3660 if (ret != -EAGAIN) 3521 if (ret != -EAGAIN)
3661 break; 3522 break;
@@ -3664,14 +3525,17 @@ void btrfs_evict_inode(struct inode *inode)
3664 btrfs_end_transaction(trans, root); 3525 btrfs_end_transaction(trans, root);
3665 trans = NULL; 3526 trans = NULL;
3666 btrfs_btree_balance_dirty(root, nr); 3527 btrfs_btree_balance_dirty(root, nr);
3667
3668 } 3528 }
3669 3529
3530 btrfs_free_block_rsv(root, rsv);
3531
3670 if (ret == 0) { 3532 if (ret == 0) {
3533 trans->block_rsv = root->orphan_block_rsv;
3671 ret = btrfs_orphan_del(trans, inode); 3534 ret = btrfs_orphan_del(trans, inode);
3672 BUG_ON(ret); 3535 BUG_ON(ret);
3673 } 3536 }
3674 3537
3538 trans->block_rsv = &root->fs_info->trans_block_rsv;
3675 if (!(root == root->fs_info->tree_root || 3539 if (!(root == root->fs_info->tree_root ||
3676 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3540 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
3677 btrfs_return_ino(root, btrfs_ino(inode)); 3541 btrfs_return_ino(root, btrfs_ino(inode));
@@ -5795,8 +5659,7 @@ again:
5795 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { 5659 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5796 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5660 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5797 if (!ret) 5661 if (!ret)
5798 ret = btrfs_update_inode(trans, root, inode); 5662 err = btrfs_update_inode_fallback(trans, root, inode);
5799 err = ret;
5800 goto out; 5663 goto out;
5801 } 5664 }
5802 5665
@@ -5834,7 +5697,7 @@ again:
5834 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5697 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5835 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5698 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5836 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) 5699 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
5837 btrfs_update_inode(trans, root, inode); 5700 btrfs_update_inode_fallback(trans, root, inode);
5838 ret = 0; 5701 ret = 0;
5839out_unlock: 5702out_unlock:
5840 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, 5703 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
@@ -6289,7 +6152,7 @@ int btrfs_readpage(struct file *file, struct page *page)
6289{ 6152{
6290 struct extent_io_tree *tree; 6153 struct extent_io_tree *tree;
6291 tree = &BTRFS_I(page->mapping->host)->io_tree; 6154 tree = &BTRFS_I(page->mapping->host)->io_tree;
6292 return extent_read_full_page(tree, page, btrfs_get_extent); 6155 return extent_read_full_page(tree, page, btrfs_get_extent, 0);
6293} 6156}
6294 6157
6295static int btrfs_writepage(struct page *page, struct writeback_control *wbc) 6158static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -6541,6 +6404,7 @@ static int btrfs_truncate(struct inode *inode)
6541 struct btrfs_trans_handle *trans; 6404 struct btrfs_trans_handle *trans;
6542 unsigned long nr; 6405 unsigned long nr;
6543 u64 mask = root->sectorsize - 1; 6406 u64 mask = root->sectorsize - 1;
6407 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6544 6408
6545 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6409 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
6546 if (ret) 6410 if (ret)
@@ -6588,19 +6452,23 @@ static int btrfs_truncate(struct inode *inode)
6588 rsv = btrfs_alloc_block_rsv(root); 6452 rsv = btrfs_alloc_block_rsv(root);
6589 if (!rsv) 6453 if (!rsv)
6590 return -ENOMEM; 6454 return -ENOMEM;
6591 btrfs_add_durable_block_rsv(root->fs_info, rsv); 6455 rsv->size = min_size;
6592 6456
6457 /*
6458 * 1 for the truncate slack space
6459 * 1 for the orphan item we're going to add
6460 * 1 for the orphan item deletion
6461 * 1 for updating the inode.
6462 */
6593 trans = btrfs_start_transaction(root, 4); 6463 trans = btrfs_start_transaction(root, 4);
6594 if (IS_ERR(trans)) { 6464 if (IS_ERR(trans)) {
6595 err = PTR_ERR(trans); 6465 err = PTR_ERR(trans);
6596 goto out; 6466 goto out;
6597 } 6467 }
6598 6468
6599 /* 6469 /* Migrate the slack space for the truncate to our reserve */
6600 * Reserve space for the truncate process. Truncate should be adding 6470 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
6601 * space, but if there are snapshots it may end up using space. 6471 min_size);
6602 */
6603 ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
6604 BUG_ON(ret); 6472 BUG_ON(ret);
6605 6473
6606 ret = btrfs_orphan_add(trans, inode); 6474 ret = btrfs_orphan_add(trans, inode);
@@ -6609,21 +6477,6 @@ static int btrfs_truncate(struct inode *inode)
6609 goto out; 6477 goto out;
6610 } 6478 }
6611 6479
6612 nr = trans->blocks_used;
6613 btrfs_end_transaction(trans, root);
6614 btrfs_btree_balance_dirty(root, nr);
6615
6616 /*
6617 * Ok so we've already migrated our bytes over for the truncate, so here
6618 * just reserve the one slot we need for updating the inode.
6619 */
6620 trans = btrfs_start_transaction(root, 1);
6621 if (IS_ERR(trans)) {
6622 err = PTR_ERR(trans);
6623 goto out;
6624 }
6625 trans->block_rsv = rsv;
6626
6627 /* 6480 /*
6628 * setattr is responsible for setting the ordered_data_close flag, 6481 * setattr is responsible for setting the ordered_data_close flag,
6629 * but that is only tested during the last file release. That 6482 * but that is only tested during the last file release. That
@@ -6645,20 +6498,30 @@ static int btrfs_truncate(struct inode *inode)
6645 btrfs_add_ordered_operation(trans, root, inode); 6498 btrfs_add_ordered_operation(trans, root, inode);
6646 6499
6647 while (1) { 6500 while (1) {
6501 ret = btrfs_block_rsv_refill(root, rsv, min_size);
6502 if (ret) {
6503 /*
6504 * This can only happen with the original transaction we
6505 * started above, every other time we shouldn't have a
6506 * transaction started yet.
6507 */
6508 if (ret == -EAGAIN)
6509 goto end_trans;
6510 err = ret;
6511 break;
6512 }
6513
6648 if (!trans) { 6514 if (!trans) {
6649 trans = btrfs_start_transaction(root, 3); 6515 /* Just need the 1 for updating the inode */
6516 trans = btrfs_start_transaction(root, 1);
6650 if (IS_ERR(trans)) { 6517 if (IS_ERR(trans)) {
6651 err = PTR_ERR(trans); 6518 err = PTR_ERR(trans);
6652 goto out; 6519 goto out;
6653 } 6520 }
6654
6655 ret = btrfs_truncate_reserve_metadata(trans, root,
6656 rsv);
6657 BUG_ON(ret);
6658
6659 trans->block_rsv = rsv;
6660 } 6521 }
6661 6522
6523 trans->block_rsv = rsv;
6524
6662 ret = btrfs_truncate_inode_items(trans, root, inode, 6525 ret = btrfs_truncate_inode_items(trans, root, inode,
6663 inode->i_size, 6526 inode->i_size,
6664 BTRFS_EXTENT_DATA_KEY); 6527 BTRFS_EXTENT_DATA_KEY);
@@ -6673,7 +6536,7 @@ static int btrfs_truncate(struct inode *inode)
6673 err = ret; 6536 err = ret;
6674 break; 6537 break;
6675 } 6538 }
6676 6539end_trans:
6677 nr = trans->blocks_used; 6540 nr = trans->blocks_used;
6678 btrfs_end_transaction(trans, root); 6541 btrfs_end_transaction(trans, root);
6679 trans = NULL; 6542 trans = NULL;
@@ -6693,14 +6556,16 @@ static int btrfs_truncate(struct inode *inode)
6693 ret = btrfs_orphan_del(NULL, inode); 6556 ret = btrfs_orphan_del(NULL, inode);
6694 } 6557 }
6695 6558
6696 trans->block_rsv = &root->fs_info->trans_block_rsv; 6559 if (trans) {
6697 ret = btrfs_update_inode(trans, root, inode); 6560 trans->block_rsv = &root->fs_info->trans_block_rsv;
6698 if (ret && !err) 6561 ret = btrfs_update_inode(trans, root, inode);
6699 err = ret; 6562 if (ret && !err)
6563 err = ret;
6700 6564
6701 nr = trans->blocks_used; 6565 nr = trans->blocks_used;
6702 ret = btrfs_end_transaction_throttle(trans, root); 6566 ret = btrfs_end_transaction_throttle(trans, root);
6703 btrfs_btree_balance_dirty(root, nr); 6567 btrfs_btree_balance_dirty(root, nr);
6568 }
6704 6569
6705out: 6570out:
6706 btrfs_free_block_rsv(root, rsv); 6571 btrfs_free_block_rsv(root, rsv);
@@ -6728,7 +6593,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
6728 inode->i_op = &btrfs_dir_inode_operations; 6593 inode->i_op = &btrfs_dir_inode_operations;
6729 inode->i_fop = &btrfs_dir_file_operations; 6594 inode->i_fop = &btrfs_dir_file_operations;
6730 6595
6731 inode->i_nlink = 1; 6596 set_nlink(inode, 1);
6732 btrfs_i_size_write(inode, 0); 6597 btrfs_i_size_write(inode, 0);
6733 6598
6734 err = btrfs_update_inode(trans, new_root, inode); 6599 err = btrfs_update_inode(trans, new_root, inode);
@@ -6755,9 +6620,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6755 ei->last_sub_trans = 0; 6620 ei->last_sub_trans = 0;
6756 ei->logged_trans = 0; 6621 ei->logged_trans = 0;
6757 ei->delalloc_bytes = 0; 6622 ei->delalloc_bytes = 0;
6758 ei->reserved_bytes = 0;
6759 ei->disk_i_size = 0; 6623 ei->disk_i_size = 0;
6760 ei->flags = 0; 6624 ei->flags = 0;
6625 ei->csum_bytes = 0;
6761 ei->index_cnt = (u64)-1; 6626 ei->index_cnt = (u64)-1;
6762 ei->last_unlink_trans = 0; 6627 ei->last_unlink_trans = 0;
6763 6628
@@ -6769,6 +6634,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6769 ei->orphan_meta_reserved = 0; 6634 ei->orphan_meta_reserved = 0;
6770 ei->dummy_inode = 0; 6635 ei->dummy_inode = 0;
6771 ei->in_defrag = 0; 6636 ei->in_defrag = 0;
6637 ei->delalloc_meta_reserved = 0;
6772 ei->force_compress = BTRFS_COMPRESS_NONE; 6638 ei->force_compress = BTRFS_COMPRESS_NONE;
6773 6639
6774 ei->delayed_node = NULL; 6640 ei->delayed_node = NULL;
@@ -6803,6 +6669,8 @@ void btrfs_destroy_inode(struct inode *inode)
6803 WARN_ON(inode->i_data.nrpages); 6669 WARN_ON(inode->i_data.nrpages);
6804 WARN_ON(BTRFS_I(inode)->outstanding_extents); 6670 WARN_ON(BTRFS_I(inode)->outstanding_extents);
6805 WARN_ON(BTRFS_I(inode)->reserved_extents); 6671 WARN_ON(BTRFS_I(inode)->reserved_extents);
6672 WARN_ON(BTRFS_I(inode)->delalloc_bytes);
6673 WARN_ON(BTRFS_I(inode)->csum_bytes);
6806 6674
6807 /* 6675 /*
6808 * This can happen where we create an inode, but somebody else also 6676 * This can happen where we create an inode, but somebody else also
@@ -7420,7 +7288,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
7420 .readpage_end_io_hook = btrfs_readpage_end_io_hook, 7288 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
7421 .writepage_end_io_hook = btrfs_writepage_end_io_hook, 7289 .writepage_end_io_hook = btrfs_writepage_end_io_hook,
7422 .writepage_start_hook = btrfs_writepage_start_hook, 7290 .writepage_start_hook = btrfs_writepage_start_hook,
7423 .readpage_io_failed_hook = btrfs_io_failed_hook,
7424 .set_bit_hook = btrfs_set_bit_hook, 7291 .set_bit_hook = btrfs_set_bit_hook,
7425 .clear_bit_hook = btrfs_clear_bit_hook, 7292 .clear_bit_hook = btrfs_clear_bit_hook,
7426 .merge_extent_hook = btrfs_merge_extent_hook, 7293 .merge_extent_hook = btrfs_merge_extent_hook,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index dae5dfe41ba..4a34c472f12 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -51,6 +51,7 @@
51#include "volumes.h" 51#include "volumes.h"
52#include "locking.h" 52#include "locking.h"
53#include "inode-map.h" 53#include "inode-map.h"
54#include "backref.h"
54 55
55/* Mask out flags that are inappropriate for the given type of inode. */ 56/* Mask out flags that are inappropriate for the given type of inode. */
56static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 57static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -117,7 +118,7 @@ void btrfs_update_iflags(struct inode *inode)
117/* 118/*
118 * Inherit flags from the parent inode. 119 * Inherit flags from the parent inode.
119 * 120 *
120 * Unlike extN we don't have any flags we don't want to inherit currently. 121 * Currently only the compression flags and the cow flags are inherited.
121 */ 122 */
122void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) 123void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
123{ 124{
@@ -128,12 +129,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
128 129
129 flags = BTRFS_I(dir)->flags; 130 flags = BTRFS_I(dir)->flags;
130 131
131 if (S_ISREG(inode->i_mode)) 132 if (flags & BTRFS_INODE_NOCOMPRESS) {
132 flags &= ~BTRFS_INODE_DIRSYNC; 133 BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
133 else if (!S_ISDIR(inode->i_mode)) 134 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
134 flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); 135 } else if (flags & BTRFS_INODE_COMPRESS) {
136 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
137 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
138 }
139
140 if (flags & BTRFS_INODE_NODATACOW)
141 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
135 142
136 BTRFS_I(inode)->flags = flags;
137 btrfs_update_iflags(inode); 143 btrfs_update_iflags(inode);
138} 144}
139 145
@@ -277,6 +283,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
277 struct fstrim_range range; 283 struct fstrim_range range;
278 u64 minlen = ULLONG_MAX; 284 u64 minlen = ULLONG_MAX;
279 u64 num_devices = 0; 285 u64 num_devices = 0;
286 u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
280 int ret; 287 int ret;
281 288
282 if (!capable(CAP_SYS_ADMIN)) 289 if (!capable(CAP_SYS_ADMIN))
@@ -295,12 +302,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
295 } 302 }
296 } 303 }
297 rcu_read_unlock(); 304 rcu_read_unlock();
305
298 if (!num_devices) 306 if (!num_devices)
299 return -EOPNOTSUPP; 307 return -EOPNOTSUPP;
300
301 if (copy_from_user(&range, arg, sizeof(range))) 308 if (copy_from_user(&range, arg, sizeof(range)))
302 return -EFAULT; 309 return -EFAULT;
310 if (range.start > total_bytes)
311 return -EINVAL;
303 312
313 range.len = min(range.len, total_bytes - range.start);
304 range.minlen = max(range.minlen, minlen); 314 range.minlen = max(range.minlen, minlen);
305 ret = btrfs_trim_fs(root, &range); 315 ret = btrfs_trim_fs(root, &range);
306 if (ret < 0) 316 if (ret < 0)
@@ -760,7 +770,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
760 int ret = 1; 770 int ret = 1;
761 771
762 /* 772 /*
763 * make sure that once we start defragging and extent, we keep on 773 * make sure that once we start defragging an extent, we keep on
764 * defragging it 774 * defragging it
765 */ 775 */
766 if (start < *defrag_end) 776 if (start < *defrag_end)
@@ -805,7 +815,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
805 * extent will force at least part of that big extent to be defragged. 815 * extent will force at least part of that big extent to be defragged.
806 */ 816 */
807 if (ret) { 817 if (ret) {
808 *last_len += len;
809 *defrag_end = extent_map_end(em); 818 *defrag_end = extent_map_end(em);
810 } else { 819 } else {
811 *last_len = 0; 820 *last_len = 0;
@@ -843,6 +852,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
843 int i_done; 852 int i_done;
844 struct btrfs_ordered_extent *ordered; 853 struct btrfs_ordered_extent *ordered;
845 struct extent_state *cached_state = NULL; 854 struct extent_state *cached_state = NULL;
855 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
846 856
847 if (isize == 0) 857 if (isize == 0)
848 return 0; 858 return 0;
@@ -860,7 +870,7 @@ again:
860 for (i = 0; i < num_pages; i++) { 870 for (i = 0; i < num_pages; i++) {
861 struct page *page; 871 struct page *page;
862 page = find_or_create_page(inode->i_mapping, 872 page = find_or_create_page(inode->i_mapping,
863 start_index + i, GFP_NOFS); 873 start_index + i, mask);
864 if (!page) 874 if (!page)
865 break; 875 break;
866 876
@@ -972,18 +982,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
972 struct btrfs_super_block *disk_super; 982 struct btrfs_super_block *disk_super;
973 struct file_ra_state *ra = NULL; 983 struct file_ra_state *ra = NULL;
974 unsigned long last_index; 984 unsigned long last_index;
985 u64 isize = i_size_read(inode);
975 u64 features; 986 u64 features;
976 u64 last_len = 0; 987 u64 last_len = 0;
977 u64 skip = 0; 988 u64 skip = 0;
978 u64 defrag_end = 0; 989 u64 defrag_end = 0;
979 u64 newer_off = range->start; 990 u64 newer_off = range->start;
980 int newer_left = 0;
981 unsigned long i; 991 unsigned long i;
992 unsigned long ra_index = 0;
982 int ret; 993 int ret;
983 int defrag_count = 0; 994 int defrag_count = 0;
984 int compress_type = BTRFS_COMPRESS_ZLIB; 995 int compress_type = BTRFS_COMPRESS_ZLIB;
985 int extent_thresh = range->extent_thresh; 996 int extent_thresh = range->extent_thresh;
986 int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; 997 int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
998 int cluster = max_cluster;
987 u64 new_align = ~((u64)128 * 1024 - 1); 999 u64 new_align = ~((u64)128 * 1024 - 1);
988 struct page **pages = NULL; 1000 struct page **pages = NULL;
989 1001
@@ -997,7 +1009,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
997 compress_type = range->compress_type; 1009 compress_type = range->compress_type;
998 } 1010 }
999 1011
1000 if (inode->i_size == 0) 1012 if (isize == 0)
1001 return 0; 1013 return 0;
1002 1014
1003 /* 1015 /*
@@ -1013,7 +1025,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1013 ra = &file->f_ra; 1025 ra = &file->f_ra;
1014 } 1026 }
1015 1027
1016 pages = kmalloc(sizeof(struct page *) * newer_cluster, 1028 pages = kmalloc(sizeof(struct page *) * max_cluster,
1017 GFP_NOFS); 1029 GFP_NOFS);
1018 if (!pages) { 1030 if (!pages) {
1019 ret = -ENOMEM; 1031 ret = -ENOMEM;
@@ -1022,10 +1034,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1022 1034
1023 /* find the last page to defrag */ 1035 /* find the last page to defrag */
1024 if (range->start + range->len > range->start) { 1036 if (range->start + range->len > range->start) {
1025 last_index = min_t(u64, inode->i_size - 1, 1037 last_index = min_t(u64, isize - 1,
1026 range->start + range->len - 1) >> PAGE_CACHE_SHIFT; 1038 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
1027 } else { 1039 } else {
1028 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; 1040 last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1029 } 1041 }
1030 1042
1031 if (newer_than) { 1043 if (newer_than) {
@@ -1038,14 +1050,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1038 * the extents in the file evenly spaced 1050 * the extents in the file evenly spaced
1039 */ 1051 */
1040 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1052 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1041 newer_left = newer_cluster;
1042 } else 1053 } else
1043 goto out_ra; 1054 goto out_ra;
1044 } else { 1055 } else {
1045 i = range->start >> PAGE_CACHE_SHIFT; 1056 i = range->start >> PAGE_CACHE_SHIFT;
1046 } 1057 }
1047 if (!max_to_defrag) 1058 if (!max_to_defrag)
1048 max_to_defrag = last_index - 1; 1059 max_to_defrag = last_index;
1049 1060
1050 /* 1061 /*
1051 * make writeback starts from i, so the defrag range can be 1062 * make writeback starts from i, so the defrag range can be
@@ -1079,18 +1090,31 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1079 i = max(i + 1, next); 1090 i = max(i + 1, next);
1080 continue; 1091 continue;
1081 } 1092 }
1093
1094 if (!newer_than) {
1095 cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
1096 PAGE_CACHE_SHIFT) - i;
1097 cluster = min(cluster, max_cluster);
1098 } else {
1099 cluster = max_cluster;
1100 }
1101
1082 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 1102 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
1083 BTRFS_I(inode)->force_compress = compress_type; 1103 BTRFS_I(inode)->force_compress = compress_type;
1084 1104
1085 btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); 1105 if (i + cluster > ra_index) {
1106 ra_index = max(i, ra_index);
1107 btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
1108 cluster);
1109 ra_index += max_cluster;
1110 }
1086 1111
1087 ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); 1112 ret = cluster_pages_for_defrag(inode, pages, i, cluster);
1088 if (ret < 0) 1113 if (ret < 0)
1089 goto out_ra; 1114 goto out_ra;
1090 1115
1091 defrag_count += ret; 1116 defrag_count += ret;
1092 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); 1117 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
1093 i += ret;
1094 1118
1095 if (newer_than) { 1119 if (newer_than) {
1096 if (newer_off == (u64)-1) 1120 if (newer_off == (u64)-1)
@@ -1105,12 +1129,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1105 if (!ret) { 1129 if (!ret) {
1106 range->start = newer_off; 1130 range->start = newer_off;
1107 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1131 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1108 newer_left = newer_cluster;
1109 } else { 1132 } else {
1110 break; 1133 break;
1111 } 1134 }
1112 } else { 1135 } else {
1113 i++; 1136 if (ret > 0) {
1137 i += ret;
1138 last_len += ret << PAGE_CACHE_SHIFT;
1139 } else {
1140 i++;
1141 last_len = 0;
1142 }
1114 } 1143 }
1115 } 1144 }
1116 1145
@@ -1136,16 +1165,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1136 mutex_unlock(&inode->i_mutex); 1165 mutex_unlock(&inode->i_mutex);
1137 } 1166 }
1138 1167
1139 disk_super = &root->fs_info->super_copy; 1168 disk_super = root->fs_info->super_copy;
1140 features = btrfs_super_incompat_flags(disk_super); 1169 features = btrfs_super_incompat_flags(disk_super);
1141 if (range->compress_type == BTRFS_COMPRESS_LZO) { 1170 if (range->compress_type == BTRFS_COMPRESS_LZO) {
1142 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; 1171 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
1143 btrfs_set_super_incompat_flags(disk_super, features); 1172 btrfs_set_super_incompat_flags(disk_super, features);
1144 } 1173 }
1145 1174
1146 if (!file) 1175 ret = defrag_count;
1147 kfree(ra);
1148 return defrag_count;
1149 1176
1150out_ra: 1177out_ra:
1151 if (!file) 1178 if (!file)
@@ -2587,7 +2614,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2587 return PTR_ERR(trans); 2614 return PTR_ERR(trans);
2588 } 2615 }
2589 2616
2590 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 2617 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
2591 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, 2618 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
2592 dir_id, "default", 7, 1); 2619 dir_id, "default", 7, 1);
2593 if (IS_ERR_OR_NULL(di)) { 2620 if (IS_ERR_OR_NULL(di)) {
@@ -2603,7 +2630,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2603 btrfs_mark_buffer_dirty(path->nodes[0]); 2630 btrfs_mark_buffer_dirty(path->nodes[0]);
2604 btrfs_free_path(path); 2631 btrfs_free_path(path);
2605 2632
2606 disk_super = &root->fs_info->super_copy; 2633 disk_super = root->fs_info->super_copy;
2607 features = btrfs_super_incompat_flags(disk_super); 2634 features = btrfs_super_incompat_flags(disk_super);
2608 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { 2635 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
2609 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; 2636 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
@@ -2864,6 +2891,144 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
2864 return ret; 2891 return ret;
2865} 2892}
2866 2893
2894static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
2895{
2896 int ret = 0;
2897 int i;
2898 u64 rel_ptr;
2899 int size;
2900 struct btrfs_ioctl_ino_path_args *ipa = NULL;
2901 struct inode_fs_paths *ipath = NULL;
2902 struct btrfs_path *path;
2903
2904 if (!capable(CAP_SYS_ADMIN))
2905 return -EPERM;
2906
2907 path = btrfs_alloc_path();
2908 if (!path) {
2909 ret = -ENOMEM;
2910 goto out;
2911 }
2912
2913 ipa = memdup_user(arg, sizeof(*ipa));
2914 if (IS_ERR(ipa)) {
2915 ret = PTR_ERR(ipa);
2916 ipa = NULL;
2917 goto out;
2918 }
2919
2920 size = min_t(u32, ipa->size, 4096);
2921 ipath = init_ipath(size, root, path);
2922 if (IS_ERR(ipath)) {
2923 ret = PTR_ERR(ipath);
2924 ipath = NULL;
2925 goto out;
2926 }
2927
2928 ret = paths_from_inode(ipa->inum, ipath);
2929 if (ret < 0)
2930 goto out;
2931
2932 for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
2933 rel_ptr = ipath->fspath->val[i] - (u64)ipath->fspath->val;
2934 ipath->fspath->val[i] = rel_ptr;
2935 }
2936
2937 ret = copy_to_user((void *)ipa->fspath, (void *)ipath->fspath, size);
2938 if (ret) {
2939 ret = -EFAULT;
2940 goto out;
2941 }
2942
2943out:
2944 btrfs_free_path(path);
2945 free_ipath(ipath);
2946 kfree(ipa);
2947
2948 return ret;
2949}
2950
2951static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
2952{
2953 struct btrfs_data_container *inodes = ctx;
2954 const size_t c = 3 * sizeof(u64);
2955
2956 if (inodes->bytes_left >= c) {
2957 inodes->bytes_left -= c;
2958 inodes->val[inodes->elem_cnt] = inum;
2959 inodes->val[inodes->elem_cnt + 1] = offset;
2960 inodes->val[inodes->elem_cnt + 2] = root;
2961 inodes->elem_cnt += 3;
2962 } else {
2963 inodes->bytes_missing += c - inodes->bytes_left;
2964 inodes->bytes_left = 0;
2965 inodes->elem_missed += 3;
2966 }
2967
2968 return 0;
2969}
2970
2971static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
2972 void __user *arg)
2973{
2974 int ret = 0;
2975 int size;
2976 u64 extent_offset;
2977 struct btrfs_ioctl_logical_ino_args *loi;
2978 struct btrfs_data_container *inodes = NULL;
2979 struct btrfs_path *path = NULL;
2980 struct btrfs_key key;
2981
2982 if (!capable(CAP_SYS_ADMIN))
2983 return -EPERM;
2984
2985 loi = memdup_user(arg, sizeof(*loi));
2986 if (IS_ERR(loi)) {
2987 ret = PTR_ERR(loi);
2988 loi = NULL;
2989 goto out;
2990 }
2991
2992 path = btrfs_alloc_path();
2993 if (!path) {
2994 ret = -ENOMEM;
2995 goto out;
2996 }
2997
2998 size = min_t(u32, loi->size, 4096);
2999 inodes = init_data_container(size);
3000 if (IS_ERR(inodes)) {
3001 ret = PTR_ERR(inodes);
3002 inodes = NULL;
3003 goto out;
3004 }
3005
3006 ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
3007
3008 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
3009 ret = -ENOENT;
3010 if (ret < 0)
3011 goto out;
3012
3013 extent_offset = loi->logical - key.objectid;
3014 ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
3015 extent_offset, build_ino_list, inodes);
3016
3017 if (ret < 0)
3018 goto out;
3019
3020 ret = copy_to_user((void *)loi->inodes, (void *)inodes, size);
3021 if (ret)
3022 ret = -EFAULT;
3023
3024out:
3025 btrfs_free_path(path);
3026 kfree(inodes);
3027 kfree(loi);
3028
3029 return ret;
3030}
3031
2867long btrfs_ioctl(struct file *file, unsigned int 3032long btrfs_ioctl(struct file *file, unsigned int
2868 cmd, unsigned long arg) 3033 cmd, unsigned long arg)
2869{ 3034{
@@ -2921,6 +3086,10 @@ long btrfs_ioctl(struct file *file, unsigned int
2921 return btrfs_ioctl_tree_search(file, argp); 3086 return btrfs_ioctl_tree_search(file, argp);
2922 case BTRFS_IOC_INO_LOOKUP: 3087 case BTRFS_IOC_INO_LOOKUP:
2923 return btrfs_ioctl_ino_lookup(file, argp); 3088 return btrfs_ioctl_ino_lookup(file, argp);
3089 case BTRFS_IOC_INO_PATHS:
3090 return btrfs_ioctl_ino_to_path(root, argp);
3091 case BTRFS_IOC_LOGICAL_INO:
3092 return btrfs_ioctl_logical_to_ino(root, argp);
2924 case BTRFS_IOC_SPACE_INFO: 3093 case BTRFS_IOC_SPACE_INFO:
2925 return btrfs_ioctl_space_info(root, argp); 3094 return btrfs_ioctl_space_info(root, argp);
2926 case BTRFS_IOC_SYNC: 3095 case BTRFS_IOC_SYNC:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index ad1ea789fcb..252ae9915de 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -193,6 +193,30 @@ struct btrfs_ioctl_space_args {
193 struct btrfs_ioctl_space_info spaces[0]; 193 struct btrfs_ioctl_space_info spaces[0];
194}; 194};
195 195
196struct btrfs_data_container {
197 __u32 bytes_left; /* out -- bytes not needed to deliver output */
198 __u32 bytes_missing; /* out -- additional bytes needed for result */
199 __u32 elem_cnt; /* out */
200 __u32 elem_missed; /* out */
201 __u64 val[0]; /* out */
202};
203
204struct btrfs_ioctl_ino_path_args {
205 __u64 inum; /* in */
206 __u32 size; /* in */
207 __u64 reserved[4];
208 /* struct btrfs_data_container *fspath; out */
209 __u64 fspath; /* out */
210};
211
212struct btrfs_ioctl_logical_ino_args {
213 __u64 logical; /* in */
214 __u32 size; /* in */
215 __u64 reserved[4];
216 /* struct btrfs_data_container *inodes; out */
217 __u64 inodes;
218};
219
196#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 220#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
197 struct btrfs_ioctl_vol_args) 221 struct btrfs_ioctl_vol_args)
198#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 222#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -248,4 +272,9 @@ struct btrfs_ioctl_space_args {
248 struct btrfs_ioctl_dev_info_args) 272 struct btrfs_ioctl_dev_info_args)
249#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ 273#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
250 struct btrfs_ioctl_fs_info_args) 274 struct btrfs_ioctl_fs_info_args)
275#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
276 struct btrfs_ioctl_ino_path_args)
277#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
278 struct btrfs_ioctl_ino_path_args)
279
251#endif 280#endif
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fb2605d998e..f38e452486b 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) 158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
159{ 159{
160 int i; 160 int i;
161 u32 type; 161 u32 type, nr;
162 u32 nr = btrfs_header_nritems(l);
163 struct btrfs_item *item; 162 struct btrfs_item *item;
164 struct btrfs_root_item *ri; 163 struct btrfs_root_item *ri;
165 struct btrfs_dir_item *di; 164 struct btrfs_dir_item *di;
@@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
172 struct btrfs_key key; 171 struct btrfs_key key;
173 struct btrfs_key found_key; 172 struct btrfs_key found_key;
174 173
174 if (!l)
175 return;
176
177 nr = btrfs_header_nritems(l);
178
175 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", 179 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
176 (unsigned long long)btrfs_header_bytenr(l), nr, 180 (unsigned long long)btrfs_header_bytenr(l), nr,
177 btrfs_leaf_free_space(root, l)); 181 btrfs_leaf_free_space(root, l));
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
new file mode 100644
index 00000000000..2373b39a132
--- /dev/null
+++ b/fs/btrfs/reada.c
@@ -0,0 +1,951 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/pagemap.h>
21#include <linux/writeback.h>
22#include <linux/blkdev.h>
23#include <linux/rbtree.h>
24#include <linux/slab.h>
25#include <linux/workqueue.h>
26#include "ctree.h"
27#include "volumes.h"
28#include "disk-io.h"
29#include "transaction.h"
30
31#undef DEBUG
32
33/*
34 * This is the implementation for the generic read ahead framework.
35 *
36 * To trigger a readahead, btrfs_reada_add must be called. It will start
37 * a read ahead for the given range [start, end) on tree root. The returned
38 * handle can either be used to wait on the readahead to finish
39 * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
40 *
41 * The read ahead works as follows:
42 * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
43 * reada_start_machine will then search for extents to prefetch and trigger
44 * some reads. When a read finishes for a node, all contained node/leaf
45 * pointers that lie in the given range will also be enqueued. The reads will
46 * be triggered in sequential order, thus giving a big win over a naive
47 * enumeration. It will also make use of multi-device layouts. Each disk
48 * will have its on read pointer and all disks will by utilized in parallel.
49 * Also will no two disks read both sides of a mirror simultaneously, as this
50 * would waste seeking capacity. Instead both disks will read different parts
51 * of the filesystem.
52 * Any number of readaheads can be started in parallel. The read order will be
53 * determined globally, i.e. 2 parallel readaheads will normally finish faster
54 * than the 2 started one after another.
55 */
56
57#define MAX_MIRRORS 2
58#define MAX_IN_FLIGHT 6
59
60struct reada_extctl {
61 struct list_head list;
62 struct reada_control *rc;
63 u64 generation;
64};
65
66struct reada_extent {
67 u64 logical;
68 struct btrfs_key top;
69 u32 blocksize;
70 int err;
71 struct list_head extctl;
72 struct kref refcnt;
73 spinlock_t lock;
74 struct reada_zone *zones[MAX_MIRRORS];
75 int nzones;
76 struct btrfs_device *scheduled_for;
77};
78
79struct reada_zone {
80 u64 start;
81 u64 end;
82 u64 elems;
83 struct list_head list;
84 spinlock_t lock;
85 int locked;
86 struct btrfs_device *device;
87 struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */
88 int ndevs;
89 struct kref refcnt;
90};
91
92struct reada_machine_work {
93 struct btrfs_work work;
94 struct btrfs_fs_info *fs_info;
95};
96
97static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
98static void reada_control_release(struct kref *kref);
99static void reada_zone_release(struct kref *kref);
100static void reada_start_machine(struct btrfs_fs_info *fs_info);
101static void __reada_start_machine(struct btrfs_fs_info *fs_info);
102
103static int reada_add_block(struct reada_control *rc, u64 logical,
104 struct btrfs_key *top, int level, u64 generation);
105
106/* recurses */
107/* in case of err, eb might be NULL */
108static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
109 u64 start, int err)
110{
111 int level = 0;
112 int nritems;
113 int i;
114 u64 bytenr;
115 u64 generation;
116 struct reada_extent *re;
117 struct btrfs_fs_info *fs_info = root->fs_info;
118 struct list_head list;
119 unsigned long index = start >> PAGE_CACHE_SHIFT;
120 struct btrfs_device *for_dev;
121
122 if (eb)
123 level = btrfs_header_level(eb);
124
125 /* find extent */
126 spin_lock(&fs_info->reada_lock);
127 re = radix_tree_lookup(&fs_info->reada_tree, index);
128 if (re)
129 kref_get(&re->refcnt);
130 spin_unlock(&fs_info->reada_lock);
131
132 if (!re)
133 return -1;
134
135 spin_lock(&re->lock);
136 /*
137 * just take the full list from the extent. afterwards we
138 * don't need the lock anymore
139 */
140 list_replace_init(&re->extctl, &list);
141 for_dev = re->scheduled_for;
142 re->scheduled_for = NULL;
143 spin_unlock(&re->lock);
144
145 if (err == 0) {
146 nritems = level ? btrfs_header_nritems(eb) : 0;
147 generation = btrfs_header_generation(eb);
148 /*
149 * FIXME: currently we just set nritems to 0 if this is a leaf,
150 * effectively ignoring the content. In a next step we could
151 * trigger more readahead depending from the content, e.g.
152 * fetch the checksums for the extents in the leaf.
153 */
154 } else {
155 /*
156 * this is the error case, the extent buffer has not been
157 * read correctly. We won't access anything from it and
158 * just cleanup our data structures. Effectively this will
159 * cut the branch below this node from read ahead.
160 */
161 nritems = 0;
162 generation = 0;
163 }
164
165 for (i = 0; i < nritems; i++) {
166 struct reada_extctl *rec;
167 u64 n_gen;
168 struct btrfs_key key;
169 struct btrfs_key next_key;
170
171 btrfs_node_key_to_cpu(eb, &key, i);
172 if (i + 1 < nritems)
173 btrfs_node_key_to_cpu(eb, &next_key, i + 1);
174 else
175 next_key = re->top;
176 bytenr = btrfs_node_blockptr(eb, i);
177 n_gen = btrfs_node_ptr_generation(eb, i);
178
179 list_for_each_entry(rec, &list, list) {
180 struct reada_control *rc = rec->rc;
181
182 /*
183 * if the generation doesn't match, just ignore this
184 * extctl. This will probably cut off a branch from
185 * prefetch. Alternatively one could start a new (sub-)
186 * prefetch for this branch, starting again from root.
187 * FIXME: move the generation check out of this loop
188 */
189#ifdef DEBUG
190 if (rec->generation != generation) {
191 printk(KERN_DEBUG "generation mismatch for "
192 "(%llu,%d,%llu) %llu != %llu\n",
193 key.objectid, key.type, key.offset,
194 rec->generation, generation);
195 }
196#endif
197 if (rec->generation == generation &&
198 btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
199 btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
200 reada_add_block(rc, bytenr, &next_key,
201 level - 1, n_gen);
202 }
203 }
204 /*
205 * free extctl records
206 */
207 while (!list_empty(&list)) {
208 struct reada_control *rc;
209 struct reada_extctl *rec;
210
211 rec = list_first_entry(&list, struct reada_extctl, list);
212 list_del(&rec->list);
213 rc = rec->rc;
214 kfree(rec);
215
216 kref_get(&rc->refcnt);
217 if (atomic_dec_and_test(&rc->elems)) {
218 kref_put(&rc->refcnt, reada_control_release);
219 wake_up(&rc->wait);
220 }
221 kref_put(&rc->refcnt, reada_control_release);
222
223 reada_extent_put(fs_info, re); /* one ref for each entry */
224 }
225 reada_extent_put(fs_info, re); /* our ref */
226 if (for_dev)
227 atomic_dec(&for_dev->reada_in_flight);
228
229 return 0;
230}
231
232/*
233 * start is passed separately in case eb in NULL, which may be the case with
234 * failed I/O
235 */
236int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
237 u64 start, int err)
238{
239 int ret;
240
241 ret = __readahead_hook(root, eb, start, err);
242
243 reada_start_machine(root->fs_info);
244
245 return ret;
246}
247
248static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
249 struct btrfs_device *dev, u64 logical,
250 struct btrfs_bio *bbio)
251{
252 int ret;
253 int looped = 0;
254 struct reada_zone *zone;
255 struct btrfs_block_group_cache *cache = NULL;
256 u64 start;
257 u64 end;
258 int i;
259
260again:
261 zone = NULL;
262 spin_lock(&fs_info->reada_lock);
263 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
264 logical >> PAGE_CACHE_SHIFT, 1);
265 if (ret == 1)
266 kref_get(&zone->refcnt);
267 spin_unlock(&fs_info->reada_lock);
268
269 if (ret == 1) {
270 if (logical >= zone->start && logical < zone->end)
271 return zone;
272 spin_lock(&fs_info->reada_lock);
273 kref_put(&zone->refcnt, reada_zone_release);
274 spin_unlock(&fs_info->reada_lock);
275 }
276
277 if (looped)
278 return NULL;
279
280 cache = btrfs_lookup_block_group(fs_info, logical);
281 if (!cache)
282 return NULL;
283
284 start = cache->key.objectid;
285 end = start + cache->key.offset - 1;
286 btrfs_put_block_group(cache);
287
288 zone = kzalloc(sizeof(*zone), GFP_NOFS);
289 if (!zone)
290 return NULL;
291
292 zone->start = start;
293 zone->end = end;
294 INIT_LIST_HEAD(&zone->list);
295 spin_lock_init(&zone->lock);
296 zone->locked = 0;
297 kref_init(&zone->refcnt);
298 zone->elems = 0;
299 zone->device = dev; /* our device always sits at index 0 */
300 for (i = 0; i < bbio->num_stripes; ++i) {
301 /* bounds have already been checked */
302 zone->devs[i] = bbio->stripes[i].dev;
303 }
304 zone->ndevs = bbio->num_stripes;
305
306 spin_lock(&fs_info->reada_lock);
307 ret = radix_tree_insert(&dev->reada_zones,
308 (unsigned long)zone->end >> PAGE_CACHE_SHIFT,
309 zone);
310 spin_unlock(&fs_info->reada_lock);
311
312 if (ret) {
313 kfree(zone);
314 looped = 1;
315 goto again;
316 }
317
318 return zone;
319}
320
321static struct reada_extent *reada_find_extent(struct btrfs_root *root,
322 u64 logical,
323 struct btrfs_key *top, int level)
324{
325 int ret;
326 int looped = 0;
327 struct reada_extent *re = NULL;
328 struct btrfs_fs_info *fs_info = root->fs_info;
329 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
330 struct btrfs_bio *bbio = NULL;
331 struct btrfs_device *dev;
332 u32 blocksize;
333 u64 length;
334 int nzones = 0;
335 int i;
336 unsigned long index = logical >> PAGE_CACHE_SHIFT;
337
338again:
339 spin_lock(&fs_info->reada_lock);
340 re = radix_tree_lookup(&fs_info->reada_tree, index);
341 if (re)
342 kref_get(&re->refcnt);
343 spin_unlock(&fs_info->reada_lock);
344
345 if (re || looped)
346 return re;
347
348 re = kzalloc(sizeof(*re), GFP_NOFS);
349 if (!re)
350 return NULL;
351
352 blocksize = btrfs_level_size(root, level);
353 re->logical = logical;
354 re->blocksize = blocksize;
355 re->top = *top;
356 INIT_LIST_HEAD(&re->extctl);
357 spin_lock_init(&re->lock);
358 kref_init(&re->refcnt);
359
360 /*
361 * map block
362 */
363 length = blocksize;
364 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0);
365 if (ret || !bbio || length < blocksize)
366 goto error;
367
368 if (bbio->num_stripes > MAX_MIRRORS) {
369 printk(KERN_ERR "btrfs readahead: more than %d copies not "
370 "supported", MAX_MIRRORS);
371 goto error;
372 }
373
374 for (nzones = 0; nzones < bbio->num_stripes; ++nzones) {
375 struct reada_zone *zone;
376
377 dev = bbio->stripes[nzones].dev;
378 zone = reada_find_zone(fs_info, dev, logical, bbio);
379 if (!zone)
380 break;
381
382 re->zones[nzones] = zone;
383 spin_lock(&zone->lock);
384 if (!zone->elems)
385 kref_get(&zone->refcnt);
386 ++zone->elems;
387 spin_unlock(&zone->lock);
388 spin_lock(&fs_info->reada_lock);
389 kref_put(&zone->refcnt, reada_zone_release);
390 spin_unlock(&fs_info->reada_lock);
391 }
392 re->nzones = nzones;
393 if (nzones == 0) {
394 /* not a single zone found, error and out */
395 goto error;
396 }
397
398 /* insert extent in reada_tree + all per-device trees, all or nothing */
399 spin_lock(&fs_info->reada_lock);
400 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
401 if (ret) {
402 spin_unlock(&fs_info->reada_lock);
403 if (ret != -ENOMEM) {
404 /* someone inserted the extent in the meantime */
405 looped = 1;
406 }
407 goto error;
408 }
409 for (i = 0; i < nzones; ++i) {
410 dev = bbio->stripes[i].dev;
411 ret = radix_tree_insert(&dev->reada_extents, index, re);
412 if (ret) {
413 while (--i >= 0) {
414 dev = bbio->stripes[i].dev;
415 BUG_ON(dev == NULL);
416 radix_tree_delete(&dev->reada_extents, index);
417 }
418 BUG_ON(fs_info == NULL);
419 radix_tree_delete(&fs_info->reada_tree, index);
420 spin_unlock(&fs_info->reada_lock);
421 goto error;
422 }
423 }
424 spin_unlock(&fs_info->reada_lock);
425
426 kfree(bbio);
427 return re;
428
429error:
430 while (nzones) {
431 struct reada_zone *zone;
432
433 --nzones;
434 zone = re->zones[nzones];
435 kref_get(&zone->refcnt);
436 spin_lock(&zone->lock);
437 --zone->elems;
438 if (zone->elems == 0) {
439 /*
440 * no fs_info->reada_lock needed, as this can't be
441 * the last ref
442 */
443 kref_put(&zone->refcnt, reada_zone_release);
444 }
445 spin_unlock(&zone->lock);
446
447 spin_lock(&fs_info->reada_lock);
448 kref_put(&zone->refcnt, reada_zone_release);
449 spin_unlock(&fs_info->reada_lock);
450 }
451 kfree(bbio);
452 kfree(re);
453 if (looped)
454 goto again;
455 return NULL;
456}
457
458static void reada_kref_dummy(struct kref *kr)
459{
460}
461
462static void reada_extent_put(struct btrfs_fs_info *fs_info,
463 struct reada_extent *re)
464{
465 int i;
466 unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
467
468 spin_lock(&fs_info->reada_lock);
469 if (!kref_put(&re->refcnt, reada_kref_dummy)) {
470 spin_unlock(&fs_info->reada_lock);
471 return;
472 }
473
474 radix_tree_delete(&fs_info->reada_tree, index);
475 for (i = 0; i < re->nzones; ++i) {
476 struct reada_zone *zone = re->zones[i];
477
478 radix_tree_delete(&zone->device->reada_extents, index);
479 }
480
481 spin_unlock(&fs_info->reada_lock);
482
483 for (i = 0; i < re->nzones; ++i) {
484 struct reada_zone *zone = re->zones[i];
485
486 kref_get(&zone->refcnt);
487 spin_lock(&zone->lock);
488 --zone->elems;
489 if (zone->elems == 0) {
490 /* no fs_info->reada_lock needed, as this can't be
491 * the last ref */
492 kref_put(&zone->refcnt, reada_zone_release);
493 }
494 spin_unlock(&zone->lock);
495
496 spin_lock(&fs_info->reada_lock);
497 kref_put(&zone->refcnt, reada_zone_release);
498 spin_unlock(&fs_info->reada_lock);
499 }
500 if (re->scheduled_for)
501 atomic_dec(&re->scheduled_for->reada_in_flight);
502
503 kfree(re);
504}
505
506static void reada_zone_release(struct kref *kref)
507{
508 struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
509
510 radix_tree_delete(&zone->device->reada_zones,
511 zone->end >> PAGE_CACHE_SHIFT);
512
513 kfree(zone);
514}
515
516static void reada_control_release(struct kref *kref)
517{
518 struct reada_control *rc = container_of(kref, struct reada_control,
519 refcnt);
520
521 kfree(rc);
522}
523
524static int reada_add_block(struct reada_control *rc, u64 logical,
525 struct btrfs_key *top, int level, u64 generation)
526{
527 struct btrfs_root *root = rc->root;
528 struct reada_extent *re;
529 struct reada_extctl *rec;
530
531 re = reada_find_extent(root, logical, top, level); /* takes one ref */
532 if (!re)
533 return -1;
534
535 rec = kzalloc(sizeof(*rec), GFP_NOFS);
536 if (!rec) {
537 reada_extent_put(root->fs_info, re);
538 return -1;
539 }
540
541 rec->rc = rc;
542 rec->generation = generation;
543 atomic_inc(&rc->elems);
544
545 spin_lock(&re->lock);
546 list_add_tail(&rec->list, &re->extctl);
547 spin_unlock(&re->lock);
548
549 /* leave the ref on the extent */
550
551 return 0;
552}
553
554/*
555 * called with fs_info->reada_lock held
556 */
557static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
558{
559 int i;
560 unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
561
562 for (i = 0; i < zone->ndevs; ++i) {
563 struct reada_zone *peer;
564 peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
565 if (peer && peer->device != zone->device)
566 peer->locked = lock;
567 }
568}
569
570/*
571 * called with fs_info->reada_lock held
572 */
573static int reada_pick_zone(struct btrfs_device *dev)
574{
575 struct reada_zone *top_zone = NULL;
576 struct reada_zone *top_locked_zone = NULL;
577 u64 top_elems = 0;
578 u64 top_locked_elems = 0;
579 unsigned long index = 0;
580 int ret;
581
582 if (dev->reada_curr_zone) {
583 reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
584 kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
585 dev->reada_curr_zone = NULL;
586 }
587 /* pick the zone with the most elements */
588 while (1) {
589 struct reada_zone *zone;
590
591 ret = radix_tree_gang_lookup(&dev->reada_zones,
592 (void **)&zone, index, 1);
593 if (ret == 0)
594 break;
595 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
596 if (zone->locked) {
597 if (zone->elems > top_locked_elems) {
598 top_locked_elems = zone->elems;
599 top_locked_zone = zone;
600 }
601 } else {
602 if (zone->elems > top_elems) {
603 top_elems = zone->elems;
604 top_zone = zone;
605 }
606 }
607 }
608 if (top_zone)
609 dev->reada_curr_zone = top_zone;
610 else if (top_locked_zone)
611 dev->reada_curr_zone = top_locked_zone;
612 else
613 return 0;
614
615 dev->reada_next = dev->reada_curr_zone->start;
616 kref_get(&dev->reada_curr_zone->refcnt);
617 reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
618
619 return 1;
620}
621
622static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
623 struct btrfs_device *dev)
624{
625 struct reada_extent *re = NULL;
626 int mirror_num = 0;
627 struct extent_buffer *eb = NULL;
628 u64 logical;
629 u32 blocksize;
630 int ret;
631 int i;
632 int need_kick = 0;
633
634 spin_lock(&fs_info->reada_lock);
635 if (dev->reada_curr_zone == NULL) {
636 ret = reada_pick_zone(dev);
637 if (!ret) {
638 spin_unlock(&fs_info->reada_lock);
639 return 0;
640 }
641 }
642 /*
643 * FIXME currently we issue the reads one extent at a time. If we have
644 * a contiguous block of extents, we could also coagulate them or use
645 * plugging to speed things up
646 */
647 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
648 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
649 if (ret == 0 || re->logical >= dev->reada_curr_zone->end) {
650 ret = reada_pick_zone(dev);
651 if (!ret) {
652 spin_unlock(&fs_info->reada_lock);
653 return 0;
654 }
655 re = NULL;
656 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
657 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
658 }
659 if (ret == 0) {
660 spin_unlock(&fs_info->reada_lock);
661 return 0;
662 }
663 dev->reada_next = re->logical + re->blocksize;
664 kref_get(&re->refcnt);
665
666 spin_unlock(&fs_info->reada_lock);
667
668 /*
669 * find mirror num
670 */
671 for (i = 0; i < re->nzones; ++i) {
672 if (re->zones[i]->device == dev) {
673 mirror_num = i + 1;
674 break;
675 }
676 }
677 logical = re->logical;
678 blocksize = re->blocksize;
679
680 spin_lock(&re->lock);
681 if (re->scheduled_for == NULL) {
682 re->scheduled_for = dev;
683 need_kick = 1;
684 }
685 spin_unlock(&re->lock);
686
687 reada_extent_put(fs_info, re);
688
689 if (!need_kick)
690 return 0;
691
692 atomic_inc(&dev->reada_in_flight);
693 ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
694 mirror_num, &eb);
695 if (ret)
696 __readahead_hook(fs_info->extent_root, NULL, logical, ret);
697 else if (eb)
698 __readahead_hook(fs_info->extent_root, eb, eb->start, ret);
699
700 if (eb)
701 free_extent_buffer(eb);
702
703 return 1;
704
705}
706
707static void reada_start_machine_worker(struct btrfs_work *work)
708{
709 struct reada_machine_work *rmw;
710 struct btrfs_fs_info *fs_info;
711
712 rmw = container_of(work, struct reada_machine_work, work);
713 fs_info = rmw->fs_info;
714
715 kfree(rmw);
716
717 __reada_start_machine(fs_info);
718}
719
720static void __reada_start_machine(struct btrfs_fs_info *fs_info)
721{
722 struct btrfs_device *device;
723 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
724 u64 enqueued;
725 u64 total = 0;
726 int i;
727
728 do {
729 enqueued = 0;
730 list_for_each_entry(device, &fs_devices->devices, dev_list) {
731 if (atomic_read(&device->reada_in_flight) <
732 MAX_IN_FLIGHT)
733 enqueued += reada_start_machine_dev(fs_info,
734 device);
735 }
736 total += enqueued;
737 } while (enqueued && total < 10000);
738
739 if (enqueued == 0)
740 return;
741
742 /*
743 * If everything is already in the cache, this is effectively single
744 * threaded. To a) not hold the caller for too long and b) to utilize
745 * more cores, we broke the loop above after 10000 iterations and now
746 * enqueue to workers to finish it. This will distribute the load to
747 * the cores.
748 */
749 for (i = 0; i < 2; ++i)
750 reada_start_machine(fs_info);
751}
752
753static void reada_start_machine(struct btrfs_fs_info *fs_info)
754{
755 struct reada_machine_work *rmw;
756
757 rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
758 if (!rmw) {
759 /* FIXME we cannot handle this properly right now */
760 BUG();
761 }
762 rmw->work.func = reada_start_machine_worker;
763 rmw->fs_info = fs_info;
764
765 btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
766}
767
768#ifdef DEBUG
769static void dump_devs(struct btrfs_fs_info *fs_info, int all)
770{
771 struct btrfs_device *device;
772 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
773 unsigned long index;
774 int ret;
775 int i;
776 int j;
777 int cnt;
778
779 spin_lock(&fs_info->reada_lock);
780 list_for_each_entry(device, &fs_devices->devices, dev_list) {
781 printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid,
782 atomic_read(&device->reada_in_flight));
783 index = 0;
784 while (1) {
785 struct reada_zone *zone;
786 ret = radix_tree_gang_lookup(&device->reada_zones,
787 (void **)&zone, index, 1);
788 if (ret == 0)
789 break;
790 printk(KERN_DEBUG " zone %llu-%llu elems %llu locked "
791 "%d devs", zone->start, zone->end, zone->elems,
792 zone->locked);
793 for (j = 0; j < zone->ndevs; ++j) {
794 printk(KERN_CONT " %lld",
795 zone->devs[j]->devid);
796 }
797 if (device->reada_curr_zone == zone)
798 printk(KERN_CONT " curr off %llu",
799 device->reada_next - zone->start);
800 printk(KERN_CONT "\n");
801 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
802 }
803 cnt = 0;
804 index = 0;
805 while (all) {
806 struct reada_extent *re = NULL;
807
808 ret = radix_tree_gang_lookup(&device->reada_extents,
809 (void **)&re, index, 1);
810 if (ret == 0)
811 break;
812 printk(KERN_DEBUG
813 " re: logical %llu size %u empty %d for %lld",
814 re->logical, re->blocksize,
815 list_empty(&re->extctl), re->scheduled_for ?
816 re->scheduled_for->devid : -1);
817
818 for (i = 0; i < re->nzones; ++i) {
819 printk(KERN_CONT " zone %llu-%llu devs",
820 re->zones[i]->start,
821 re->zones[i]->end);
822 for (j = 0; j < re->zones[i]->ndevs; ++j) {
823 printk(KERN_CONT " %lld",
824 re->zones[i]->devs[j]->devid);
825 }
826 }
827 printk(KERN_CONT "\n");
828 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
829 if (++cnt > 15)
830 break;
831 }
832 }
833
834 index = 0;
835 cnt = 0;
836 while (all) {
837 struct reada_extent *re = NULL;
838
839 ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
840 index, 1);
841 if (ret == 0)
842 break;
843 if (!re->scheduled_for) {
844 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
845 continue;
846 }
847 printk(KERN_DEBUG
848 "re: logical %llu size %u list empty %d for %lld",
849 re->logical, re->blocksize, list_empty(&re->extctl),
850 re->scheduled_for ? re->scheduled_for->devid : -1);
851 for (i = 0; i < re->nzones; ++i) {
852 printk(KERN_CONT " zone %llu-%llu devs",
853 re->zones[i]->start,
854 re->zones[i]->end);
855 for (i = 0; i < re->nzones; ++i) {
856 printk(KERN_CONT " zone %llu-%llu devs",
857 re->zones[i]->start,
858 re->zones[i]->end);
859 for (j = 0; j < re->zones[i]->ndevs; ++j) {
860 printk(KERN_CONT " %lld",
861 re->zones[i]->devs[j]->devid);
862 }
863 }
864 }
865 printk(KERN_CONT "\n");
866 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
867 }
868 spin_unlock(&fs_info->reada_lock);
869}
870#endif
871
872/*
873 * interface
874 */
875struct reada_control *btrfs_reada_add(struct btrfs_root *root,
876 struct btrfs_key *key_start, struct btrfs_key *key_end)
877{
878 struct reada_control *rc;
879 u64 start;
880 u64 generation;
881 int level;
882 struct extent_buffer *node;
883 static struct btrfs_key max_key = {
884 .objectid = (u64)-1,
885 .type = (u8)-1,
886 .offset = (u64)-1
887 };
888
889 rc = kzalloc(sizeof(*rc), GFP_NOFS);
890 if (!rc)
891 return ERR_PTR(-ENOMEM);
892
893 rc->root = root;
894 rc->key_start = *key_start;
895 rc->key_end = *key_end;
896 atomic_set(&rc->elems, 0);
897 init_waitqueue_head(&rc->wait);
898 kref_init(&rc->refcnt);
899 kref_get(&rc->refcnt); /* one ref for having elements */
900
901 node = btrfs_root_node(root);
902 start = node->start;
903 level = btrfs_header_level(node);
904 generation = btrfs_header_generation(node);
905 free_extent_buffer(node);
906
907 reada_add_block(rc, start, &max_key, level, generation);
908
909 reada_start_machine(root->fs_info);
910
911 return rc;
912}
913
914#ifdef DEBUG
915int btrfs_reada_wait(void *handle)
916{
917 struct reada_control *rc = handle;
918
919 while (atomic_read(&rc->elems)) {
920 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
921 5 * HZ);
922 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
923 }
924
925 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
926
927 kref_put(&rc->refcnt, reada_control_release);
928
929 return 0;
930}
931#else
932int btrfs_reada_wait(void *handle)
933{
934 struct reada_control *rc = handle;
935
936 while (atomic_read(&rc->elems)) {
937 wait_event(rc->wait, atomic_read(&rc->elems) == 0);
938 }
939
940 kref_put(&rc->refcnt, reada_control_release);
941
942 return 0;
943}
944#endif
945
946void btrfs_reada_detach(void *handle)
947{
948 struct reada_control *rc = handle;
949
950 kref_put(&rc->refcnt, reada_control_release);
951}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 59bb1764273..dff29d5e151 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
1174 list_add_tail(&new_edge->list[UPPER], 1174 list_add_tail(&new_edge->list[UPPER],
1175 &new_node->lower); 1175 &new_node->lower);
1176 } 1176 }
1177 } else {
1178 list_add_tail(&new_node->lower, &cache->leaves);
1177 } 1179 }
1178 1180
1179 rb_node = tree_insert(&cache->rb_root, new_node->bytenr, 1181 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
@@ -2041,8 +2043,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2041 BUG_ON(IS_ERR(trans)); 2043 BUG_ON(IS_ERR(trans));
2042 trans->block_rsv = rc->block_rsv; 2044 trans->block_rsv = rc->block_rsv;
2043 2045
2044 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, 2046 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
2045 min_reserved, 0);
2046 if (ret) { 2047 if (ret) {
2047 BUG_ON(ret != -EAGAIN); 2048 BUG_ON(ret != -EAGAIN);
2048 ret = btrfs_commit_transaction(trans, root); 2049 ret = btrfs_commit_transaction(trans, root);
@@ -2152,8 +2153,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2152again: 2153again:
2153 if (!err) { 2154 if (!err) {
2154 num_bytes = rc->merging_rsv_size; 2155 num_bytes = rc->merging_rsv_size;
2155 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, 2156 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2156 num_bytes);
2157 if (ret) 2157 if (ret)
2158 err = ret; 2158 err = ret;
2159 } 2159 }
@@ -2427,7 +2427,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2427 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2427 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2428 2428
2429 trans->block_rsv = rc->block_rsv; 2429 trans->block_rsv = rc->block_rsv;
2430 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); 2430 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2431 if (ret) { 2431 if (ret) {
2432 if (ret == -EAGAIN) 2432 if (ret == -EAGAIN)
2433 rc->commit_transaction = 1; 2433 rc->commit_transaction = 1;
@@ -2922,6 +2922,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2922 unsigned long last_index; 2922 unsigned long last_index;
2923 struct page *page; 2923 struct page *page;
2924 struct file_ra_state *ra; 2924 struct file_ra_state *ra;
2925 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
2925 int nr = 0; 2926 int nr = 0;
2926 int ret = 0; 2927 int ret = 0;
2927 2928
@@ -2956,7 +2957,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2956 ra, NULL, index, 2957 ra, NULL, index,
2957 last_index + 1 - index); 2958 last_index + 1 - index);
2958 page = find_or_create_page(inode->i_mapping, index, 2959 page = find_or_create_page(inode->i_mapping, index,
2959 GFP_NOFS); 2960 mask);
2960 if (!page) { 2961 if (!page) {
2961 btrfs_delalloc_release_metadata(inode, 2962 btrfs_delalloc_release_metadata(inode,
2962 PAGE_CACHE_SIZE); 2963 PAGE_CACHE_SIZE);
@@ -3323,8 +3324,11 @@ static int find_data_references(struct reloc_control *rc,
3323 } 3324 }
3324 3325
3325 key.objectid = ref_objectid; 3326 key.objectid = ref_objectid;
3326 key.offset = ref_offset;
3327 key.type = BTRFS_EXTENT_DATA_KEY; 3327 key.type = BTRFS_EXTENT_DATA_KEY;
3328 if (ref_offset > ((u64)-1 << 32))
3329 key.offset = 0;
3330 else
3331 key.offset = ref_offset;
3328 3332
3329 path->search_commit_root = 1; 3333 path->search_commit_root = 1;
3330 path->skip_locking = 1; 3334 path->skip_locking = 1;
@@ -3645,14 +3649,11 @@ int prepare_to_relocate(struct reloc_control *rc)
3645 * btrfs_init_reloc_root will use them when there 3649 * btrfs_init_reloc_root will use them when there
3646 * is no reservation in transaction handle. 3650 * is no reservation in transaction handle.
3647 */ 3651 */
3648 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, 3652 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3649 rc->extent_root->nodesize * 256); 3653 rc->extent_root->nodesize * 256);
3650 if (ret) 3654 if (ret)
3651 return ret; 3655 return ret;
3652 3656
3653 rc->block_rsv->refill_used = 1;
3654 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3655
3656 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3657 memset(&rc->cluster, 0, sizeof(rc->cluster));
3657 rc->search_start = rc->block_group->key.objectid; 3658 rc->search_start = rc->block_group->key.objectid;
3658 rc->extents_found = 0; 3659 rc->extents_found = 0;
@@ -3777,8 +3778,7 @@ restart:
3777 } 3778 }
3778 } 3779 }
3779 3780
3780 ret = btrfs_block_rsv_check(trans, rc->extent_root, 3781 ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
3781 rc->block_rsv, 0, 5);
3782 if (ret < 0) { 3782 if (ret < 0) {
3783 if (ret != -EAGAIN) { 3783 if (ret != -EAGAIN) {
3784 err = ret; 3784 err = ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a8d03d5efb5..f4190f22edf 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -17,10 +17,14 @@
17 */ 17 */
18 18
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/ratelimit.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "volumes.h" 22#include "volumes.h"
22#include "disk-io.h" 23#include "disk-io.h"
23#include "ordered-data.h" 24#include "ordered-data.h"
25#include "transaction.h"
26#include "backref.h"
27#include "extent_io.h"
24 28
25/* 29/*
26 * This is only the first step towards a full-features scrub. It reads all 30 * This is only the first step towards a full-features scrub. It reads all
@@ -29,15 +33,12 @@
29 * any can be found. 33 * any can be found.
30 * 34 *
31 * Future enhancements: 35 * Future enhancements:
32 * - To enhance the performance, better read-ahead strategies for the
33 * extent-tree can be employed.
34 * - In case an unrepairable extent is encountered, track which files are 36 * - In case an unrepairable extent is encountered, track which files are
35 * affected and report them 37 * affected and report them
36 * - In case of a read error on files with nodatasum, map the file and read 38 * - In case of a read error on files with nodatasum, map the file and read
37 * the extent to trigger a writeback of the good copy 39 * the extent to trigger a writeback of the good copy
38 * - track and record media errors, throw out bad devices 40 * - track and record media errors, throw out bad devices
39 * - add a mode to also read unallocated space 41 * - add a mode to also read unallocated space
40 * - make the prefetch cancellable
41 */ 42 */
42 43
43struct scrub_bio; 44struct scrub_bio;
@@ -63,7 +64,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix);
63struct scrub_page { 64struct scrub_page {
64 u64 flags; /* extent flags */ 65 u64 flags; /* extent flags */
65 u64 generation; 66 u64 generation;
66 u64 mirror_num; 67 int mirror_num;
67 int have_csum; 68 int have_csum;
68 u8 csum[BTRFS_CSUM_SIZE]; 69 u8 csum[BTRFS_CSUM_SIZE];
69}; 70};
@@ -87,6 +88,7 @@ struct scrub_dev {
87 int first_free; 88 int first_free;
88 int curr; 89 int curr;
89 atomic_t in_flight; 90 atomic_t in_flight;
91 atomic_t fixup_cnt;
90 spinlock_t list_lock; 92 spinlock_t list_lock;
91 wait_queue_head_t list_wait; 93 wait_queue_head_t list_wait;
92 u16 csum_size; 94 u16 csum_size;
@@ -100,6 +102,27 @@ struct scrub_dev {
100 spinlock_t stat_lock; 102 spinlock_t stat_lock;
101}; 103};
102 104
105struct scrub_fixup_nodatasum {
106 struct scrub_dev *sdev;
107 u64 logical;
108 struct btrfs_root *root;
109 struct btrfs_work work;
110 int mirror_num;
111};
112
113struct scrub_warning {
114 struct btrfs_path *path;
115 u64 extent_item_size;
116 char *scratch_buf;
117 char *msg_buf;
118 const char *errstr;
119 sector_t sector;
120 u64 logical;
121 struct btrfs_device *dev;
122 int msg_bufsize;
123 int scratch_bufsize;
124};
125
103static void scrub_free_csums(struct scrub_dev *sdev) 126static void scrub_free_csums(struct scrub_dev *sdev)
104{ 127{
105 while (!list_empty(&sdev->csum_list)) { 128 while (!list_empty(&sdev->csum_list)) {
@@ -175,14 +198,15 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
175 198
176 if (i != SCRUB_BIOS_PER_DEV-1) 199 if (i != SCRUB_BIOS_PER_DEV-1)
177 sdev->bios[i]->next_free = i + 1; 200 sdev->bios[i]->next_free = i + 1;
178 else 201 else
179 sdev->bios[i]->next_free = -1; 202 sdev->bios[i]->next_free = -1;
180 } 203 }
181 sdev->first_free = 0; 204 sdev->first_free = 0;
182 sdev->curr = -1; 205 sdev->curr = -1;
183 atomic_set(&sdev->in_flight, 0); 206 atomic_set(&sdev->in_flight, 0);
207 atomic_set(&sdev->fixup_cnt, 0);
184 atomic_set(&sdev->cancel_req, 0); 208 atomic_set(&sdev->cancel_req, 0);
185 sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); 209 sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
186 INIT_LIST_HEAD(&sdev->csum_list); 210 INIT_LIST_HEAD(&sdev->csum_list);
187 211
188 spin_lock_init(&sdev->list_lock); 212 spin_lock_init(&sdev->list_lock);
@@ -195,24 +219,361 @@ nomem:
195 return ERR_PTR(-ENOMEM); 219 return ERR_PTR(-ENOMEM);
196} 220}
197 221
222static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
223{
224 u64 isize;
225 u32 nlink;
226 int ret;
227 int i;
228 struct extent_buffer *eb;
229 struct btrfs_inode_item *inode_item;
230 struct scrub_warning *swarn = ctx;
231 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
232 struct inode_fs_paths *ipath = NULL;
233 struct btrfs_root *local_root;
234 struct btrfs_key root_key;
235
236 root_key.objectid = root;
237 root_key.type = BTRFS_ROOT_ITEM_KEY;
238 root_key.offset = (u64)-1;
239 local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
240 if (IS_ERR(local_root)) {
241 ret = PTR_ERR(local_root);
242 goto err;
243 }
244
245 ret = inode_item_info(inum, 0, local_root, swarn->path);
246 if (ret) {
247 btrfs_release_path(swarn->path);
248 goto err;
249 }
250
251 eb = swarn->path->nodes[0];
252 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
253 struct btrfs_inode_item);
254 isize = btrfs_inode_size(eb, inode_item);
255 nlink = btrfs_inode_nlink(eb, inode_item);
256 btrfs_release_path(swarn->path);
257
258 ipath = init_ipath(4096, local_root, swarn->path);
259 ret = paths_from_inode(inum, ipath);
260
261 if (ret < 0)
262 goto err;
263
264 /*
265 * we deliberately ignore the bit ipath might have been too small to
266 * hold all of the paths here
267 */
268 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
269 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
270 "%s, sector %llu, root %llu, inode %llu, offset %llu, "
271 "length %llu, links %u (path: %s)\n", swarn->errstr,
272 swarn->logical, swarn->dev->name,
273 (unsigned long long)swarn->sector, root, inum, offset,
274 min(isize - offset, (u64)PAGE_SIZE), nlink,
275 (char *)ipath->fspath->val[i]);
276
277 free_ipath(ipath);
278 return 0;
279
280err:
281 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
282 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
283 "resolving failed with ret=%d\n", swarn->errstr,
284 swarn->logical, swarn->dev->name,
285 (unsigned long long)swarn->sector, root, inum, offset, ret);
286
287 free_ipath(ipath);
288 return 0;
289}
290
291static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
292 int ix)
293{
294 struct btrfs_device *dev = sbio->sdev->dev;
295 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
296 struct btrfs_path *path;
297 struct btrfs_key found_key;
298 struct extent_buffer *eb;
299 struct btrfs_extent_item *ei;
300 struct scrub_warning swarn;
301 u32 item_size;
302 int ret;
303 u64 ref_root;
304 u8 ref_level;
305 unsigned long ptr = 0;
306 const int bufsize = 4096;
307 u64 extent_offset;
308
309 path = btrfs_alloc_path();
310
311 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
312 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
313 swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
314 swarn.logical = sbio->logical + ix * PAGE_SIZE;
315 swarn.errstr = errstr;
316 swarn.dev = dev;
317 swarn.msg_bufsize = bufsize;
318 swarn.scratch_bufsize = bufsize;
319
320 if (!path || !swarn.scratch_buf || !swarn.msg_buf)
321 goto out;
322
323 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
324 if (ret < 0)
325 goto out;
326
327 extent_offset = swarn.logical - found_key.objectid;
328 swarn.extent_item_size = found_key.offset;
329
330 eb = path->nodes[0];
331 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
332 item_size = btrfs_item_size_nr(eb, path->slots[0]);
333
334 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
335 do {
336 ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
337 &ref_root, &ref_level);
338 printk(KERN_WARNING "%s at logical %llu on dev %s, "
339 "sector %llu: metadata %s (level %d) in tree "
340 "%llu\n", errstr, swarn.logical, dev->name,
341 (unsigned long long)swarn.sector,
342 ref_level ? "node" : "leaf",
343 ret < 0 ? -1 : ref_level,
344 ret < 0 ? -1 : ref_root);
345 } while (ret != 1);
346 } else {
347 swarn.path = path;
348 iterate_extent_inodes(fs_info, path, found_key.objectid,
349 extent_offset,
350 scrub_print_warning_inode, &swarn);
351 }
352
353out:
354 btrfs_free_path(path);
355 kfree(swarn.scratch_buf);
356 kfree(swarn.msg_buf);
357}
358
359static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
360{
361 struct page *page = NULL;
362 unsigned long index;
363 struct scrub_fixup_nodatasum *fixup = ctx;
364 int ret;
365 int corrected = 0;
366 struct btrfs_key key;
367 struct inode *inode = NULL;
368 u64 end = offset + PAGE_SIZE - 1;
369 struct btrfs_root *local_root;
370
371 key.objectid = root;
372 key.type = BTRFS_ROOT_ITEM_KEY;
373 key.offset = (u64)-1;
374 local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
375 if (IS_ERR(local_root))
376 return PTR_ERR(local_root);
377
378 key.type = BTRFS_INODE_ITEM_KEY;
379 key.objectid = inum;
380 key.offset = 0;
381 inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
382 if (IS_ERR(inode))
383 return PTR_ERR(inode);
384
385 index = offset >> PAGE_CACHE_SHIFT;
386
387 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
388 if (!page) {
389 ret = -ENOMEM;
390 goto out;
391 }
392
393 if (PageUptodate(page)) {
394 struct btrfs_mapping_tree *map_tree;
395 if (PageDirty(page)) {
396 /*
397 * we need to write the data to the defect sector. the
398 * data that was in that sector is not in memory,
399 * because the page was modified. we must not write the
400 * modified page to that sector.
401 *
402 * TODO: what could be done here: wait for the delalloc
403 * runner to write out that page (might involve
404 * COW) and see whether the sector is still
405 * referenced afterwards.
406 *
407 * For the meantime, we'll treat this error
408 * incorrectable, although there is a chance that a
409 * later scrub will find the bad sector again and that
410 * there's no dirty page in memory, then.
411 */
412 ret = -EIO;
413 goto out;
414 }
415 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
416 ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
417 fixup->logical, page,
418 fixup->mirror_num);
419 unlock_page(page);
420 corrected = !ret;
421 } else {
422 /*
423 * we need to get good data first. the general readpage path
424 * will call repair_io_failure for us, we just have to make
425 * sure we read the bad mirror.
426 */
427 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
428 EXTENT_DAMAGED, GFP_NOFS);
429 if (ret) {
430 /* set_extent_bits should give proper error */
431 WARN_ON(ret > 0);
432 if (ret > 0)
433 ret = -EFAULT;
434 goto out;
435 }
436
437 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
438 btrfs_get_extent,
439 fixup->mirror_num);
440 wait_on_page_locked(page);
441
442 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
443 end, EXTENT_DAMAGED, 0, NULL);
444 if (!corrected)
445 clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
446 EXTENT_DAMAGED, GFP_NOFS);
447 }
448
449out:
450 if (page)
451 put_page(page);
452 if (inode)
453 iput(inode);
454
455 if (ret < 0)
456 return ret;
457
458 if (ret == 0 && corrected) {
459 /*
460 * we only need to call readpage for one of the inodes belonging
461 * to this extent. so make iterate_extent_inodes stop
462 */
463 return 1;
464 }
465
466 return -EIO;
467}
468
469static void scrub_fixup_nodatasum(struct btrfs_work *work)
470{
471 int ret;
472 struct scrub_fixup_nodatasum *fixup;
473 struct scrub_dev *sdev;
474 struct btrfs_trans_handle *trans = NULL;
475 struct btrfs_fs_info *fs_info;
476 struct btrfs_path *path;
477 int uncorrectable = 0;
478
479 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
480 sdev = fixup->sdev;
481 fs_info = fixup->root->fs_info;
482
483 path = btrfs_alloc_path();
484 if (!path) {
485 spin_lock(&sdev->stat_lock);
486 ++sdev->stat.malloc_errors;
487 spin_unlock(&sdev->stat_lock);
488 uncorrectable = 1;
489 goto out;
490 }
491
492 trans = btrfs_join_transaction(fixup->root);
493 if (IS_ERR(trans)) {
494 uncorrectable = 1;
495 goto out;
496 }
497
498 /*
499 * the idea is to trigger a regular read through the standard path. we
500 * read a page from the (failed) logical address by specifying the
501 * corresponding copynum of the failed sector. thus, that readpage is
502 * expected to fail.
503 * that is the point where on-the-fly error correction will kick in
504 * (once it's finished) and rewrite the failed sector if a good copy
505 * can be found.
506 */
507 ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
508 path, scrub_fixup_readpage,
509 fixup);
510 if (ret < 0) {
511 uncorrectable = 1;
512 goto out;
513 }
514 WARN_ON(ret != 1);
515
516 spin_lock(&sdev->stat_lock);
517 ++sdev->stat.corrected_errors;
518 spin_unlock(&sdev->stat_lock);
519
520out:
521 if (trans && !IS_ERR(trans))
522 btrfs_end_transaction(trans, fixup->root);
523 if (uncorrectable) {
524 spin_lock(&sdev->stat_lock);
525 ++sdev->stat.uncorrectable_errors;
526 spin_unlock(&sdev->stat_lock);
527 printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
528 "(nodatasum) error at logical %llu\n",
529 fixup->logical);
530 }
531
532 btrfs_free_path(path);
533 kfree(fixup);
534
535 /* see caller why we're pretending to be paused in the scrub counters */
536 mutex_lock(&fs_info->scrub_lock);
537 atomic_dec(&fs_info->scrubs_running);
538 atomic_dec(&fs_info->scrubs_paused);
539 mutex_unlock(&fs_info->scrub_lock);
540 atomic_dec(&sdev->fixup_cnt);
541 wake_up(&fs_info->scrub_pause_wait);
542 wake_up(&sdev->list_wait);
543}
544
198/* 545/*
199 * scrub_recheck_error gets called when either verification of the page 546 * scrub_recheck_error gets called when either verification of the page
200 * failed or the bio failed to read, e.g. with EIO. In the latter case, 547 * failed or the bio failed to read, e.g. with EIO. In the latter case,
201 * recheck_error gets called for every page in the bio, even though only 548 * recheck_error gets called for every page in the bio, even though only
202 * one may be bad 549 * one may be bad
203 */ 550 */
204static void scrub_recheck_error(struct scrub_bio *sbio, int ix) 551static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
205{ 552{
553 struct scrub_dev *sdev = sbio->sdev;
554 u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
555 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
556 DEFAULT_RATELIMIT_BURST);
557
206 if (sbio->err) { 558 if (sbio->err) {
207 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, 559 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
208 (sbio->physical + ix * PAGE_SIZE) >> 9,
209 sbio->bio->bi_io_vec[ix].bv_page) == 0) { 560 sbio->bio->bi_io_vec[ix].bv_page) == 0) {
210 if (scrub_fixup_check(sbio, ix) == 0) 561 if (scrub_fixup_check(sbio, ix) == 0)
211 return; 562 return 0;
212 } 563 }
564 if (__ratelimit(&_rs))
565 scrub_print_warning("i/o error", sbio, ix);
566 } else {
567 if (__ratelimit(&_rs))
568 scrub_print_warning("checksum error", sbio, ix);
213 } 569 }
214 570
571 spin_lock(&sdev->stat_lock);
572 ++sdev->stat.read_errors;
573 spin_unlock(&sdev->stat_lock);
574
215 scrub_fixup(sbio, ix); 575 scrub_fixup(sbio, ix);
576 return 1;
216} 577}
217 578
218static int scrub_fixup_check(struct scrub_bio *sbio, int ix) 579static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
@@ -250,7 +611,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
250 struct scrub_dev *sdev = sbio->sdev; 611 struct scrub_dev *sdev = sbio->sdev;
251 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 612 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
252 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 613 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
253 struct btrfs_multi_bio *multi = NULL; 614 struct btrfs_bio *bbio = NULL;
615 struct scrub_fixup_nodatasum *fixup;
254 u64 logical = sbio->logical + ix * PAGE_SIZE; 616 u64 logical = sbio->logical + ix * PAGE_SIZE;
255 u64 length; 617 u64 length;
256 int i; 618 int i;
@@ -259,38 +621,57 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
259 621
260 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && 622 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
261 (sbio->spag[ix].have_csum == 0)) { 623 (sbio->spag[ix].have_csum == 0)) {
624 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
625 if (!fixup)
626 goto uncorrectable;
627 fixup->sdev = sdev;
628 fixup->logical = logical;
629 fixup->root = fs_info->extent_root;
630 fixup->mirror_num = sbio->spag[ix].mirror_num;
262 /* 631 /*
263 * nodatasum, don't try to fix anything 632 * increment scrubs_running to prevent cancel requests from
264 * FIXME: we can do better, open the inode and trigger a 633 * completing as long as a fixup worker is running. we must also
265 * writeback 634 * increment scrubs_paused to prevent deadlocking on pause
635 * requests used for transactions commits (as the worker uses a
636 * transaction context). it is safe to regard the fixup worker
637 * as paused for all matters practical. effectively, we only
638 * avoid cancellation requests from completing.
266 */ 639 */
267 goto uncorrectable; 640 mutex_lock(&fs_info->scrub_lock);
641 atomic_inc(&fs_info->scrubs_running);
642 atomic_inc(&fs_info->scrubs_paused);
643 mutex_unlock(&fs_info->scrub_lock);
644 atomic_inc(&sdev->fixup_cnt);
645 fixup->work.func = scrub_fixup_nodatasum;
646 btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
647 return;
268 } 648 }
269 649
270 length = PAGE_SIZE; 650 length = PAGE_SIZE;
271 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, 651 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
272 &multi, 0); 652 &bbio, 0);
273 if (ret || !multi || length < PAGE_SIZE) { 653 if (ret || !bbio || length < PAGE_SIZE) {
274 printk(KERN_ERR 654 printk(KERN_ERR
275 "scrub_fixup: btrfs_map_block failed us for %llu\n", 655 "scrub_fixup: btrfs_map_block failed us for %llu\n",
276 (unsigned long long)logical); 656 (unsigned long long)logical);
277 WARN_ON(1); 657 WARN_ON(1);
658 kfree(bbio);
278 return; 659 return;
279 } 660 }
280 661
281 if (multi->num_stripes == 1) 662 if (bbio->num_stripes == 1)
282 /* there aren't any replicas */ 663 /* there aren't any replicas */
283 goto uncorrectable; 664 goto uncorrectable;
284 665
285 /* 666 /*
286 * first find a good copy 667 * first find a good copy
287 */ 668 */
288 for (i = 0; i < multi->num_stripes; ++i) { 669 for (i = 0; i < bbio->num_stripes; ++i) {
289 if (i == sbio->spag[ix].mirror_num) 670 if (i + 1 == sbio->spag[ix].mirror_num)
290 continue; 671 continue;
291 672
292 if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, 673 if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
293 multi->stripes[i].physical >> 9, 674 bbio->stripes[i].physical >> 9,
294 sbio->bio->bi_io_vec[ix].bv_page)) { 675 sbio->bio->bi_io_vec[ix].bv_page)) {
295 /* I/O-error, this is not a good copy */ 676 /* I/O-error, this is not a good copy */
296 continue; 677 continue;
@@ -299,7 +680,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
299 if (scrub_fixup_check(sbio, ix) == 0) 680 if (scrub_fixup_check(sbio, ix) == 0)
300 break; 681 break;
301 } 682 }
302 if (i == multi->num_stripes) 683 if (i == bbio->num_stripes)
303 goto uncorrectable; 684 goto uncorrectable;
304 685
305 if (!sdev->readonly) { 686 if (!sdev->readonly) {
@@ -314,25 +695,23 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
314 } 695 }
315 } 696 }
316 697
317 kfree(multi); 698 kfree(bbio);
318 spin_lock(&sdev->stat_lock); 699 spin_lock(&sdev->stat_lock);
319 ++sdev->stat.corrected_errors; 700 ++sdev->stat.corrected_errors;
320 spin_unlock(&sdev->stat_lock); 701 spin_unlock(&sdev->stat_lock);
321 702
322 if (printk_ratelimit()) 703 printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
323 printk(KERN_ERR "btrfs: fixed up at %llu\n", 704 (unsigned long long)logical);
324 (unsigned long long)logical);
325 return; 705 return;
326 706
327uncorrectable: 707uncorrectable:
328 kfree(multi); 708 kfree(bbio);
329 spin_lock(&sdev->stat_lock); 709 spin_lock(&sdev->stat_lock);
330 ++sdev->stat.uncorrectable_errors; 710 ++sdev->stat.uncorrectable_errors;
331 spin_unlock(&sdev->stat_lock); 711 spin_unlock(&sdev->stat_lock);
332 712
333 if (printk_ratelimit()) 713 printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
334 printk(KERN_ERR "btrfs: unable to fixup at %llu\n", 714 "logical %llu\n", (unsigned long long)logical);
335 (unsigned long long)logical);
336} 715}
337 716
338static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, 717static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
@@ -382,8 +761,14 @@ static void scrub_checksum(struct btrfs_work *work)
382 int ret; 761 int ret;
383 762
384 if (sbio->err) { 763 if (sbio->err) {
764 ret = 0;
385 for (i = 0; i < sbio->count; ++i) 765 for (i = 0; i < sbio->count; ++i)
386 scrub_recheck_error(sbio, i); 766 ret |= scrub_recheck_error(sbio, i);
767 if (!ret) {
768 spin_lock(&sdev->stat_lock);
769 ++sdev->stat.unverified_errors;
770 spin_unlock(&sdev->stat_lock);
771 }
387 772
388 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); 773 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
389 sbio->bio->bi_flags |= 1 << BIO_UPTODATE; 774 sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
@@ -396,10 +781,6 @@ static void scrub_checksum(struct btrfs_work *work)
396 bi->bv_offset = 0; 781 bi->bv_offset = 0;
397 bi->bv_len = PAGE_SIZE; 782 bi->bv_len = PAGE_SIZE;
398 } 783 }
399
400 spin_lock(&sdev->stat_lock);
401 ++sdev->stat.read_errors;
402 spin_unlock(&sdev->stat_lock);
403 goto out; 784 goto out;
404 } 785 }
405 for (i = 0; i < sbio->count; ++i) { 786 for (i = 0; i < sbio->count; ++i) {
@@ -420,8 +801,14 @@ static void scrub_checksum(struct btrfs_work *work)
420 WARN_ON(1); 801 WARN_ON(1);
421 } 802 }
422 kunmap_atomic(buffer, KM_USER0); 803 kunmap_atomic(buffer, KM_USER0);
423 if (ret) 804 if (ret) {
424 scrub_recheck_error(sbio, i); 805 ret = scrub_recheck_error(sbio, i);
806 if (!ret) {
807 spin_lock(&sdev->stat_lock);
808 ++sdev->stat.unverified_errors;
809 spin_unlock(&sdev->stat_lock);
810 }
811 }
425 } 812 }
426 813
427out: 814out:
@@ -557,57 +944,27 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
557static int scrub_submit(struct scrub_dev *sdev) 944static int scrub_submit(struct scrub_dev *sdev)
558{ 945{
559 struct scrub_bio *sbio; 946 struct scrub_bio *sbio;
560 struct bio *bio;
561 int i;
562 947
563 if (sdev->curr == -1) 948 if (sdev->curr == -1)
564 return 0; 949 return 0;
565 950
566 sbio = sdev->bios[sdev->curr]; 951 sbio = sdev->bios[sdev->curr];
567
568 bio = bio_alloc(GFP_NOFS, sbio->count);
569 if (!bio)
570 goto nomem;
571
572 bio->bi_private = sbio;
573 bio->bi_end_io = scrub_bio_end_io;
574 bio->bi_bdev = sdev->dev->bdev;
575 bio->bi_sector = sbio->physical >> 9;
576
577 for (i = 0; i < sbio->count; ++i) {
578 struct page *page;
579 int ret;
580
581 page = alloc_page(GFP_NOFS);
582 if (!page)
583 goto nomem;
584
585 ret = bio_add_page(bio, page, PAGE_SIZE, 0);
586 if (!ret) {
587 __free_page(page);
588 goto nomem;
589 }
590 }
591
592 sbio->err = 0; 952 sbio->err = 0;
593 sdev->curr = -1; 953 sdev->curr = -1;
594 atomic_inc(&sdev->in_flight); 954 atomic_inc(&sdev->in_flight);
595 955
596 submit_bio(READ, bio); 956 submit_bio(READ, sbio->bio);
597 957
598 return 0; 958 return 0;
599
600nomem:
601 scrub_free_bio(bio);
602
603 return -ENOMEM;
604} 959}
605 960
606static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, 961static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
607 u64 physical, u64 flags, u64 gen, u64 mirror_num, 962 u64 physical, u64 flags, u64 gen, int mirror_num,
608 u8 *csum, int force) 963 u8 *csum, int force)
609{ 964{
610 struct scrub_bio *sbio; 965 struct scrub_bio *sbio;
966 struct page *page;
967 int ret;
611 968
612again: 969again:
613 /* 970 /*
@@ -628,12 +985,22 @@ again:
628 } 985 }
629 sbio = sdev->bios[sdev->curr]; 986 sbio = sdev->bios[sdev->curr];
630 if (sbio->count == 0) { 987 if (sbio->count == 0) {
988 struct bio *bio;
989
631 sbio->physical = physical; 990 sbio->physical = physical;
632 sbio->logical = logical; 991 sbio->logical = logical;
992 bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
993 if (!bio)
994 return -ENOMEM;
995
996 bio->bi_private = sbio;
997 bio->bi_end_io = scrub_bio_end_io;
998 bio->bi_bdev = sdev->dev->bdev;
999 bio->bi_sector = sbio->physical >> 9;
1000 sbio->err = 0;
1001 sbio->bio = bio;
633 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || 1002 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
634 sbio->logical + sbio->count * PAGE_SIZE != logical) { 1003 sbio->logical + sbio->count * PAGE_SIZE != logical) {
635 int ret;
636
637 ret = scrub_submit(sdev); 1004 ret = scrub_submit(sdev);
638 if (ret) 1005 if (ret)
639 return ret; 1006 return ret;
@@ -643,6 +1010,20 @@ again:
643 sbio->spag[sbio->count].generation = gen; 1010 sbio->spag[sbio->count].generation = gen;
644 sbio->spag[sbio->count].have_csum = 0; 1011 sbio->spag[sbio->count].have_csum = 0;
645 sbio->spag[sbio->count].mirror_num = mirror_num; 1012 sbio->spag[sbio->count].mirror_num = mirror_num;
1013
1014 page = alloc_page(GFP_NOFS);
1015 if (!page)
1016 return -ENOMEM;
1017
1018 ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
1019 if (!ret) {
1020 __free_page(page);
1021 ret = scrub_submit(sdev);
1022 if (ret)
1023 return ret;
1024 goto again;
1025 }
1026
646 if (csum) { 1027 if (csum) {
647 sbio->spag[sbio->count].have_csum = 1; 1028 sbio->spag[sbio->count].have_csum = 1;
648 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); 1029 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
@@ -701,7 +1082,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
701 1082
702/* scrub extent tries to collect up to 64 kB for each bio */ 1083/* scrub extent tries to collect up to 64 kB for each bio */
703static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 1084static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
704 u64 physical, u64 flags, u64 gen, u64 mirror_num) 1085 u64 physical, u64 flags, u64 gen, int mirror_num)
705{ 1086{
706 int ret; 1087 int ret;
707 u8 csum[BTRFS_CSUM_SIZE]; 1088 u8 csum[BTRFS_CSUM_SIZE];
@@ -741,13 +1122,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
741 int slot; 1122 int slot;
742 int i; 1123 int i;
743 u64 nstripes; 1124 u64 nstripes;
744 int start_stripe;
745 struct extent_buffer *l; 1125 struct extent_buffer *l;
746 struct btrfs_key key; 1126 struct btrfs_key key;
747 u64 physical; 1127 u64 physical;
748 u64 logical; 1128 u64 logical;
749 u64 generation; 1129 u64 generation;
750 u64 mirror_num; 1130 int mirror_num;
1131 struct reada_control *reada1;
1132 struct reada_control *reada2;
1133 struct btrfs_key key_start;
1134 struct btrfs_key key_end;
751 1135
752 u64 increment = map->stripe_len; 1136 u64 increment = map->stripe_len;
753 u64 offset; 1137 u64 offset;
@@ -758,102 +1142,88 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
758 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 1142 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
759 offset = map->stripe_len * num; 1143 offset = map->stripe_len * num;
760 increment = map->stripe_len * map->num_stripes; 1144 increment = map->stripe_len * map->num_stripes;
761 mirror_num = 0; 1145 mirror_num = 1;
762 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 1146 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
763 int factor = map->num_stripes / map->sub_stripes; 1147 int factor = map->num_stripes / map->sub_stripes;
764 offset = map->stripe_len * (num / map->sub_stripes); 1148 offset = map->stripe_len * (num / map->sub_stripes);
765 increment = map->stripe_len * factor; 1149 increment = map->stripe_len * factor;
766 mirror_num = num % map->sub_stripes; 1150 mirror_num = num % map->sub_stripes + 1;
767 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 1151 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
768 increment = map->stripe_len; 1152 increment = map->stripe_len;
769 mirror_num = num % map->num_stripes; 1153 mirror_num = num % map->num_stripes + 1;
770 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 1154 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
771 increment = map->stripe_len; 1155 increment = map->stripe_len;
772 mirror_num = num % map->num_stripes; 1156 mirror_num = num % map->num_stripes + 1;
773 } else { 1157 } else {
774 increment = map->stripe_len; 1158 increment = map->stripe_len;
775 mirror_num = 0; 1159 mirror_num = 1;
776 } 1160 }
777 1161
778 path = btrfs_alloc_path(); 1162 path = btrfs_alloc_path();
779 if (!path) 1163 if (!path)
780 return -ENOMEM; 1164 return -ENOMEM;
781 1165
782 path->reada = 2;
783 path->search_commit_root = 1; 1166 path->search_commit_root = 1;
784 path->skip_locking = 1; 1167 path->skip_locking = 1;
785 1168
786 /* 1169 /*
787 * find all extents for each stripe and just read them to get 1170 * trigger the readahead for extent tree csum tree and wait for
788 * them into the page cache 1171 * completion. During readahead, the scrub is officially paused
789 * FIXME: we can do better. build a more intelligent prefetching 1172 * to not hold off transaction commits
790 */ 1173 */
791 logical = base + offset; 1174 logical = base + offset;
792 physical = map->stripes[num].physical;
793 ret = 0;
794 for (i = 0; i < nstripes; ++i) {
795 key.objectid = logical;
796 key.type = BTRFS_EXTENT_ITEM_KEY;
797 key.offset = (u64)0;
798
799 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
800 if (ret < 0)
801 goto out_noplug;
802
803 /*
804 * we might miss half an extent here, but that doesn't matter,
805 * as it's only the prefetch
806 */
807 while (1) {
808 l = path->nodes[0];
809 slot = path->slots[0];
810 if (slot >= btrfs_header_nritems(l)) {
811 ret = btrfs_next_leaf(root, path);
812 if (ret == 0)
813 continue;
814 if (ret < 0)
815 goto out_noplug;
816 1175
817 break; 1176 wait_event(sdev->list_wait,
818 } 1177 atomic_read(&sdev->in_flight) == 0);
819 btrfs_item_key_to_cpu(l, &key, slot); 1178 atomic_inc(&fs_info->scrubs_paused);
1179 wake_up(&fs_info->scrub_pause_wait);
820 1180
821 if (key.objectid >= logical + map->stripe_len) 1181 /* FIXME it might be better to start readahead at commit root */
822 break; 1182 key_start.objectid = logical;
1183 key_start.type = BTRFS_EXTENT_ITEM_KEY;
1184 key_start.offset = (u64)0;
1185 key_end.objectid = base + offset + nstripes * increment;
1186 key_end.type = BTRFS_EXTENT_ITEM_KEY;
1187 key_end.offset = (u64)0;
1188 reada1 = btrfs_reada_add(root, &key_start, &key_end);
1189
1190 key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1191 key_start.type = BTRFS_EXTENT_CSUM_KEY;
1192 key_start.offset = logical;
1193 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1194 key_end.type = BTRFS_EXTENT_CSUM_KEY;
1195 key_end.offset = base + offset + nstripes * increment;
1196 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
1197
1198 if (!IS_ERR(reada1))
1199 btrfs_reada_wait(reada1);
1200 if (!IS_ERR(reada2))
1201 btrfs_reada_wait(reada2);
823 1202
824 path->slots[0]++; 1203 mutex_lock(&fs_info->scrub_lock);
825 } 1204 while (atomic_read(&fs_info->scrub_pause_req)) {
826 btrfs_release_path(path); 1205 mutex_unlock(&fs_info->scrub_lock);
827 logical += increment; 1206 wait_event(fs_info->scrub_pause_wait,
828 physical += map->stripe_len; 1207 atomic_read(&fs_info->scrub_pause_req) == 0);
829 cond_resched(); 1208 mutex_lock(&fs_info->scrub_lock);
830 } 1209 }
1210 atomic_dec(&fs_info->scrubs_paused);
1211 mutex_unlock(&fs_info->scrub_lock);
1212 wake_up(&fs_info->scrub_pause_wait);
831 1213
832 /* 1214 /*
833 * collect all data csums for the stripe to avoid seeking during 1215 * collect all data csums for the stripe to avoid seeking during
834 * the scrub. This might currently (crc32) end up to be about 1MB 1216 * the scrub. This might currently (crc32) end up to be about 1MB
835 */ 1217 */
836 start_stripe = 0;
837 blk_start_plug(&plug); 1218 blk_start_plug(&plug);
838again:
839 logical = base + offset + start_stripe * increment;
840 for (i = start_stripe; i < nstripes; ++i) {
841 ret = btrfs_lookup_csums_range(csum_root, logical,
842 logical + map->stripe_len - 1,
843 &sdev->csum_list, 1);
844 if (ret)
845 goto out;
846 1219
847 logical += increment;
848 cond_resched();
849 }
850 /* 1220 /*
851 * now find all extents for each stripe and scrub them 1221 * now find all extents for each stripe and scrub them
852 */ 1222 */
853 logical = base + offset + start_stripe * increment; 1223 logical = base + offset;
854 physical = map->stripes[num].physical + start_stripe * map->stripe_len; 1224 physical = map->stripes[num].physical;
855 ret = 0; 1225 ret = 0;
856 for (i = start_stripe; i < nstripes; ++i) { 1226 for (i = 0; i < nstripes; ++i) {
857 /* 1227 /*
858 * canceled? 1228 * canceled?
859 */ 1229 */
@@ -882,11 +1252,14 @@ again:
882 atomic_dec(&fs_info->scrubs_paused); 1252 atomic_dec(&fs_info->scrubs_paused);
883 mutex_unlock(&fs_info->scrub_lock); 1253 mutex_unlock(&fs_info->scrub_lock);
884 wake_up(&fs_info->scrub_pause_wait); 1254 wake_up(&fs_info->scrub_pause_wait);
885 scrub_free_csums(sdev);
886 start_stripe = i;
887 goto again;
888 } 1255 }
889 1256
1257 ret = btrfs_lookup_csums_range(csum_root, logical,
1258 logical + map->stripe_len - 1,
1259 &sdev->csum_list, 1);
1260 if (ret)
1261 goto out;
1262
890 key.objectid = logical; 1263 key.objectid = logical;
891 key.type = BTRFS_EXTENT_ITEM_KEY; 1264 key.type = BTRFS_EXTENT_ITEM_KEY;
892 key.offset = (u64)0; 1265 key.offset = (u64)0;
@@ -982,7 +1355,6 @@ next:
982 1355
983out: 1356out:
984 blk_finish_plug(&plug); 1357 blk_finish_plug(&plug);
985out_noplug:
986 btrfs_free_path(path); 1358 btrfs_free_path(path);
987 return ret < 0 ? ret : 0; 1359 return ret < 0 ? ret : 0;
988} 1360}
@@ -1253,10 +1625,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
1253 ret = scrub_enumerate_chunks(sdev, start, end); 1625 ret = scrub_enumerate_chunks(sdev, start, end);
1254 1626
1255 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 1627 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
1256
1257 atomic_dec(&fs_info->scrubs_running); 1628 atomic_dec(&fs_info->scrubs_running);
1258 wake_up(&fs_info->scrub_pause_wait); 1629 wake_up(&fs_info->scrub_pause_wait);
1259 1630
1631 wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
1632
1260 if (progress) 1633 if (progress)
1261 memcpy(progress, &sdev->stat, sizeof(*progress)); 1634 memcpy(progress, &sdev->stat, sizeof(*progress));
1262 1635
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 15634d4648d..8bd9d6d0e07 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -40,6 +40,7 @@
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h> 42#include <linux/cleancache.h>
43#include <linux/mnt_namespace.h>
43#include "compat.h" 44#include "compat.h"
44#include "delayed-inode.h" 45#include "delayed-inode.h"
45#include "ctree.h" 46#include "ctree.h"
@@ -58,6 +59,7 @@
58#include <trace/events/btrfs.h> 59#include <trace/events/btrfs.h>
59 60
60static const struct super_operations btrfs_super_ops; 61static const struct super_operations btrfs_super_ops;
62static struct file_system_type btrfs_fs_type;
61 63
62static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, 64static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
63 char nbuf[16]) 65 char nbuf[16])
@@ -162,7 +164,7 @@ enum {
162 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 164 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
163 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 165 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
164 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, 166 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
165 Opt_inode_cache, Opt_err, 167 Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
166}; 168};
167 169
168static match_table_t tokens = { 170static match_table_t tokens = {
@@ -195,6 +197,8 @@ static match_table_t tokens = {
195 {Opt_subvolrootid, "subvolrootid=%d"}, 197 {Opt_subvolrootid, "subvolrootid=%d"},
196 {Opt_defrag, "autodefrag"}, 198 {Opt_defrag, "autodefrag"},
197 {Opt_inode_cache, "inode_cache"}, 199 {Opt_inode_cache, "inode_cache"},
200 {Opt_no_space_cache, "nospace_cache"},
201 {Opt_recovery, "recovery"},
198 {Opt_err, NULL}, 202 {Opt_err, NULL},
199}; 203};
200 204
@@ -206,14 +210,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
206{ 210{
207 struct btrfs_fs_info *info = root->fs_info; 211 struct btrfs_fs_info *info = root->fs_info;
208 substring_t args[MAX_OPT_ARGS]; 212 substring_t args[MAX_OPT_ARGS];
209 char *p, *num, *orig; 213 char *p, *num, *orig = NULL;
214 u64 cache_gen;
210 int intarg; 215 int intarg;
211 int ret = 0; 216 int ret = 0;
212 char *compress_type; 217 char *compress_type;
213 bool compress_force = false; 218 bool compress_force = false;
214 219
220 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
221 if (cache_gen)
222 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
223
215 if (!options) 224 if (!options)
216 return 0; 225 goto out;
217 226
218 /* 227 /*
219 * strsep changes the string, duplicate it because parse_options 228 * strsep changes the string, duplicate it because parse_options
@@ -360,9 +369,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
360 btrfs_set_opt(info->mount_opt, DISCARD); 369 btrfs_set_opt(info->mount_opt, DISCARD);
361 break; 370 break;
362 case Opt_space_cache: 371 case Opt_space_cache:
363 printk(KERN_INFO "btrfs: enabling disk space caching\n");
364 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 372 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
365 break; 373 break;
374 case Opt_no_space_cache:
375 printk(KERN_INFO "btrfs: disabling disk space caching\n");
376 btrfs_clear_opt(info->mount_opt, SPACE_CACHE);
377 break;
366 case Opt_inode_cache: 378 case Opt_inode_cache:
367 printk(KERN_INFO "btrfs: enabling inode map caching\n"); 379 printk(KERN_INFO "btrfs: enabling inode map caching\n");
368 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); 380 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
@@ -381,6 +393,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
381 printk(KERN_INFO "btrfs: enabling auto defrag"); 393 printk(KERN_INFO "btrfs: enabling auto defrag");
382 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); 394 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
383 break; 395 break;
396 case Opt_recovery:
397 printk(KERN_INFO "btrfs: enabling auto recovery");
398 btrfs_set_opt(info->mount_opt, RECOVERY);
399 break;
384 case Opt_err: 400 case Opt_err:
385 printk(KERN_INFO "btrfs: unrecognized mount option " 401 printk(KERN_INFO "btrfs: unrecognized mount option "
386 "'%s'\n", p); 402 "'%s'\n", p);
@@ -391,6 +407,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
391 } 407 }
392 } 408 }
393out: 409out:
410 if (!ret && btrfs_test_opt(root, SPACE_CACHE))
411 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
394 kfree(orig); 412 kfree(orig);
395 return ret; 413 return ret;
396} 414}
@@ -406,12 +424,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
406 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) 424 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
407{ 425{
408 substring_t args[MAX_OPT_ARGS]; 426 substring_t args[MAX_OPT_ARGS];
409 char *opts, *orig, *p; 427 char *device_name, *opts, *orig, *p;
410 int error = 0; 428 int error = 0;
411 int intarg; 429 int intarg;
412 430
413 if (!options) 431 if (!options)
414 goto out; 432 return 0;
415 433
416 /* 434 /*
417 * strsep changes the string, duplicate it because parse_options 435 * strsep changes the string, duplicate it because parse_options
@@ -430,6 +448,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
430 token = match_token(p, tokens, args); 448 token = match_token(p, tokens, args);
431 switch (token) { 449 switch (token) {
432 case Opt_subvol: 450 case Opt_subvol:
451 kfree(*subvol_name);
433 *subvol_name = match_strdup(&args[0]); 452 *subvol_name = match_strdup(&args[0]);
434 break; 453 break;
435 case Opt_subvolid: 454 case Opt_subvolid:
@@ -457,29 +476,24 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
457 } 476 }
458 break; 477 break;
459 case Opt_device: 478 case Opt_device:
460 error = btrfs_scan_one_device(match_strdup(&args[0]), 479 device_name = match_strdup(&args[0]);
480 if (!device_name) {
481 error = -ENOMEM;
482 goto out;
483 }
484 error = btrfs_scan_one_device(device_name,
461 flags, holder, fs_devices); 485 flags, holder, fs_devices);
486 kfree(device_name);
462 if (error) 487 if (error)
463 goto out_free_opts; 488 goto out;
464 break; 489 break;
465 default: 490 default:
466 break; 491 break;
467 } 492 }
468 } 493 }
469 494
470 out_free_opts: 495out:
471 kfree(orig); 496 kfree(orig);
472 out:
473 /*
474 * If no subvolume name is specified we use the default one. Allocate
475 * a copy of the string "." here so that code later in the
476 * mount path doesn't care if it's the default volume or another one.
477 */
478 if (!*subvol_name) {
479 *subvol_name = kstrdup(".", GFP_KERNEL);
480 if (!*subvol_name)
481 return -ENOMEM;
482 }
483 return error; 497 return error;
484} 498}
485 499
@@ -492,7 +506,6 @@ static struct dentry *get_default_root(struct super_block *sb,
492 struct btrfs_path *path; 506 struct btrfs_path *path;
493 struct btrfs_key location; 507 struct btrfs_key location;
494 struct inode *inode; 508 struct inode *inode;
495 struct dentry *dentry;
496 u64 dir_id; 509 u64 dir_id;
497 int new = 0; 510 int new = 0;
498 511
@@ -517,7 +530,7 @@ static struct dentry *get_default_root(struct super_block *sb,
517 * will mount by default if we haven't been given a specific subvolume 530 * will mount by default if we haven't been given a specific subvolume
518 * to mount. 531 * to mount.
519 */ 532 */
520 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 533 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
521 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 534 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
522 if (IS_ERR(di)) { 535 if (IS_ERR(di)) {
523 btrfs_free_path(path); 536 btrfs_free_path(path);
@@ -566,29 +579,7 @@ setup_root:
566 return dget(sb->s_root); 579 return dget(sb->s_root);
567 } 580 }
568 581
569 if (new) { 582 return d_obtain_alias(inode);
570 const struct qstr name = { .name = "/", .len = 1 };
571
572 /*
573 * New inode, we need to make the dentry a sibling of s_root so
574 * everything gets cleaned up properly on unmount.
575 */
576 dentry = d_alloc(sb->s_root, &name);
577 if (!dentry) {
578 iput(inode);
579 return ERR_PTR(-ENOMEM);
580 }
581 d_splice_alias(inode, dentry);
582 } else {
583 /*
584 * We found the inode in cache, just find a dentry for it and
585 * put the reference to the inode we just got.
586 */
587 dentry = d_find_alias(inode);
588 iput(inode);
589 }
590
591 return dentry;
592} 583}
593 584
594static int btrfs_fill_super(struct super_block *sb, 585static int btrfs_fill_super(struct super_block *sb,
@@ -719,6 +710,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
719 seq_puts(seq, ",noacl"); 710 seq_puts(seq, ",noacl");
720 if (btrfs_test_opt(root, SPACE_CACHE)) 711 if (btrfs_test_opt(root, SPACE_CACHE))
721 seq_puts(seq, ",space_cache"); 712 seq_puts(seq, ",space_cache");
713 else
714 seq_puts(seq, ",nospace_cache");
722 if (btrfs_test_opt(root, CLEAR_CACHE)) 715 if (btrfs_test_opt(root, CLEAR_CACHE))
723 seq_puts(seq, ",clear_cache"); 716 seq_puts(seq, ",clear_cache");
724 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) 717 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
@@ -753,6 +746,137 @@ static int btrfs_set_super(struct super_block *s, void *data)
753 return set_anon_super(s, data); 746 return set_anon_super(s, data);
754} 747}
755 748
749/*
750 * subvolumes are identified by ino 256
751 */
752static inline int is_subvolume_inode(struct inode *inode)
753{
754 if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
755 return 1;
756 return 0;
757}
758
759/*
760 * This will strip out the subvol=%s argument for an argument string and add
761 * subvolid=0 to make sure we get the actual tree root for path walking to the
762 * subvol we want.
763 */
764static char *setup_root_args(char *args)
765{
766 unsigned copied = 0;
767 unsigned len = strlen(args) + 2;
768 char *pos;
769 char *ret;
770
771 /*
772 * We need the same args as before, but minus
773 *
774 * subvol=a
775 *
776 * and add
777 *
778 * subvolid=0
779 *
780 * which is a difference of 2 characters, so we allocate strlen(args) +
781 * 2 characters.
782 */
783 ret = kzalloc(len * sizeof(char), GFP_NOFS);
784 if (!ret)
785 return NULL;
786 pos = strstr(args, "subvol=");
787
788 /* This shouldn't happen, but just in case.. */
789 if (!pos) {
790 kfree(ret);
791 return NULL;
792 }
793
794 /*
795 * The subvol=<> arg is not at the front of the string, copy everybody
796 * up to that into ret.
797 */
798 if (pos != args) {
799 *pos = '\0';
800 strcpy(ret, args);
801 copied += strlen(args);
802 pos++;
803 }
804
805 strncpy(ret + copied, "subvolid=0", len - copied);
806
807 /* Length of subvolid=0 */
808 copied += 10;
809
810 /*
811 * If there is no , after the subvol= option then we know there's no
812 * other options and we can just return.
813 */
814 pos = strchr(pos, ',');
815 if (!pos)
816 return ret;
817
818 /* Copy the rest of the arguments into our buffer */
819 strncpy(ret + copied, pos, len - copied);
820 copied += strlen(pos);
821
822 return ret;
823}
824
825static struct dentry *mount_subvol(const char *subvol_name, int flags,
826 const char *device_name, char *data)
827{
828 struct super_block *s;
829 struct dentry *root;
830 struct vfsmount *mnt;
831 struct mnt_namespace *ns_private;
832 char *newargs;
833 struct path path;
834 int error;
835
836 newargs = setup_root_args(data);
837 if (!newargs)
838 return ERR_PTR(-ENOMEM);
839 mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
840 newargs);
841 kfree(newargs);
842 if (IS_ERR(mnt))
843 return ERR_CAST(mnt);
844
845 ns_private = create_mnt_ns(mnt);
846 if (IS_ERR(ns_private)) {
847 mntput(mnt);
848 return ERR_CAST(ns_private);
849 }
850
851 /*
852 * This will trigger the automount of the subvol so we can just
853 * drop the mnt we have here and return the dentry that we
854 * found.
855 */
856 error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
857 LOOKUP_FOLLOW, &path);
858 put_mnt_ns(ns_private);
859 if (error)
860 return ERR_PTR(error);
861
862 if (!is_subvolume_inode(path.dentry->d_inode)) {
863 path_put(&path);
864 mntput(mnt);
865 error = -EINVAL;
866 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
867 subvol_name);
868 return ERR_PTR(-EINVAL);
869 }
870
871 /* Get a ref to the sb and the dentry we found and return it */
872 s = path.mnt->mnt_sb;
873 atomic_inc(&s->s_active);
874 root = dget(path.dentry);
875 path_put(&path);
876 down_write(&s->s_umount);
877
878 return root;
879}
756 880
757/* 881/*
758 * Find a superblock for the given device / mount point. 882 * Find a superblock for the given device / mount point.
@@ -767,7 +891,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
767 struct super_block *s; 891 struct super_block *s;
768 struct dentry *root; 892 struct dentry *root;
769 struct btrfs_fs_devices *fs_devices = NULL; 893 struct btrfs_fs_devices *fs_devices = NULL;
770 struct btrfs_root *tree_root = NULL;
771 struct btrfs_fs_info *fs_info = NULL; 894 struct btrfs_fs_info *fs_info = NULL;
772 fmode_t mode = FMODE_READ; 895 fmode_t mode = FMODE_READ;
773 char *subvol_name = NULL; 896 char *subvol_name = NULL;
@@ -781,21 +904,20 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
781 error = btrfs_parse_early_options(data, mode, fs_type, 904 error = btrfs_parse_early_options(data, mode, fs_type,
782 &subvol_name, &subvol_objectid, 905 &subvol_name, &subvol_objectid,
783 &subvol_rootid, &fs_devices); 906 &subvol_rootid, &fs_devices);
784 if (error) 907 if (error) {
908 kfree(subvol_name);
785 return ERR_PTR(error); 909 return ERR_PTR(error);
910 }
786 911
787 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); 912 if (subvol_name) {
788 if (error) 913 root = mount_subvol(subvol_name, flags, device_name, data);
789 goto error_free_subvol_name; 914 kfree(subvol_name);
915 return root;
916 }
790 917
791 error = btrfs_open_devices(fs_devices, mode, fs_type); 918 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
792 if (error) 919 if (error)
793 goto error_free_subvol_name; 920 return ERR_PTR(error);
794
795 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
796 error = -EACCES;
797 goto error_close_devices;
798 }
799 921
800 /* 922 /*
801 * Setup a dummy root and fs_info for test/set super. This is because 923 * Setup a dummy root and fs_info for test/set super. This is because
@@ -804,19 +926,40 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
804 * then open_ctree will properly initialize everything later. 926 * then open_ctree will properly initialize everything later.
805 */ 927 */
806 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); 928 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
807 tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 929 if (!fs_info)
808 if (!fs_info || !tree_root) { 930 return ERR_PTR(-ENOMEM);
931
932 fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
933 if (!fs_info->tree_root) {
809 error = -ENOMEM; 934 error = -ENOMEM;
810 goto error_close_devices; 935 goto error_fs_info;
811 } 936 }
812 fs_info->tree_root = tree_root; 937 fs_info->tree_root->fs_info = fs_info;
813 fs_info->fs_devices = fs_devices; 938 fs_info->fs_devices = fs_devices;
814 tree_root->fs_info = fs_info; 939
940 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
941 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
942 if (!fs_info->super_copy || !fs_info->super_for_commit) {
943 error = -ENOMEM;
944 goto error_fs_info;
945 }
946
947 error = btrfs_open_devices(fs_devices, mode, fs_type);
948 if (error)
949 goto error_fs_info;
950
951 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
952 error = -EACCES;
953 goto error_close_devices;
954 }
815 955
816 bdev = fs_devices->latest_bdev; 956 bdev = fs_devices->latest_bdev;
817 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); 957 s = sget(fs_type, btrfs_test_super, btrfs_set_super,
818 if (IS_ERR(s)) 958 fs_info->tree_root);
819 goto error_s; 959 if (IS_ERR(s)) {
960 error = PTR_ERR(s);
961 goto error_close_devices;
962 }
820 963
821 if (s->s_root) { 964 if (s->s_root) {
822 if ((flags ^ s->s_flags) & MS_RDONLY) { 965 if ((flags ^ s->s_flags) & MS_RDONLY) {
@@ -826,75 +969,35 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
826 } 969 }
827 970
828 btrfs_close_devices(fs_devices); 971 btrfs_close_devices(fs_devices);
829 kfree(fs_info); 972 free_fs_info(fs_info);
830 kfree(tree_root);
831 } else { 973 } else {
832 char b[BDEVNAME_SIZE]; 974 char b[BDEVNAME_SIZE];
833 975
834 s->s_flags = flags | MS_NOSEC; 976 s->s_flags = flags | MS_NOSEC;
835 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 977 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
978 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
836 error = btrfs_fill_super(s, fs_devices, data, 979 error = btrfs_fill_super(s, fs_devices, data,
837 flags & MS_SILENT ? 1 : 0); 980 flags & MS_SILENT ? 1 : 0);
838 if (error) { 981 if (error) {
839 deactivate_locked_super(s); 982 deactivate_locked_super(s);
840 goto error_free_subvol_name; 983 return ERR_PTR(error);
841 } 984 }
842 985
843 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
844 s->s_flags |= MS_ACTIVE; 986 s->s_flags |= MS_ACTIVE;
845 } 987 }
846 988
847 /* if they gave us a subvolume name bind mount into that */ 989 root = get_default_root(s, subvol_objectid);
848 if (strcmp(subvol_name, ".")) { 990 if (IS_ERR(root)) {
849 struct dentry *new_root; 991 deactivate_locked_super(s);
850 992 return root;
851 root = get_default_root(s, subvol_rootid);
852 if (IS_ERR(root)) {
853 error = PTR_ERR(root);
854 deactivate_locked_super(s);
855 goto error_free_subvol_name;
856 }
857
858 mutex_lock(&root->d_inode->i_mutex);
859 new_root = lookup_one_len(subvol_name, root,
860 strlen(subvol_name));
861 mutex_unlock(&root->d_inode->i_mutex);
862
863 if (IS_ERR(new_root)) {
864 dput(root);
865 deactivate_locked_super(s);
866 error = PTR_ERR(new_root);
867 goto error_free_subvol_name;
868 }
869 if (!new_root->d_inode) {
870 dput(root);
871 dput(new_root);
872 deactivate_locked_super(s);
873 error = -ENXIO;
874 goto error_free_subvol_name;
875 }
876 dput(root);
877 root = new_root;
878 } else {
879 root = get_default_root(s, subvol_objectid);
880 if (IS_ERR(root)) {
881 error = PTR_ERR(root);
882 deactivate_locked_super(s);
883 goto error_free_subvol_name;
884 }
885 } 993 }
886 994
887 kfree(subvol_name);
888 return root; 995 return root;
889 996
890error_s:
891 error = PTR_ERR(s);
892error_close_devices: 997error_close_devices:
893 btrfs_close_devices(fs_devices); 998 btrfs_close_devices(fs_devices);
894 kfree(fs_info); 999error_fs_info:
895 kfree(tree_root); 1000 free_fs_info(fs_info);
896error_free_subvol_name:
897 kfree(subvol_name);
898 return ERR_PTR(error); 1001 return ERR_PTR(error);
899} 1002}
900 1003
@@ -919,7 +1022,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
919 if (root->fs_info->fs_devices->rw_devices == 0) 1022 if (root->fs_info->fs_devices->rw_devices == 0)
920 return -EACCES; 1023 return -EACCES;
921 1024
922 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 1025 if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
923 return -EINVAL; 1026 return -EINVAL;
924 1027
925 ret = btrfs_cleanup_fs_roots(root->fs_info); 1028 ret = btrfs_cleanup_fs_roots(root->fs_info);
@@ -1085,7 +1188,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1085static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1188static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1086{ 1189{
1087 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 1190 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
1088 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1191 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1089 struct list_head *head = &root->fs_info->space_info; 1192 struct list_head *head = &root->fs_info->space_info;
1090 struct btrfs_space_info *found; 1193 struct btrfs_space_info *found;
1091 u64 total_used = 0; 1194 u64 total_used = 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e24b7964a15..6a0574e923b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -55,6 +55,7 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
55 struct btrfs_transaction *cur_trans; 55 struct btrfs_transaction *cur_trans;
56 56
57 spin_lock(&root->fs_info->trans_lock); 57 spin_lock(&root->fs_info->trans_lock);
58loop:
58 if (root->fs_info->trans_no_join) { 59 if (root->fs_info->trans_no_join) {
59 if (!nofail) { 60 if (!nofail) {
60 spin_unlock(&root->fs_info->trans_lock); 61 spin_unlock(&root->fs_info->trans_lock);
@@ -75,16 +76,18 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
75 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 76 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
76 if (!cur_trans) 77 if (!cur_trans)
77 return -ENOMEM; 78 return -ENOMEM;
79
78 spin_lock(&root->fs_info->trans_lock); 80 spin_lock(&root->fs_info->trans_lock);
79 if (root->fs_info->running_transaction) { 81 if (root->fs_info->running_transaction) {
82 /*
83 * someone started a transaction after we unlocked. Make sure
84 * to redo the trans_no_join checks above
85 */
80 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 86 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
81 cur_trans = root->fs_info->running_transaction; 87 cur_trans = root->fs_info->running_transaction;
82 atomic_inc(&cur_trans->use_count); 88 goto loop;
83 atomic_inc(&cur_trans->num_writers);
84 cur_trans->num_joined++;
85 spin_unlock(&root->fs_info->trans_lock);
86 return 0;
87 } 89 }
90
88 atomic_set(&cur_trans->num_writers, 1); 91 atomic_set(&cur_trans->num_writers, 1);
89 cur_trans->num_joined = 0; 92 cur_trans->num_joined = 0;
90 init_waitqueue_head(&cur_trans->writer_wait); 93 init_waitqueue_head(&cur_trans->writer_wait);
@@ -275,7 +278,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
275 */ 278 */
276 if (num_items > 0 && root != root->fs_info->chunk_root) { 279 if (num_items > 0 && root != root->fs_info->chunk_root) {
277 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 280 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
278 ret = btrfs_block_rsv_add(NULL, root, 281 ret = btrfs_block_rsv_add(root,
279 &root->fs_info->trans_block_rsv, 282 &root->fs_info->trans_block_rsv,
280 num_bytes); 283 num_bytes);
281 if (ret) 284 if (ret)
@@ -418,8 +421,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
418 struct btrfs_root *root) 421 struct btrfs_root *root)
419{ 422{
420 int ret; 423 int ret;
421 ret = btrfs_block_rsv_check(trans, root, 424
422 &root->fs_info->global_block_rsv, 0, 5); 425 ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
423 return ret ? 1 : 0; 426 return ret ? 1 : 0;
424} 427}
425 428
@@ -427,17 +430,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
427 struct btrfs_root *root) 430 struct btrfs_root *root)
428{ 431{
429 struct btrfs_transaction *cur_trans = trans->transaction; 432 struct btrfs_transaction *cur_trans = trans->transaction;
433 struct btrfs_block_rsv *rsv = trans->block_rsv;
430 int updates; 434 int updates;
431 435
432 smp_mb(); 436 smp_mb();
433 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 437 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
434 return 1; 438 return 1;
435 439
440 /*
441 * We need to do this in case we're deleting csums so the global block
442 * rsv get's used instead of the csum block rsv.
443 */
444 trans->block_rsv = NULL;
445
436 updates = trans->delayed_ref_updates; 446 updates = trans->delayed_ref_updates;
437 trans->delayed_ref_updates = 0; 447 trans->delayed_ref_updates = 0;
438 if (updates) 448 if (updates)
439 btrfs_run_delayed_refs(trans, root, updates); 449 btrfs_run_delayed_refs(trans, root, updates);
440 450
451 trans->block_rsv = rsv;
452
441 return should_end_transaction(trans, root); 453 return should_end_transaction(trans, root);
442} 454}
443 455
@@ -453,6 +465,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
453 return 0; 465 return 0;
454 } 466 }
455 467
468 btrfs_trans_release_metadata(trans, root);
469 trans->block_rsv = NULL;
456 while (count < 4) { 470 while (count < 4) {
457 unsigned long cur = trans->delayed_ref_updates; 471 unsigned long cur = trans->delayed_ref_updates;
458 trans->delayed_ref_updates = 0; 472 trans->delayed_ref_updates = 0;
@@ -473,8 +487,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
473 count++; 487 count++;
474 } 488 }
475 489
476 btrfs_trans_release_metadata(trans, root);
477
478 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 490 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
479 should_end_transaction(trans, root)) { 491 should_end_transaction(trans, root)) {
480 trans->transaction->blocked = 1; 492 trans->transaction->blocked = 1;
@@ -562,50 +574,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
562int btrfs_write_marked_extents(struct btrfs_root *root, 574int btrfs_write_marked_extents(struct btrfs_root *root,
563 struct extent_io_tree *dirty_pages, int mark) 575 struct extent_io_tree *dirty_pages, int mark)
564{ 576{
565 int ret;
566 int err = 0; 577 int err = 0;
567 int werr = 0; 578 int werr = 0;
568 struct page *page; 579 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
569 struct inode *btree_inode = root->fs_info->btree_inode;
570 u64 start = 0; 580 u64 start = 0;
571 u64 end; 581 u64 end;
572 unsigned long index;
573
574 while (1) {
575 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
576 mark);
577 if (ret)
578 break;
579 while (start <= end) {
580 cond_resched();
581
582 index = start >> PAGE_CACHE_SHIFT;
583 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
584 page = find_get_page(btree_inode->i_mapping, index);
585 if (!page)
586 continue;
587
588 btree_lock_page_hook(page);
589 if (!page->mapping) {
590 unlock_page(page);
591 page_cache_release(page);
592 continue;
593 }
594 582
595 if (PageWriteback(page)) { 583 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
596 if (PageDirty(page)) 584 mark)) {
597 wait_on_page_writeback(page); 585 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
598 else { 586 GFP_NOFS);
599 unlock_page(page); 587 err = filemap_fdatawrite_range(mapping, start, end);
600 page_cache_release(page); 588 if (err)
601 continue; 589 werr = err;
602 } 590 cond_resched();
603 } 591 start = end + 1;
604 err = write_one_page(page, 0);
605 if (err)
606 werr = err;
607 page_cache_release(page);
608 }
609 } 592 }
610 if (err) 593 if (err)
611 werr = err; 594 werr = err;
@@ -621,39 +604,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
621int btrfs_wait_marked_extents(struct btrfs_root *root, 604int btrfs_wait_marked_extents(struct btrfs_root *root,
622 struct extent_io_tree *dirty_pages, int mark) 605 struct extent_io_tree *dirty_pages, int mark)
623{ 606{
624 int ret;
625 int err = 0; 607 int err = 0;
626 int werr = 0; 608 int werr = 0;
627 struct page *page; 609 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
628 struct inode *btree_inode = root->fs_info->btree_inode;
629 u64 start = 0; 610 u64 start = 0;
630 u64 end; 611 u64 end;
631 unsigned long index;
632
633 while (1) {
634 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
635 mark);
636 if (ret)
637 break;
638 612
639 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); 613 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
640 while (start <= end) { 614 EXTENT_NEED_WAIT)) {
641 index = start >> PAGE_CACHE_SHIFT; 615 clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
642 start = (u64)(index + 1) << PAGE_CACHE_SHIFT; 616 err = filemap_fdatawait_range(mapping, start, end);
643 page = find_get_page(btree_inode->i_mapping, index); 617 if (err)
644 if (!page) 618 werr = err;
645 continue; 619 cond_resched();
646 if (PageDirty(page)) { 620 start = end + 1;
647 btree_lock_page_hook(page);
648 wait_on_page_writeback(page);
649 err = write_one_page(page, 0);
650 if (err)
651 werr = err;
652 }
653 wait_on_page_writeback(page);
654 page_cache_release(page);
655 cond_resched();
656 }
657 } 621 }
658 if (err) 622 if (err)
659 werr = err; 623 werr = err;
@@ -673,7 +637,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
673 637
674 ret = btrfs_write_marked_extents(root, dirty_pages, mark); 638 ret = btrfs_write_marked_extents(root, dirty_pages, mark);
675 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); 639 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
676 return ret || ret2; 640
641 if (ret)
642 return ret;
643 if (ret2)
644 return ret2;
645 return 0;
677} 646}
678 647
679int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 648int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
@@ -911,11 +880,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
911 } 880 }
912 881
913 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 882 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
914 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
915 883
916 if (to_reserve > 0) { 884 if (to_reserve > 0) {
917 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, 885 ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
918 to_reserve); 886 to_reserve);
919 if (ret) { 887 if (ret) {
920 pending->error = ret; 888 pending->error = ret;
921 goto fail; 889 goto fail;
@@ -1002,7 +970,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1002 BUG_ON(IS_ERR(pending->snap)); 970 BUG_ON(IS_ERR(pending->snap));
1003 971
1004 btrfs_reloc_post_snapshot(trans, pending); 972 btrfs_reloc_post_snapshot(trans, pending);
1005 btrfs_orphan_post_snapshot(trans, pending);
1006fail: 973fail:
1007 kfree(new_root_item); 974 kfree(new_root_item);
1008 trans->block_rsv = rsv; 975 trans->block_rsv = rsv;
@@ -1032,7 +999,7 @@ static void update_super_roots(struct btrfs_root *root)
1032 struct btrfs_root_item *root_item; 999 struct btrfs_root_item *root_item;
1033 struct btrfs_super_block *super; 1000 struct btrfs_super_block *super;
1034 1001
1035 super = &root->fs_info->super_copy; 1002 super = root->fs_info->super_copy;
1036 1003
1037 root_item = &root->fs_info->chunk_root->root_item; 1004 root_item = &root->fs_info->chunk_root->root_item;
1038 super->chunk_root = root_item->bytenr; 1005 super->chunk_root = root_item->bytenr;
@@ -1043,7 +1010,7 @@ static void update_super_roots(struct btrfs_root *root)
1043 super->root = root_item->bytenr; 1010 super->root = root_item->bytenr;
1044 super->generation = root_item->generation; 1011 super->generation = root_item->generation;
1045 super->root_level = root_item->level; 1012 super->root_level = root_item->level;
1046 if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) 1013 if (btrfs_test_opt(root, SPACE_CACHE))
1047 super->cache_generation = root_item->generation; 1014 super->cache_generation = root_item->generation;
1048} 1015}
1049 1016
@@ -1168,14 +1135,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1168 1135
1169 btrfs_run_ordered_operations(root, 0); 1136 btrfs_run_ordered_operations(root, 0);
1170 1137
1138 btrfs_trans_release_metadata(trans, root);
1139 trans->block_rsv = NULL;
1140
1171 /* make a pass through all the delayed refs we have so far 1141 /* make a pass through all the delayed refs we have so far
1172 * any runnings procs may add more while we are here 1142 * any runnings procs may add more while we are here
1173 */ 1143 */
1174 ret = btrfs_run_delayed_refs(trans, root, 0); 1144 ret = btrfs_run_delayed_refs(trans, root, 0);
1175 BUG_ON(ret); 1145 BUG_ON(ret);
1176 1146
1177 btrfs_trans_release_metadata(trans, root);
1178
1179 cur_trans = trans->transaction; 1147 cur_trans = trans->transaction;
1180 /* 1148 /*
1181 * set the flushing flag so procs in this transaction have to 1149 * set the flushing flag so procs in this transaction have to
@@ -1341,12 +1309,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1341 update_super_roots(root); 1309 update_super_roots(root);
1342 1310
1343 if (!root->fs_info->log_root_recovering) { 1311 if (!root->fs_info->log_root_recovering) {
1344 btrfs_set_super_log_root(&root->fs_info->super_copy, 0); 1312 btrfs_set_super_log_root(root->fs_info->super_copy, 0);
1345 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); 1313 btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
1346 } 1314 }
1347 1315
1348 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, 1316 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
1349 sizeof(root->fs_info->super_copy)); 1317 sizeof(*root->fs_info->super_copy));
1350 1318
1351 trans->transaction->blocked = 0; 1319 trans->transaction->blocked = 0;
1352 spin_lock(&root->fs_info->trans_lock); 1320 spin_lock(&root->fs_info->trans_lock);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 786639fca06..3568374d419 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log,
276 struct walk_control *wc, u64 gen) 276 struct walk_control *wc, u64 gen)
277{ 277{
278 if (wc->pin) 278 if (wc->pin)
279 btrfs_pin_extent(log->fs_info->extent_root, 279 btrfs_pin_extent_for_log_replay(wc->trans,
280 eb->start, eb->len, 0); 280 log->fs_info->extent_root,
281 eb->start, eb->len);
281 282
282 if (btrfs_buffer_uptodate(eb, gen)) { 283 if (btrfs_buffer_uptodate(eb, gen)) {
283 if (wc->write) 284 if (wc->write)
@@ -1030,7 +1031,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1030 } 1031 }
1031 btrfs_release_path(path); 1032 btrfs_release_path(path);
1032 if (nlink != inode->i_nlink) { 1033 if (nlink != inode->i_nlink) {
1033 inode->i_nlink = nlink; 1034 set_nlink(inode, nlink);
1034 btrfs_update_inode(trans, root, inode); 1035 btrfs_update_inode(trans, root, inode);
1035 } 1036 }
1036 BTRFS_I(inode)->index_cnt = (u64)-1; 1037 BTRFS_I(inode)->index_cnt = (u64)-1;
@@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1760 1761
1761 WARN_ON(root_owner != 1762 WARN_ON(root_owner !=
1762 BTRFS_TREE_LOG_OBJECTID); 1763 BTRFS_TREE_LOG_OBJECTID);
1763 ret = btrfs_free_reserved_extent(root, 1764 ret = btrfs_free_and_pin_reserved_extent(root,
1764 bytenr, blocksize); 1765 bytenr, blocksize);
1765 BUG_ON(ret); 1766 BUG_ON(ret);
1766 } 1767 }
@@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1828 btrfs_tree_unlock(next); 1829 btrfs_tree_unlock(next);
1829 1830
1830 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1831 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1831 ret = btrfs_free_reserved_extent(root, 1832 ret = btrfs_free_and_pin_reserved_extent(root,
1832 path->nodes[*level]->start, 1833 path->nodes[*level]->start,
1833 path->nodes[*level]->len); 1834 path->nodes[*level]->len);
1834 BUG_ON(ret); 1835 BUG_ON(ret);
@@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1897 1898
1898 WARN_ON(log->root_key.objectid != 1899 WARN_ON(log->root_key.objectid !=
1899 BTRFS_TREE_LOG_OBJECTID); 1900 BTRFS_TREE_LOG_OBJECTID);
1900 ret = btrfs_free_reserved_extent(log, next->start, 1901 ret = btrfs_free_and_pin_reserved_extent(log, next->start,
1901 next->len); 1902 next->len);
1902 BUG_ON(ret); 1903 BUG_ON(ret);
1903 } 1904 }
@@ -2013,10 +2014,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2013 /* wait for previous tree log sync to complete */ 2014 /* wait for previous tree log sync to complete */
2014 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2015 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2015 wait_log_commit(trans, root, root->log_transid - 1); 2016 wait_log_commit(trans, root, root->log_transid - 1);
2016
2017 while (1) { 2017 while (1) {
2018 unsigned long batch = root->log_batch; 2018 unsigned long batch = root->log_batch;
2019 if (root->log_multiple_pids) { 2019 /* when we're on an ssd, just kick the log commit out */
2020 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2020 mutex_unlock(&root->log_mutex); 2021 mutex_unlock(&root->log_mutex);
2021 schedule_timeout_uninterruptible(1); 2022 schedule_timeout_uninterruptible(1);
2022 mutex_lock(&root->log_mutex); 2023 mutex_lock(&root->log_mutex);
@@ -2117,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2117 BUG_ON(ret); 2118 BUG_ON(ret);
2118 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2119 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2119 2120
2120 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 2121 btrfs_set_super_log_root(root->fs_info->super_for_commit,
2121 log_root_tree->node->start); 2122 log_root_tree->node->start);
2122 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 2123 btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2123 btrfs_header_level(log_root_tree->node)); 2124 btrfs_header_level(log_root_tree->node));
2124 2125
2125 log_root_tree->log_batch = 0; 2126 log_root_tree->log_batch = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f2a4cc79da6..c37433d3cd8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -366,6 +366,14 @@ static noinline int device_list_add(const char *path,
366 } 366 }
367 INIT_LIST_HEAD(&device->dev_alloc_list); 367 INIT_LIST_HEAD(&device->dev_alloc_list);
368 368
369 /* init readahead state */
370 spin_lock_init(&device->reada_lock);
371 device->reada_curr_zone = NULL;
372 atomic_set(&device->reada_in_flight, 0);
373 device->reada_next = 0;
374 INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
375 INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
376
369 mutex_lock(&fs_devices->device_list_mutex); 377 mutex_lock(&fs_devices->device_list_mutex);
370 list_add_rcu(&device->dev_list, &fs_devices->devices); 378 list_add_rcu(&device->dev_list, &fs_devices->devices);
371 mutex_unlock(&fs_devices->device_list_mutex); 379 mutex_unlock(&fs_devices->device_list_mutex);
@@ -597,10 +605,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
597 set_blocksize(bdev, 4096); 605 set_blocksize(bdev, 4096);
598 606
599 bh = btrfs_read_dev_super(bdev); 607 bh = btrfs_read_dev_super(bdev);
600 if (!bh) { 608 if (!bh)
601 ret = -EINVAL;
602 goto error_close; 609 goto error_close;
603 }
604 610
605 disk_super = (struct btrfs_super_block *)bh->b_data; 611 disk_super = (struct btrfs_super_block *)bh->b_data;
606 devid = btrfs_stack_device_id(&disk_super->dev_item); 612 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -655,7 +661,7 @@ error:
655 continue; 661 continue;
656 } 662 }
657 if (fs_devices->open_devices == 0) { 663 if (fs_devices->open_devices == 0) {
658 ret = -EIO; 664 ret = -EINVAL;
659 goto out; 665 goto out;
660 } 666 }
661 fs_devices->seeding = seeding; 667 fs_devices->seeding = seeding;
@@ -993,7 +999,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
993 key.objectid = device->devid; 999 key.objectid = device->devid;
994 key.offset = start; 1000 key.offset = start;
995 key.type = BTRFS_DEV_EXTENT_KEY; 1001 key.type = BTRFS_DEV_EXTENT_KEY;
996 1002again:
997 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1003 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
998 if (ret > 0) { 1004 if (ret > 0) {
999 ret = btrfs_previous_item(root, path, key.objectid, 1005 ret = btrfs_previous_item(root, path, key.objectid,
@@ -1006,6 +1012,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1006 struct btrfs_dev_extent); 1012 struct btrfs_dev_extent);
1007 BUG_ON(found_key.offset > start || found_key.offset + 1013 BUG_ON(found_key.offset > start || found_key.offset +
1008 btrfs_dev_extent_length(leaf, extent) < start); 1014 btrfs_dev_extent_length(leaf, extent) < start);
1015 key = found_key;
1016 btrfs_release_path(path);
1017 goto again;
1009 } else if (ret == 0) { 1018 } else if (ret == 0) {
1010 leaf = path->nodes[0]; 1019 leaf = path->nodes[0];
1011 extent = btrfs_item_ptr(leaf, path->slots[0], 1020 extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -1013,8 +1022,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1013 } 1022 }
1014 BUG_ON(ret); 1023 BUG_ON(ret);
1015 1024
1016 if (device->bytes_used > 0) 1025 if (device->bytes_used > 0) {
1017 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 1026 u64 len = btrfs_dev_extent_length(leaf, extent);
1027 device->bytes_used -= len;
1028 spin_lock(&root->fs_info->free_chunk_lock);
1029 root->fs_info->free_chunk_space += len;
1030 spin_unlock(&root->fs_info->free_chunk_lock);
1031 }
1018 ret = btrfs_del_item(trans, root, path); 1032 ret = btrfs_del_item(trans, root, path);
1019 1033
1020out: 1034out:
@@ -1356,6 +1370,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1356 if (ret) 1370 if (ret)
1357 goto error_undo; 1371 goto error_undo;
1358 1372
1373 spin_lock(&root->fs_info->free_chunk_lock);
1374 root->fs_info->free_chunk_space = device->total_bytes -
1375 device->bytes_used;
1376 spin_unlock(&root->fs_info->free_chunk_lock);
1377
1359 device->in_fs_metadata = 0; 1378 device->in_fs_metadata = 0;
1360 btrfs_scrub_cancel_dev(root, device); 1379 btrfs_scrub_cancel_dev(root, device);
1361 1380
@@ -1387,8 +1406,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1387 call_rcu(&device->rcu, free_device); 1406 call_rcu(&device->rcu, free_device);
1388 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1407 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1389 1408
1390 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 1409 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1391 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); 1410 btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1392 1411
1393 if (cur_devices->open_devices == 0) { 1412 if (cur_devices->open_devices == 0) {
1394 struct btrfs_fs_devices *fs_devices; 1413 struct btrfs_fs_devices *fs_devices;
@@ -1450,7 +1469,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1450 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1469 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1451 struct btrfs_fs_devices *old_devices; 1470 struct btrfs_fs_devices *old_devices;
1452 struct btrfs_fs_devices *seed_devices; 1471 struct btrfs_fs_devices *seed_devices;
1453 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1472 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1454 struct btrfs_device *device; 1473 struct btrfs_device *device;
1455 u64 super_flags; 1474 u64 super_flags;
1456 1475
@@ -1691,15 +1710,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1691 root->fs_info->fs_devices->num_can_discard++; 1710 root->fs_info->fs_devices->num_can_discard++;
1692 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1711 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1693 1712
1713 spin_lock(&root->fs_info->free_chunk_lock);
1714 root->fs_info->free_chunk_space += device->total_bytes;
1715 spin_unlock(&root->fs_info->free_chunk_lock);
1716
1694 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1717 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1695 root->fs_info->fs_devices->rotating = 1; 1718 root->fs_info->fs_devices->rotating = 1;
1696 1719
1697 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1720 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
1698 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1721 btrfs_set_super_total_bytes(root->fs_info->super_copy,
1699 total_bytes + device->total_bytes); 1722 total_bytes + device->total_bytes);
1700 1723
1701 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1724 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
1702 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1725 btrfs_set_super_num_devices(root->fs_info->super_copy,
1703 total_bytes + 1); 1726 total_bytes + 1);
1704 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1727 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1705 1728
@@ -1790,7 +1813,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1790 struct btrfs_device *device, u64 new_size) 1813 struct btrfs_device *device, u64 new_size)
1791{ 1814{
1792 struct btrfs_super_block *super_copy = 1815 struct btrfs_super_block *super_copy =
1793 &device->dev_root->fs_info->super_copy; 1816 device->dev_root->fs_info->super_copy;
1794 u64 old_total = btrfs_super_total_bytes(super_copy); 1817 u64 old_total = btrfs_super_total_bytes(super_copy);
1795 u64 diff = new_size - device->total_bytes; 1818 u64 diff = new_size - device->total_bytes;
1796 1819
@@ -1849,7 +1872,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1849static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1872static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1850 chunk_offset) 1873 chunk_offset)
1851{ 1874{
1852 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1875 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
1853 struct btrfs_disk_key *disk_key; 1876 struct btrfs_disk_key *disk_key;
1854 struct btrfs_chunk *chunk; 1877 struct btrfs_chunk *chunk;
1855 u8 *ptr; 1878 u8 *ptr;
@@ -2175,7 +2198,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2175 bool retried = false; 2198 bool retried = false;
2176 struct extent_buffer *l; 2199 struct extent_buffer *l;
2177 struct btrfs_key key; 2200 struct btrfs_key key;
2178 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2201 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2179 u64 old_total = btrfs_super_total_bytes(super_copy); 2202 u64 old_total = btrfs_super_total_bytes(super_copy);
2180 u64 old_size = device->total_bytes; 2203 u64 old_size = device->total_bytes;
2181 u64 diff = device->total_bytes - new_size; 2204 u64 diff = device->total_bytes - new_size;
@@ -2192,8 +2215,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2192 lock_chunks(root); 2215 lock_chunks(root);
2193 2216
2194 device->total_bytes = new_size; 2217 device->total_bytes = new_size;
2195 if (device->writeable) 2218 if (device->writeable) {
2196 device->fs_devices->total_rw_bytes -= diff; 2219 device->fs_devices->total_rw_bytes -= diff;
2220 spin_lock(&root->fs_info->free_chunk_lock);
2221 root->fs_info->free_chunk_space -= diff;
2222 spin_unlock(&root->fs_info->free_chunk_lock);
2223 }
2197 unlock_chunks(root); 2224 unlock_chunks(root);
2198 2225
2199again: 2226again:
@@ -2257,6 +2284,9 @@ again:
2257 device->total_bytes = old_size; 2284 device->total_bytes = old_size;
2258 if (device->writeable) 2285 if (device->writeable)
2259 device->fs_devices->total_rw_bytes += diff; 2286 device->fs_devices->total_rw_bytes += diff;
2287 spin_lock(&root->fs_info->free_chunk_lock);
2288 root->fs_info->free_chunk_space += diff;
2289 spin_unlock(&root->fs_info->free_chunk_lock);
2260 unlock_chunks(root); 2290 unlock_chunks(root);
2261 goto done; 2291 goto done;
2262 } 2292 }
@@ -2292,7 +2322,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
2292 struct btrfs_key *key, 2322 struct btrfs_key *key,
2293 struct btrfs_chunk *chunk, int item_size) 2323 struct btrfs_chunk *chunk, int item_size)
2294{ 2324{
2295 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2325 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2296 struct btrfs_disk_key disk_key; 2326 struct btrfs_disk_key disk_key;
2297 u32 array_size; 2327 u32 array_size;
2298 u8 *ptr; 2328 u8 *ptr;
@@ -2615,6 +2645,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2615 index++; 2645 index++;
2616 } 2646 }
2617 2647
2648 spin_lock(&extent_root->fs_info->free_chunk_lock);
2649 extent_root->fs_info->free_chunk_space -= (stripe_size *
2650 map->num_stripes);
2651 spin_unlock(&extent_root->fs_info->free_chunk_lock);
2652
2618 index = 0; 2653 index = 0;
2619 stripe = &chunk->stripe; 2654 stripe = &chunk->stripe;
2620 while (index < map->num_stripes) { 2655 while (index < map->num_stripes) {
@@ -2848,7 +2883,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
2848 2883
2849static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2884static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2850 u64 logical, u64 *length, 2885 u64 logical, u64 *length,
2851 struct btrfs_multi_bio **multi_ret, 2886 struct btrfs_bio **bbio_ret,
2852 int mirror_num) 2887 int mirror_num)
2853{ 2888{
2854 struct extent_map *em; 2889 struct extent_map *em;
@@ -2866,18 +2901,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2866 int i; 2901 int i;
2867 int num_stripes; 2902 int num_stripes;
2868 int max_errors = 0; 2903 int max_errors = 0;
2869 struct btrfs_multi_bio *multi = NULL; 2904 struct btrfs_bio *bbio = NULL;
2870 2905
2871 if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) 2906 if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2872 stripes_allocated = 1; 2907 stripes_allocated = 1;
2873again: 2908again:
2874 if (multi_ret) { 2909 if (bbio_ret) {
2875 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), 2910 bbio = kzalloc(btrfs_bio_size(stripes_allocated),
2876 GFP_NOFS); 2911 GFP_NOFS);
2877 if (!multi) 2912 if (!bbio)
2878 return -ENOMEM; 2913 return -ENOMEM;
2879 2914
2880 atomic_set(&multi->error, 0); 2915 atomic_set(&bbio->error, 0);
2881 } 2916 }
2882 2917
2883 read_lock(&em_tree->lock); 2918 read_lock(&em_tree->lock);
@@ -2898,7 +2933,7 @@ again:
2898 if (mirror_num > map->num_stripes) 2933 if (mirror_num > map->num_stripes)
2899 mirror_num = 0; 2934 mirror_num = 0;
2900 2935
2901 /* if our multi bio struct is too small, back off and try again */ 2936 /* if our btrfs_bio struct is too small, back off and try again */
2902 if (rw & REQ_WRITE) { 2937 if (rw & REQ_WRITE) {
2903 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2938 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2904 BTRFS_BLOCK_GROUP_DUP)) { 2939 BTRFS_BLOCK_GROUP_DUP)) {
@@ -2917,11 +2952,11 @@ again:
2917 stripes_required = map->num_stripes; 2952 stripes_required = map->num_stripes;
2918 } 2953 }
2919 } 2954 }
2920 if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && 2955 if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2921 stripes_allocated < stripes_required) { 2956 stripes_allocated < stripes_required) {
2922 stripes_allocated = map->num_stripes; 2957 stripes_allocated = map->num_stripes;
2923 free_extent_map(em); 2958 free_extent_map(em);
2924 kfree(multi); 2959 kfree(bbio);
2925 goto again; 2960 goto again;
2926 } 2961 }
2927 stripe_nr = offset; 2962 stripe_nr = offset;
@@ -2950,7 +2985,7 @@ again:
2950 *length = em->len - offset; 2985 *length = em->len - offset;
2951 } 2986 }
2952 2987
2953 if (!multi_ret) 2988 if (!bbio_ret)
2954 goto out; 2989 goto out;
2955 2990
2956 num_stripes = 1; 2991 num_stripes = 1;
@@ -2975,13 +3010,17 @@ again:
2975 stripe_index = find_live_mirror(map, 0, 3010 stripe_index = find_live_mirror(map, 0,
2976 map->num_stripes, 3011 map->num_stripes,
2977 current->pid % map->num_stripes); 3012 current->pid % map->num_stripes);
3013 mirror_num = stripe_index + 1;
2978 } 3014 }
2979 3015
2980 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3016 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2981 if (rw & (REQ_WRITE | REQ_DISCARD)) 3017 if (rw & (REQ_WRITE | REQ_DISCARD)) {
2982 num_stripes = map->num_stripes; 3018 num_stripes = map->num_stripes;
2983 else if (mirror_num) 3019 } else if (mirror_num) {
2984 stripe_index = mirror_num - 1; 3020 stripe_index = mirror_num - 1;
3021 } else {
3022 mirror_num = 1;
3023 }
2985 3024
2986 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3025 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2987 int factor = map->num_stripes / map->sub_stripes; 3026 int factor = map->num_stripes / map->sub_stripes;
@@ -3001,6 +3040,7 @@ again:
3001 stripe_index = find_live_mirror(map, stripe_index, 3040 stripe_index = find_live_mirror(map, stripe_index,
3002 map->sub_stripes, stripe_index + 3041 map->sub_stripes, stripe_index +
3003 current->pid % map->sub_stripes); 3042 current->pid % map->sub_stripes);
3043 mirror_num = stripe_index + 1;
3004 } 3044 }
3005 } else { 3045 } else {
3006 /* 3046 /*
@@ -3009,15 +3049,16 @@ again:
3009 * stripe_index is the number of our device in the stripe array 3049 * stripe_index is the number of our device in the stripe array
3010 */ 3050 */
3011 stripe_index = do_div(stripe_nr, map->num_stripes); 3051 stripe_index = do_div(stripe_nr, map->num_stripes);
3052 mirror_num = stripe_index + 1;
3012 } 3053 }
3013 BUG_ON(stripe_index >= map->num_stripes); 3054 BUG_ON(stripe_index >= map->num_stripes);
3014 3055
3015 if (rw & REQ_DISCARD) { 3056 if (rw & REQ_DISCARD) {
3016 for (i = 0; i < num_stripes; i++) { 3057 for (i = 0; i < num_stripes; i++) {
3017 multi->stripes[i].physical = 3058 bbio->stripes[i].physical =
3018 map->stripes[stripe_index].physical + 3059 map->stripes[stripe_index].physical +
3019 stripe_offset + stripe_nr * map->stripe_len; 3060 stripe_offset + stripe_nr * map->stripe_len;
3020 multi->stripes[i].dev = map->stripes[stripe_index].dev; 3061 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3021 3062
3022 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3063 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3023 u64 stripes; 3064 u64 stripes;
@@ -3038,16 +3079,16 @@ again:
3038 } 3079 }
3039 stripes = stripe_nr_end - 1 - j; 3080 stripes = stripe_nr_end - 1 - j;
3040 do_div(stripes, map->num_stripes); 3081 do_div(stripes, map->num_stripes);
3041 multi->stripes[i].length = map->stripe_len * 3082 bbio->stripes[i].length = map->stripe_len *
3042 (stripes - stripe_nr + 1); 3083 (stripes - stripe_nr + 1);
3043 3084
3044 if (i == 0) { 3085 if (i == 0) {
3045 multi->stripes[i].length -= 3086 bbio->stripes[i].length -=
3046 stripe_offset; 3087 stripe_offset;
3047 stripe_offset = 0; 3088 stripe_offset = 0;
3048 } 3089 }
3049 if (stripe_index == last_stripe) 3090 if (stripe_index == last_stripe)
3050 multi->stripes[i].length -= 3091 bbio->stripes[i].length -=
3051 stripe_end_offset; 3092 stripe_end_offset;
3052 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3093 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3053 u64 stripes; 3094 u64 stripes;
@@ -3072,11 +3113,11 @@ again:
3072 } 3113 }
3073 stripes = stripe_nr_end - 1 - j; 3114 stripes = stripe_nr_end - 1 - j;
3074 do_div(stripes, factor); 3115 do_div(stripes, factor);
3075 multi->stripes[i].length = map->stripe_len * 3116 bbio->stripes[i].length = map->stripe_len *
3076 (stripes - stripe_nr + 1); 3117 (stripes - stripe_nr + 1);
3077 3118
3078 if (i < map->sub_stripes) { 3119 if (i < map->sub_stripes) {
3079 multi->stripes[i].length -= 3120 bbio->stripes[i].length -=
3080 stripe_offset; 3121 stripe_offset;
3081 if (i == map->sub_stripes - 1) 3122 if (i == map->sub_stripes - 1)
3082 stripe_offset = 0; 3123 stripe_offset = 0;
@@ -3084,11 +3125,11 @@ again:
3084 if (stripe_index >= last_stripe && 3125 if (stripe_index >= last_stripe &&
3085 stripe_index <= (last_stripe + 3126 stripe_index <= (last_stripe +
3086 map->sub_stripes - 1)) { 3127 map->sub_stripes - 1)) {
3087 multi->stripes[i].length -= 3128 bbio->stripes[i].length -=
3088 stripe_end_offset; 3129 stripe_end_offset;
3089 } 3130 }
3090 } else 3131 } else
3091 multi->stripes[i].length = *length; 3132 bbio->stripes[i].length = *length;
3092 3133
3093 stripe_index++; 3134 stripe_index++;
3094 if (stripe_index == map->num_stripes) { 3135 if (stripe_index == map->num_stripes) {
@@ -3099,19 +3140,20 @@ again:
3099 } 3140 }
3100 } else { 3141 } else {
3101 for (i = 0; i < num_stripes; i++) { 3142 for (i = 0; i < num_stripes; i++) {
3102 multi->stripes[i].physical = 3143 bbio->stripes[i].physical =
3103 map->stripes[stripe_index].physical + 3144 map->stripes[stripe_index].physical +
3104 stripe_offset + 3145 stripe_offset +
3105 stripe_nr * map->stripe_len; 3146 stripe_nr * map->stripe_len;
3106 multi->stripes[i].dev = 3147 bbio->stripes[i].dev =
3107 map->stripes[stripe_index].dev; 3148 map->stripes[stripe_index].dev;
3108 stripe_index++; 3149 stripe_index++;
3109 } 3150 }
3110 } 3151 }
3111 if (multi_ret) { 3152 if (bbio_ret) {
3112 *multi_ret = multi; 3153 *bbio_ret = bbio;
3113 multi->num_stripes = num_stripes; 3154 bbio->num_stripes = num_stripes;
3114 multi->max_errors = max_errors; 3155 bbio->max_errors = max_errors;
3156 bbio->mirror_num = mirror_num;
3115 } 3157 }
3116out: 3158out:
3117 free_extent_map(em); 3159 free_extent_map(em);
@@ -3120,9 +3162,9 @@ out:
3120 3162
3121int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3163int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3122 u64 logical, u64 *length, 3164 u64 logical, u64 *length,
3123 struct btrfs_multi_bio **multi_ret, int mirror_num) 3165 struct btrfs_bio **bbio_ret, int mirror_num)
3124{ 3166{
3125 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3167 return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
3126 mirror_num); 3168 mirror_num);
3127} 3169}
3128 3170
@@ -3191,28 +3233,30 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3191 return 0; 3233 return 0;
3192} 3234}
3193 3235
3194static void end_bio_multi_stripe(struct bio *bio, int err) 3236static void btrfs_end_bio(struct bio *bio, int err)
3195{ 3237{
3196 struct btrfs_multi_bio *multi = bio->bi_private; 3238 struct btrfs_bio *bbio = bio->bi_private;
3197 int is_orig_bio = 0; 3239 int is_orig_bio = 0;
3198 3240
3199 if (err) 3241 if (err)
3200 atomic_inc(&multi->error); 3242 atomic_inc(&bbio->error);
3201 3243
3202 if (bio == multi->orig_bio) 3244 if (bio == bbio->orig_bio)
3203 is_orig_bio = 1; 3245 is_orig_bio = 1;
3204 3246
3205 if (atomic_dec_and_test(&multi->stripes_pending)) { 3247 if (atomic_dec_and_test(&bbio->stripes_pending)) {
3206 if (!is_orig_bio) { 3248 if (!is_orig_bio) {
3207 bio_put(bio); 3249 bio_put(bio);
3208 bio = multi->orig_bio; 3250 bio = bbio->orig_bio;
3209 } 3251 }
3210 bio->bi_private = multi->private; 3252 bio->bi_private = bbio->private;
3211 bio->bi_end_io = multi->end_io; 3253 bio->bi_end_io = bbio->end_io;
3254 bio->bi_bdev = (struct block_device *)
3255 (unsigned long)bbio->mirror_num;
3212 /* only send an error to the higher layers if it is 3256 /* only send an error to the higher layers if it is
3213 * beyond the tolerance of the multi-bio 3257 * beyond the tolerance of the multi-bio
3214 */ 3258 */
3215 if (atomic_read(&multi->error) > multi->max_errors) { 3259 if (atomic_read(&bbio->error) > bbio->max_errors) {
3216 err = -EIO; 3260 err = -EIO;
3217 } else if (err) { 3261 } else if (err) {
3218 /* 3262 /*
@@ -3222,7 +3266,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err)
3222 set_bit(BIO_UPTODATE, &bio->bi_flags); 3266 set_bit(BIO_UPTODATE, &bio->bi_flags);
3223 err = 0; 3267 err = 0;
3224 } 3268 }
3225 kfree(multi); 3269 kfree(bbio);
3226 3270
3227 bio_endio(bio, err); 3271 bio_endio(bio, err);
3228 } else if (!is_orig_bio) { 3272 } else if (!is_orig_bio) {
@@ -3302,20 +3346,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3302 u64 logical = (u64)bio->bi_sector << 9; 3346 u64 logical = (u64)bio->bi_sector << 9;
3303 u64 length = 0; 3347 u64 length = 0;
3304 u64 map_length; 3348 u64 map_length;
3305 struct btrfs_multi_bio *multi = NULL;
3306 int ret; 3349 int ret;
3307 int dev_nr = 0; 3350 int dev_nr = 0;
3308 int total_devs = 1; 3351 int total_devs = 1;
3352 struct btrfs_bio *bbio = NULL;
3309 3353
3310 length = bio->bi_size; 3354 length = bio->bi_size;
3311 map_tree = &root->fs_info->mapping_tree; 3355 map_tree = &root->fs_info->mapping_tree;
3312 map_length = length; 3356 map_length = length;
3313 3357
3314 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, 3358 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
3315 mirror_num); 3359 mirror_num);
3316 BUG_ON(ret); 3360 BUG_ON(ret);
3317 3361
3318 total_devs = multi->num_stripes; 3362 total_devs = bbio->num_stripes;
3319 if (map_length < length) { 3363 if (map_length < length) {
3320 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 3364 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
3321 "len %llu\n", (unsigned long long)logical, 3365 "len %llu\n", (unsigned long long)logical,
@@ -3323,25 +3367,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3323 (unsigned long long)map_length); 3367 (unsigned long long)map_length);
3324 BUG(); 3368 BUG();
3325 } 3369 }
3326 multi->end_io = first_bio->bi_end_io; 3370
3327 multi->private = first_bio->bi_private; 3371 bbio->orig_bio = first_bio;
3328 multi->orig_bio = first_bio; 3372 bbio->private = first_bio->bi_private;
3329 atomic_set(&multi->stripes_pending, multi->num_stripes); 3373 bbio->end_io = first_bio->bi_end_io;
3374 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
3330 3375
3331 while (dev_nr < total_devs) { 3376 while (dev_nr < total_devs) {
3332 if (total_devs > 1) { 3377 if (dev_nr < total_devs - 1) {
3333 if (dev_nr < total_devs - 1) { 3378 bio = bio_clone(first_bio, GFP_NOFS);
3334 bio = bio_clone(first_bio, GFP_NOFS); 3379 BUG_ON(!bio);
3335 BUG_ON(!bio); 3380 } else {
3336 } else { 3381 bio = first_bio;
3337 bio = first_bio;
3338 }
3339 bio->bi_private = multi;
3340 bio->bi_end_io = end_bio_multi_stripe;
3341 } 3382 }
3342 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 3383 bio->bi_private = bbio;
3343 dev = multi->stripes[dev_nr].dev; 3384 bio->bi_end_io = btrfs_end_bio;
3385 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
3386 dev = bbio->stripes[dev_nr].dev;
3344 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { 3387 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
3388 pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
3389 "(%s id %llu), size=%u\n", rw,
3390 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
3391 dev->name, dev->devid, bio->bi_size);
3345 bio->bi_bdev = dev->bdev; 3392 bio->bi_bdev = dev->bdev;
3346 if (async_submit) 3393 if (async_submit)
3347 schedule_bio(root, dev, rw, bio); 3394 schedule_bio(root, dev, rw, bio);
@@ -3354,8 +3401,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3354 } 3401 }
3355 dev_nr++; 3402 dev_nr++;
3356 } 3403 }
3357 if (total_devs == 1)
3358 kfree(multi);
3359 return 0; 3404 return 0;
3360} 3405}
3361 3406
@@ -3616,15 +3661,20 @@ static int read_one_dev(struct btrfs_root *root,
3616 fill_device_from_item(leaf, dev_item, device); 3661 fill_device_from_item(leaf, dev_item, device);
3617 device->dev_root = root->fs_info->dev_root; 3662 device->dev_root = root->fs_info->dev_root;
3618 device->in_fs_metadata = 1; 3663 device->in_fs_metadata = 1;
3619 if (device->writeable) 3664 if (device->writeable) {
3620 device->fs_devices->total_rw_bytes += device->total_bytes; 3665 device->fs_devices->total_rw_bytes += device->total_bytes;
3666 spin_lock(&root->fs_info->free_chunk_lock);
3667 root->fs_info->free_chunk_space += device->total_bytes -
3668 device->bytes_used;
3669 spin_unlock(&root->fs_info->free_chunk_lock);
3670 }
3621 ret = 0; 3671 ret = 0;
3622 return ret; 3672 return ret;
3623} 3673}
3624 3674
3625int btrfs_read_sys_array(struct btrfs_root *root) 3675int btrfs_read_sys_array(struct btrfs_root *root)
3626{ 3676{
3627 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 3677 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3628 struct extent_buffer *sb; 3678 struct extent_buffer *sb;
3629 struct btrfs_disk_key *disk_key; 3679 struct btrfs_disk_key *disk_key;
3630 struct btrfs_chunk *chunk; 3680 struct btrfs_chunk *chunk;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6d866db4e17..ab5b1c49f35 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -92,6 +92,14 @@ struct btrfs_device {
92 struct btrfs_work work; 92 struct btrfs_work work;
93 struct rcu_head rcu; 93 struct rcu_head rcu;
94 struct work_struct rcu_work; 94 struct work_struct rcu_work;
95
96 /* readahead state */
97 spinlock_t reada_lock;
98 atomic_t reada_in_flight;
99 u64 reada_next;
100 struct reada_zone *reada_curr_zone;
101 struct radix_tree_root reada_zones;
102 struct radix_tree_root reada_extents;
95}; 103};
96 104
97struct btrfs_fs_devices { 105struct btrfs_fs_devices {
@@ -136,7 +144,10 @@ struct btrfs_bio_stripe {
136 u64 length; /* only used for discard mappings */ 144 u64 length; /* only used for discard mappings */
137}; 145};
138 146
139struct btrfs_multi_bio { 147struct btrfs_bio;
148typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
149
150struct btrfs_bio {
140 atomic_t stripes_pending; 151 atomic_t stripes_pending;
141 bio_end_io_t *end_io; 152 bio_end_io_t *end_io;
142 struct bio *orig_bio; 153 struct bio *orig_bio;
@@ -144,6 +155,7 @@ struct btrfs_multi_bio {
144 atomic_t error; 155 atomic_t error;
145 int max_errors; 156 int max_errors;
146 int num_stripes; 157 int num_stripes;
158 int mirror_num;
147 struct btrfs_bio_stripe stripes[]; 159 struct btrfs_bio_stripe stripes[];
148}; 160};
149 161
@@ -171,7 +183,7 @@ struct map_lookup {
171int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 183int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
172 u64 end, u64 *length); 184 u64 end, u64 *length);
173 185
174#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ 186#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
175 (sizeof(struct btrfs_bio_stripe) * (n))) 187 (sizeof(struct btrfs_bio_stripe) * (n)))
176 188
177int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 189int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
@@ -180,7 +192,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
180 u64 chunk_offset, u64 start, u64 num_bytes); 192 u64 chunk_offset, u64 start, u64 num_bytes);
181int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 193int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
182 u64 logical, u64 *length, 194 u64 logical, u64 *length,
183 struct btrfs_multi_bio **multi_ret, int mirror_num); 195 struct btrfs_bio **bbio_ret, int mirror_num);
184int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 196int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
185 u64 chunk_start, u64 physical, u64 devid, 197 u64 chunk_start, u64 physical, u64 devid,
186 u64 **logical, int *naddrs, int *stripe_len); 198 u64 **logical, int *naddrs, int *stripe_len);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 426aa464f1a..3848b04e310 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -127,6 +127,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
127again: 127again:
128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), 128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
129 name, name_len, value, size); 129 name, name_len, value, size);
130 /*
131 * If we're setting an xattr to a new value but the new value is say
132 * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
133 * back from split_leaf. This is because it thinks we'll be extending
134 * the existing item size, but we're asking for enough space to add the
135 * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
136 * the rest of the function figure it out.
137 */
138 if (ret == -EOVERFLOW)
139 ret = -EEXIST;
140
130 if (ret == -EEXIST) { 141 if (ret == -EEXIST) {
131 if (flags & XATTR_CREATE) 142 if (flags & XATTR_CREATE)
132 goto out; 143 goto out;
diff --git a/fs/buffer.c b/fs/buffer.c
index 1a80b048ade..19d8eb7fdc8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -213,13 +213,16 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
213 * elsewhere, don't buffer_error if we had some unmapped buffers 213 * elsewhere, don't buffer_error if we had some unmapped buffers
214 */ 214 */
215 if (all_mapped) { 215 if (all_mapped) {
216 char b[BDEVNAME_SIZE];
217
216 printk("__find_get_block_slow() failed. " 218 printk("__find_get_block_slow() failed. "
217 "block=%llu, b_blocknr=%llu\n", 219 "block=%llu, b_blocknr=%llu\n",
218 (unsigned long long)block, 220 (unsigned long long)block,
219 (unsigned long long)bh->b_blocknr); 221 (unsigned long long)bh->b_blocknr);
220 printk("b_state=0x%08lx, b_size=%zu\n", 222 printk("b_state=0x%08lx, b_size=%zu\n",
221 bh->b_state, bh->b_size); 223 bh->b_state, bh->b_size);
222 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits); 224 printk("device %s blocksize: %d\n", bdevname(bdev, b),
225 1 << bd_inode->i_blkbits);
223 } 226 }
224out_unlock: 227out_unlock:
225 spin_unlock(&bd_mapping->private_lock); 228 spin_unlock(&bd_mapping->private_lock);
@@ -285,7 +288,7 @@ static void free_more_memory(void)
285 struct zone *zone; 288 struct zone *zone;
286 int nid; 289 int nid;
287 290
288 wakeup_flusher_threads(1024); 291 wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
289 yield(); 292 yield();
290 293
291 for_each_online_node(nid) { 294 for_each_online_node(nid) {
@@ -1470,13 +1473,13 @@ static void discard_buffer(struct buffer_head * bh)
1470} 1473}
1471 1474
1472/** 1475/**
1473 * block_invalidatepage - invalidate part of all of a buffer-backed page 1476 * block_invalidatepage - invalidate part or all of a buffer-backed page
1474 * 1477 *
1475 * @page: the page which is affected 1478 * @page: the page which is affected
1476 * @offset: the index of the truncation point 1479 * @offset: the index of the truncation point
1477 * 1480 *
1478 * block_invalidatepage() is called when all or part of the page has become 1481 * block_invalidatepage() is called when all or part of the page has become
1479 * invalidatedby a truncate operation. 1482 * invalidated by a truncate operation.
1480 * 1483 *
1481 * block_invalidatepage() does not have to release all buffers, but it must 1484 * block_invalidatepage() does not have to release all buffers, but it must
1482 * ensure that no dirty buffer is left outside @offset and that no I/O 1485 * ensure that no dirty buffer is left outside @offset and that no I/O
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5a3953db811..4144caf2f9d 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -228,102 +228,155 @@ static int ceph_readpage(struct file *filp, struct page *page)
228} 228}
229 229
230/* 230/*
231 * Build a vector of contiguous pages from the provided page list. 231 * Finish an async read(ahead) op.
232 */ 232 */
233static struct page **page_vector_from_list(struct list_head *page_list, 233static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
234 unsigned *nr_pages)
235{ 234{
236 struct page **pages; 235 struct inode *inode = req->r_inode;
237 struct page *page; 236 struct ceph_osd_reply_head *replyhead;
238 int next_index, contig_pages = 0; 237 int rc, bytes;
238 int i;
239 239
240 /* build page vector */ 240 /* parse reply */
241 pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS); 241 replyhead = msg->front.iov_base;
242 if (!pages) 242 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
243 return ERR_PTR(-ENOMEM); 243 rc = le32_to_cpu(replyhead->result);
244 bytes = le32_to_cpu(msg->hdr.data_len);
244 245
245 BUG_ON(list_empty(page_list)); 246 dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
246 next_index = list_entry(page_list->prev, struct page, lru)->index; 247
247 list_for_each_entry_reverse(page, page_list, lru) { 248 /* unlock all pages, zeroing any data we didn't read */
248 if (page->index == next_index) { 249 for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) {
249 dout("readpages page %d %p\n", contig_pages, page); 250 struct page *page = req->r_pages[i];
250 pages[contig_pages] = page; 251
251 contig_pages++; 252 if (bytes < (int)PAGE_CACHE_SIZE) {
252 next_index++; 253 /* zero (remainder of) page */
253 } else { 254 int s = bytes < 0 ? 0 : bytes;
254 break; 255 zero_user_segment(page, s, PAGE_CACHE_SIZE);
255 } 256 }
257 dout("finish_read %p uptodate %p idx %lu\n", inode, page,
258 page->index);
259 flush_dcache_page(page);
260 SetPageUptodate(page);
261 unlock_page(page);
262 page_cache_release(page);
256 } 263 }
257 *nr_pages = contig_pages; 264 kfree(req->r_pages);
258 return pages;
259} 265}
260 266
261/* 267/*
262 * Read multiple pages. Leave pages we don't read + unlock in page_list; 268 * start an async read(ahead) operation. return nr_pages we submitted
263 * the caller (VM) cleans them up. 269 * a read for on success, or negative error code.
264 */ 270 */
265static int ceph_readpages(struct file *file, struct address_space *mapping, 271static int start_read(struct inode *inode, struct list_head *page_list, int max)
266 struct list_head *page_list, unsigned nr_pages)
267{ 272{
268 struct inode *inode = file->f_dentry->d_inode;
269 struct ceph_inode_info *ci = ceph_inode(inode);
270 struct ceph_osd_client *osdc = 273 struct ceph_osd_client *osdc =
271 &ceph_inode_to_client(inode)->client->osdc; 274 &ceph_inode_to_client(inode)->client->osdc;
272 int rc = 0; 275 struct ceph_inode_info *ci = ceph_inode(inode);
273 struct page **pages; 276 struct page *page = list_entry(page_list->prev, struct page, lru);
274 loff_t offset; 277 struct ceph_osd_request *req;
278 u64 off;
275 u64 len; 279 u64 len;
280 int i;
281 struct page **pages;
282 pgoff_t next_index;
283 int nr_pages = 0;
284 int ret;
276 285
277 dout("readpages %p file %p nr_pages %d\n", 286 off = page->index << PAGE_CACHE_SHIFT;
278 inode, file, nr_pages);
279
280 pages = page_vector_from_list(page_list, &nr_pages);
281 if (IS_ERR(pages))
282 return PTR_ERR(pages);
283 287
284 /* guess read extent */ 288 /* count pages */
285 offset = pages[0]->index << PAGE_CACHE_SHIFT; 289 next_index = page->index;
290 list_for_each_entry_reverse(page, page_list, lru) {
291 if (page->index != next_index)
292 break;
293 nr_pages++;
294 next_index++;
295 if (max && nr_pages == max)
296 break;
297 }
286 len = nr_pages << PAGE_CACHE_SHIFT; 298 len = nr_pages << PAGE_CACHE_SHIFT;
287 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 299 dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
288 offset, &len, 300 off, len);
289 ci->i_truncate_seq, ci->i_truncate_size, 301
290 pages, nr_pages, 0); 302 req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode),
291 if (rc == -ENOENT) 303 off, &len,
292 rc = 0; 304 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
293 if (rc < 0) 305 NULL, 0,
294 goto out; 306 ci->i_truncate_seq, ci->i_truncate_size,
295 307 NULL, false, 1, 0);
296 for (; !list_empty(page_list) && len > 0; 308 if (!req)
297 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { 309 return -ENOMEM;
298 struct page *page =
299 list_entry(page_list->prev, struct page, lru);
300 310
311 /* build page vector */
312 nr_pages = len >> PAGE_CACHE_SHIFT;
313 pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
314 ret = -ENOMEM;
315 if (!pages)
316 goto out;
317 for (i = 0; i < nr_pages; ++i) {
318 page = list_entry(page_list->prev, struct page, lru);
319 BUG_ON(PageLocked(page));
301 list_del(&page->lru); 320 list_del(&page->lru);
302 321
303 if (rc < (int)PAGE_CACHE_SIZE) { 322 dout("start_read %p adding %p idx %lu\n", inode, page,
304 /* zero (remainder of) page */ 323 page->index);
305 int s = rc < 0 ? 0 : rc; 324 if (add_to_page_cache_lru(page, &inode->i_data, page->index,
306 zero_user_segment(page, s, PAGE_CACHE_SIZE);
307 }
308
309 if (add_to_page_cache_lru(page, mapping, page->index,
310 GFP_NOFS)) { 325 GFP_NOFS)) {
311 page_cache_release(page); 326 page_cache_release(page);
312 dout("readpages %p add_to_page_cache failed %p\n", 327 dout("start_read %p add_to_page_cache failed %p\n",
313 inode, page); 328 inode, page);
314 continue; 329 nr_pages = i;
330 goto out_pages;
315 } 331 }
316 dout("readpages %p adding %p idx %lu\n", inode, page, 332 pages[i] = page;
317 page->index);
318 flush_dcache_page(page);
319 SetPageUptodate(page);
320 unlock_page(page);
321 page_cache_release(page);
322 } 333 }
323 rc = 0; 334 req->r_pages = pages;
335 req->r_num_pages = nr_pages;
336 req->r_callback = finish_read;
337 req->r_inode = inode;
338
339 dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
340 ret = ceph_osdc_start_request(osdc, req, false);
341 if (ret < 0)
342 goto out_pages;
343 ceph_osdc_put_request(req);
344 return nr_pages;
324 345
346out_pages:
347 ceph_release_page_vector(pages, nr_pages);
348out:
349 ceph_osdc_put_request(req);
350 return ret;
351}
352
353
354/*
355 * Read multiple pages. Leave pages we don't read + unlock in page_list;
356 * the caller (VM) cleans them up.
357 */
358static int ceph_readpages(struct file *file, struct address_space *mapping,
359 struct list_head *page_list, unsigned nr_pages)
360{
361 struct inode *inode = file->f_dentry->d_inode;
362 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
363 int rc = 0;
364 int max = 0;
365
366 if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
367 max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
368 >> PAGE_SHIFT;
369
370 dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages,
371 max);
372 while (!list_empty(page_list)) {
373 rc = start_read(inode, page_list, max);
374 if (rc < 0)
375 goto out;
376 BUG_ON(rc == 0);
377 }
325out: 378out:
326 kfree(pages); 379 dout("readpages %p file %p ret %d\n", inode, file, rc);
327 return rc; 380 return rc;
328} 381}
329 382
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 8d74ad7ba55..0f327c6c967 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -487,17 +487,15 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
487 ci->i_rdcache_gen++; 487 ci->i_rdcache_gen++;
488 488
489 /* 489 /*
490 * if we are newly issued FILE_SHARED, clear I_COMPLETE; we 490 * if we are newly issued FILE_SHARED, clear D_COMPLETE; we
491 * don't know what happened to this directory while we didn't 491 * don't know what happened to this directory while we didn't
492 * have the cap. 492 * have the cap.
493 */ 493 */
494 if ((issued & CEPH_CAP_FILE_SHARED) && 494 if ((issued & CEPH_CAP_FILE_SHARED) &&
495 (had & CEPH_CAP_FILE_SHARED) == 0) { 495 (had & CEPH_CAP_FILE_SHARED) == 0) {
496 ci->i_shared_gen++; 496 ci->i_shared_gen++;
497 if (S_ISDIR(ci->vfs_inode.i_mode)) { 497 if (S_ISDIR(ci->vfs_inode.i_mode))
498 dout(" marking %p NOT complete\n", &ci->vfs_inode); 498 ceph_dir_clear_complete(&ci->vfs_inode);
499 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
500 }
501 } 499 }
502} 500}
503 501
@@ -945,7 +943,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
945 seq, issue_seq, mseq, follows, size, max_size, 943 seq, issue_seq, mseq, follows, size, max_size,
946 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); 944 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
947 945
948 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS); 946 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false);
949 if (!msg) 947 if (!msg)
950 return -ENOMEM; 948 return -ENOMEM;
951 949
@@ -2363,7 +2361,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2363 } 2361 }
2364 2362
2365 if ((issued & CEPH_CAP_LINK_EXCL) == 0) 2363 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
2366 inode->i_nlink = le32_to_cpu(grant->nlink); 2364 set_nlink(inode, le32_to_cpu(grant->nlink));
2367 2365
2368 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { 2366 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
2369 int len = le32_to_cpu(grant->xattr_len); 2367 int len = le32_to_cpu(grant->xattr_len);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 382abc9a6a5..2abd0dfad7f 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -108,7 +108,7 @@ static unsigned fpos_off(loff_t p)
108 * falling back to a "normal" sync readdir if any dentries in the dir 108 * falling back to a "normal" sync readdir if any dentries in the dir
109 * are dropped. 109 * are dropped.
110 * 110 *
111 * I_COMPLETE tells indicates we have all dentries in the dir. It is 111 * D_COMPLETE tells indicates we have all dentries in the dir. It is
112 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by 112 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
113 * the MDS if/when the directory is modified). 113 * the MDS if/when the directory is modified).
114 */ 114 */
@@ -199,8 +199,8 @@ more:
199 filp->f_pos++; 199 filp->f_pos++;
200 200
201 /* make sure a dentry wasn't dropped while we didn't have parent lock */ 201 /* make sure a dentry wasn't dropped while we didn't have parent lock */
202 if (!ceph_i_test(dir, CEPH_I_COMPLETE)) { 202 if (!ceph_dir_test_complete(dir)) {
203 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); 203 dout(" lost D_COMPLETE on %p; falling back to mds\n", dir);
204 err = -EAGAIN; 204 err = -EAGAIN;
205 goto out; 205 goto out;
206 } 206 }
@@ -285,7 +285,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
285 if ((filp->f_pos == 2 || fi->dentry) && 285 if ((filp->f_pos == 2 || fi->dentry) &&
286 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && 286 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
287 ceph_snap(inode) != CEPH_SNAPDIR && 287 ceph_snap(inode) != CEPH_SNAPDIR &&
288 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 288 ceph_dir_test_complete(inode) &&
289 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 289 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
290 spin_unlock(&inode->i_lock); 290 spin_unlock(&inode->i_lock);
291 err = __dcache_readdir(filp, dirent, filldir); 291 err = __dcache_readdir(filp, dirent, filldir);
@@ -351,7 +351,7 @@ more:
351 351
352 if (!req->r_did_prepopulate) { 352 if (!req->r_did_prepopulate) {
353 dout("readdir !did_prepopulate"); 353 dout("readdir !did_prepopulate");
354 fi->dir_release_count--; /* preclude I_COMPLETE */ 354 fi->dir_release_count--; /* preclude D_COMPLETE */
355 } 355 }
356 356
357 /* note next offset and last dentry name */ 357 /* note next offset and last dentry name */
@@ -430,8 +430,7 @@ more:
430 */ 430 */
431 spin_lock(&inode->i_lock); 431 spin_lock(&inode->i_lock);
432 if (ci->i_release_count == fi->dir_release_count) { 432 if (ci->i_release_count == fi->dir_release_count) {
433 dout(" marking %p complete\n", inode); 433 ceph_dir_set_complete(inode);
434 /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
435 ci->i_max_offset = filp->f_pos; 434 ci->i_max_offset = filp->f_pos;
436 } 435 }
437 spin_unlock(&inode->i_lock); 436 spin_unlock(&inode->i_lock);
@@ -614,7 +613,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
614 fsc->mount_options->snapdir_name, 613 fsc->mount_options->snapdir_name,
615 dentry->d_name.len) && 614 dentry->d_name.len) &&
616 !is_root_ceph_dentry(dir, dentry) && 615 !is_root_ceph_dentry(dir, dentry) &&
617 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 616 ceph_dir_test_complete(dir) &&
618 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { 617 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
619 spin_unlock(&dir->i_lock); 618 spin_unlock(&dir->i_lock);
620 dout(" dir %p complete, -ENOENT\n", dir); 619 dout(" dir %p complete, -ENOENT\n", dir);
@@ -934,7 +933,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
934 */ 933 */
935 934
936 /* d_move screws up d_subdirs order */ 935 /* d_move screws up d_subdirs order */
937 ceph_i_clear(new_dir, CEPH_I_COMPLETE); 936 ceph_dir_clear_complete(new_dir);
938 937
939 d_move(old_dentry, new_dentry); 938 d_move(old_dentry, new_dentry);
940 939
@@ -1092,7 +1091,75 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry,
1092 return 1; 1091 return 1;
1093} 1092}
1094 1093
1094/*
1095 * Set/clear/test dir complete flag on the dir's dentry.
1096 */
1097static struct dentry * __d_find_any_alias(struct inode *inode)
1098{
1099 struct dentry *alias;
1100
1101 if (list_empty(&inode->i_dentry))
1102 return NULL;
1103 alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias);
1104 return alias;
1105}
1106
1107void ceph_dir_set_complete(struct inode *inode)
1108{
1109 struct dentry *dentry = __d_find_any_alias(inode);
1110
1111 if (dentry && ceph_dentry(dentry)) {
1112 dout(" marking %p (%p) complete\n", inode, dentry);
1113 set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1114 }
1115}
1116
1117void ceph_dir_clear_complete(struct inode *inode)
1118{
1119 struct dentry *dentry = __d_find_any_alias(inode);
1120
1121 if (dentry && ceph_dentry(dentry)) {
1122 dout(" marking %p (%p) NOT complete\n", inode, dentry);
1123 clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1124 }
1125}
1126
1127bool ceph_dir_test_complete(struct inode *inode)
1128{
1129 struct dentry *dentry = __d_find_any_alias(inode);
1130
1131 if (dentry && ceph_dentry(dentry))
1132 return test_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1133 return false;
1134}
1135
1136/*
1137 * When the VFS prunes a dentry from the cache, we need to clear the
1138 * complete flag on the parent directory.
1139 *
1140 * Called under dentry->d_lock.
1141 */
1142static void ceph_d_prune(struct dentry *dentry)
1143{
1144 struct ceph_dentry_info *di;
1145
1146 dout("d_release %p\n", dentry);
1147
1148 /* do we have a valid parent? */
1149 if (!dentry->d_parent || IS_ROOT(dentry))
1150 return;
1095 1151
1152 /* if we are not hashed, we don't affect D_COMPLETE */
1153 if (d_unhashed(dentry))
1154 return;
1155
1156 /*
1157 * we hold d_lock, so d_parent is stable, and d_fsdata is never
1158 * cleared until d_release
1159 */
1160 di = ceph_dentry(dentry->d_parent);
1161 clear_bit(CEPH_D_COMPLETE, &di->flags);
1162}
1096 1163
1097/* 1164/*
1098 * read() on a dir. This weird interface hack only works if mounted 1165 * read() on a dir. This weird interface hack only works if mounted
@@ -1306,6 +1373,7 @@ const struct inode_operations ceph_dir_iops = {
1306const struct dentry_operations ceph_dentry_ops = { 1373const struct dentry_operations ceph_dentry_ops = {
1307 .d_revalidate = ceph_d_revalidate, 1374 .d_revalidate = ceph_d_revalidate,
1308 .d_release = ceph_d_release, 1375 .d_release = ceph_d_release,
1376 .d_prune = ceph_d_prune,
1309}; 1377};
1310 1378
1311const struct dentry_operations ceph_snapdir_dentry_ops = { 1379const struct dentry_operations ceph_snapdir_dentry_ops = {
@@ -1315,4 +1383,5 @@ const struct dentry_operations ceph_snapdir_dentry_ops = {
1315 1383
1316const struct dentry_operations ceph_snap_dentry_ops = { 1384const struct dentry_operations ceph_snap_dentry_ops = {
1317 .d_release = ceph_d_release, 1385 .d_release = ceph_d_release,
1386 .d_prune = ceph_d_prune,
1318}; 1387};
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 095799ba9dd..e392bfce84a 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -9,7 +9,6 @@
9#include <linux/namei.h> 9#include <linux/namei.h>
10#include <linux/writeback.h> 10#include <linux/writeback.h>
11#include <linux/vmalloc.h> 11#include <linux/vmalloc.h>
12#include <linux/pagevec.h>
13 12
14#include "super.h" 13#include "super.h"
15#include "mds_client.h" 14#include "mds_client.h"
@@ -619,7 +618,7 @@ static int fill_inode(struct inode *inode,
619 } 618 }
620 619
621 if ((issued & CEPH_CAP_LINK_EXCL) == 0) 620 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
622 inode->i_nlink = le32_to_cpu(info->nlink); 621 set_nlink(inode, le32_to_cpu(info->nlink));
623 622
624 /* be careful with mtime, atime, size */ 623 /* be careful with mtime, atime, size */
625 ceph_decode_timespec(&atime, &info->atime); 624 ceph_decode_timespec(&atime, &info->atime);
@@ -772,9 +771,9 @@ no_change:
772 ceph_snap(inode) == CEPH_NOSNAP && 771 ceph_snap(inode) == CEPH_NOSNAP &&
773 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && 772 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
774 (issued & CEPH_CAP_FILE_EXCL) == 0 && 773 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
775 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { 774 !ceph_dir_test_complete(inode)) {
776 dout(" marking %p complete (empty)\n", inode); 775 dout(" marking %p complete (empty)\n", inode);
777 /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ 776 ceph_dir_set_complete(inode);
778 ci->i_max_offset = 2; 777 ci->i_max_offset = 2;
779 } 778 }
780 779
@@ -857,7 +856,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
857 di = ceph_dentry(dn); 856 di = ceph_dentry(dn);
858 857
859 spin_lock(&inode->i_lock); 858 spin_lock(&inode->i_lock);
860 if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) { 859 if (!ceph_dir_test_complete(inode)) {
861 spin_unlock(&inode->i_lock); 860 spin_unlock(&inode->i_lock);
862 return; 861 return;
863 } 862 }
@@ -1057,7 +1056,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1057 * d_move() puts the renamed dentry at the end of 1056 * d_move() puts the renamed dentry at the end of
1058 * d_subdirs. We need to assign it an appropriate 1057 * d_subdirs. We need to assign it an appropriate
1059 * directory offset so we can behave when holding 1058 * directory offset so we can behave when holding
1060 * I_COMPLETE. 1059 * D_COMPLETE.
1061 */ 1060 */
1062 ceph_set_dentry_offset(req->r_old_dentry); 1061 ceph_set_dentry_offset(req->r_old_dentry);
1063 dout("dn %p gets new offset %lld\n", req->r_old_dentry, 1062 dout("dn %p gets new offset %lld\n", req->r_old_dentry,
@@ -1364,49 +1363,6 @@ void ceph_queue_invalidate(struct inode *inode)
1364} 1363}
1365 1364
1366/* 1365/*
1367 * invalidate any pages that are not dirty or under writeback. this
1368 * includes pages that are clean and mapped.
1369 */
1370static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
1371{
1372 struct pagevec pvec;
1373 pgoff_t next = 0;
1374 int i;
1375
1376 pagevec_init(&pvec, 0);
1377 while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
1378 for (i = 0; i < pagevec_count(&pvec); i++) {
1379 struct page *page = pvec.pages[i];
1380 pgoff_t index;
1381 int skip_page =
1382 (PageDirty(page) || PageWriteback(page));
1383
1384 if (!skip_page)
1385 skip_page = !trylock_page(page);
1386
1387 /*
1388 * We really shouldn't be looking at the ->index of an
1389 * unlocked page. But we're not allowed to lock these
1390 * pages. So we rely upon nobody altering the ->index
1391 * of this (pinned-by-us) page.
1392 */
1393 index = page->index;
1394 if (index > next)
1395 next = index;
1396 next++;
1397
1398 if (skip_page)
1399 continue;
1400
1401 generic_error_remove_page(mapping, page);
1402 unlock_page(page);
1403 }
1404 pagevec_release(&pvec);
1405 cond_resched();
1406 }
1407}
1408
1409/*
1410 * Invalidate inode pages in a worker thread. (This can't be done 1366 * Invalidate inode pages in a worker thread. (This can't be done
1411 * in the message handler context.) 1367 * in the message handler context.)
1412 */ 1368 */
@@ -1429,7 +1385,7 @@ static void ceph_invalidate_work(struct work_struct *work)
1429 orig_gen = ci->i_rdcache_gen; 1385 orig_gen = ci->i_rdcache_gen;
1430 spin_unlock(&inode->i_lock); 1386 spin_unlock(&inode->i_lock);
1431 1387
1432 ceph_invalidate_nondirty_pages(inode->i_mapping); 1388 truncate_inode_pages(&inode->i_data, 0);
1433 1389
1434 spin_lock(&inode->i_lock); 1390 spin_lock(&inode->i_lock);
1435 if (orig_gen == ci->i_rdcache_gen && 1391 if (orig_gen == ci->i_rdcache_gen &&
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 3b256b50f7d..5a14c29cbba 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -42,17 +42,39 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
42 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 42 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
43 struct ceph_mds_request *req; 43 struct ceph_mds_request *req;
44 struct ceph_ioctl_layout l; 44 struct ceph_ioctl_layout l;
45 struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
46 struct ceph_ioctl_layout nl;
45 int err, i; 47 int err, i;
46 48
47 /* copy and validate */
48 if (copy_from_user(&l, arg, sizeof(l))) 49 if (copy_from_user(&l, arg, sizeof(l)))
49 return -EFAULT; 50 return -EFAULT;
50 51
51 if ((l.object_size & ~PAGE_MASK) || 52 /* validate changed params against current layout */
52 (l.stripe_unit & ~PAGE_MASK) || 53 err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
53 !l.stripe_unit || 54 if (!err) {
54 (l.object_size && 55 nl.stripe_unit = ceph_file_layout_su(ci->i_layout);
55 (unsigned)l.object_size % (unsigned)l.stripe_unit)) 56 nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
57 nl.object_size = ceph_file_layout_object_size(ci->i_layout);
58 nl.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
59 nl.preferred_osd =
60 (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
61 } else
62 return err;
63
64 if (l.stripe_count)
65 nl.stripe_count = l.stripe_count;
66 if (l.stripe_unit)
67 nl.stripe_unit = l.stripe_unit;
68 if (l.object_size)
69 nl.object_size = l.object_size;
70 if (l.data_pool)
71 nl.data_pool = l.data_pool;
72 if (l.preferred_osd)
73 nl.preferred_osd = l.preferred_osd;
74
75 if ((nl.object_size & ~PAGE_MASK) ||
76 (nl.stripe_unit & ~PAGE_MASK) ||
77 ((unsigned)nl.object_size % (unsigned)nl.stripe_unit))
56 return -EINVAL; 78 return -EINVAL;
57 79
58 /* make sure it's a valid data pool */ 80 /* make sure it's a valid data pool */
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 0c5167e4318..be4a6048733 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -6,7 +6,31 @@
6 6
7#define CEPH_IOCTL_MAGIC 0x97 7#define CEPH_IOCTL_MAGIC 0x97
8 8
9/* just use u64 to align sanely on all archs */ 9/*
10 * CEPH_IOC_GET_LAYOUT - get file layout or dir layout policy
11 * CEPH_IOC_SET_LAYOUT - set file layout
12 * CEPH_IOC_SET_LAYOUT_POLICY - set dir layout policy
13 *
14 * The file layout specifies how file data is striped over objects in
15 * the distributed object store, which object pool they belong to (if
16 * it differs from the default), and an optional 'preferred osd' to
17 * store them on.
18 *
19 * Files get a new layout based on the policy set on the containing
20 * directory or one of its ancestors. The GET_LAYOUT ioctl will let
21 * you examine the layout for a file or the policy on a directory.
22 *
23 * SET_LAYOUT will let you set a layout on a newly created file. This
24 * only works immediately after the file is created and before any
25 * data is written to it.
26 *
27 * SET_LAYOUT_POLICY will let you set a layout policy (default layout)
28 * on a directory that will apply to any new files created in that
29 * directory (or any child directory that doesn't specify a layout of
30 * its own).
31 */
32
33/* use u64 to align sanely on all archs */
10struct ceph_ioctl_layout { 34struct ceph_ioctl_layout {
11 __u64 stripe_unit, stripe_count, object_size; 35 __u64 stripe_unit, stripe_count, object_size;
12 __u64 data_pool; 36 __u64 data_pool;
@@ -21,6 +45,8 @@ struct ceph_ioctl_layout {
21 struct ceph_ioctl_layout) 45 struct ceph_ioctl_layout)
22 46
23/* 47/*
48 * CEPH_IOC_GET_DATALOC - get location of file data in the cluster
49 *
24 * Extract identity, address of the OSD and object storing a given 50 * Extract identity, address of the OSD and object storing a given
25 * file offset. 51 * file offset.
26 */ 52 */
@@ -39,7 +65,34 @@ struct ceph_ioctl_dataloc {
39#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \ 65#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
40 struct ceph_ioctl_dataloc) 66 struct ceph_ioctl_dataloc)
41 67
68/*
69 * CEPH_IOC_LAZYIO - relax consistency
70 *
71 * Normally Ceph switches to synchronous IO when multiple clients have
 72 * the file open (one or more for write). Reads and writes bypass the
73 * page cache and go directly to the OSD. Setting this flag on a file
74 * descriptor will allow buffered IO for this file in cases where the
75 * application knows it won't interfere with other nodes (or doesn't
76 * care).
77 */
42#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4) 78#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4)
79
80/*
81 * CEPH_IOC_SYNCIO - force synchronous IO
82 *
83 * This ioctl sets a file flag that forces the synchronous IO that
84 * bypasses the page cache, even if it is not necessary. This is
85 * essentially the opposite behavior of IOC_LAZYIO. This forces the
86 * same read/write path as a file opened by multiple clients when one
87 * or more of those clients is opened for write.
88 *
89 * Note that this type of sync IO takes a different path than a file
90 * opened with O_SYNC/D_SYNC (writes hit the page cache and are
91 * immediately flushed on page boundaries). It is very similar to
 92 * O_DIRECT (writes bypass the page cache) except that O_DIRECT writes
93 * are not copied (user page must remain stable) and O_DIRECT writes
94 * have alignment restrictions (on the buffer and file offset).
95 */
43#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5) 96#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5)
44 97
45#endif 98#endif
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 86c59e16ba7..264ab701154 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -619,7 +619,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
619 * 619 *
620 * Called under mdsc->mutex. 620 * Called under mdsc->mutex.
621 */ 621 */
622struct dentry *get_nonsnap_parent(struct dentry *dentry) 622static struct dentry *get_nonsnap_parent(struct dentry *dentry)
623{ 623{
624 /* 624 /*
625 * we don't need to worry about protecting the d_parent access 625 * we don't need to worry about protecting the d_parent access
@@ -764,7 +764,8 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
764 struct ceph_msg *msg; 764 struct ceph_msg *msg;
765 struct ceph_mds_session_head *h; 765 struct ceph_mds_session_head *h;
766 766
767 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS); 767 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
768 false);
768 if (!msg) { 769 if (!msg) {
769 pr_err("create_session_msg ENOMEM creating msg\n"); 770 pr_err("create_session_msg ENOMEM creating msg\n");
770 return NULL; 771 return NULL;
@@ -1240,7 +1241,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1240 while (session->s_num_cap_releases < session->s_nr_caps + extra) { 1241 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1241 spin_unlock(&session->s_cap_lock); 1242 spin_unlock(&session->s_cap_lock);
1242 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, 1243 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1243 GFP_NOFS); 1244 GFP_NOFS, false);
1244 if (!msg) 1245 if (!msg)
1245 goto out_unlocked; 1246 goto out_unlocked;
1246 dout("add_cap_releases %p msg %p now %d\n", session, msg, 1247 dout("add_cap_releases %p msg %p now %d\n", session, msg,
@@ -1652,7 +1653,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1652 if (req->r_old_dentry_drop) 1653 if (req->r_old_dentry_drop)
1653 len += req->r_old_dentry->d_name.len; 1654 len += req->r_old_dentry->d_name.len;
1654 1655
1655 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS); 1656 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false);
1656 if (!msg) { 1657 if (!msg) {
1657 msg = ERR_PTR(-ENOMEM); 1658 msg = ERR_PTR(-ENOMEM);
1658 goto out_free2; 1659 goto out_free2;
@@ -2001,7 +2002,7 @@ out:
2001} 2002}
2002 2003
2003/* 2004/*
2004 * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS 2005 * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS
2005 * namespace request. 2006 * namespace request.
2006 */ 2007 */
2007void ceph_invalidate_dir_request(struct ceph_mds_request *req) 2008void ceph_invalidate_dir_request(struct ceph_mds_request *req)
@@ -2009,9 +2010,9 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req)
2009 struct inode *inode = req->r_locked_dir; 2010 struct inode *inode = req->r_locked_dir;
2010 struct ceph_inode_info *ci = ceph_inode(inode); 2011 struct ceph_inode_info *ci = ceph_inode(inode);
2011 2012
2012 dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode); 2013 dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode);
2013 spin_lock(&inode->i_lock); 2014 spin_lock(&inode->i_lock);
2014 ci->i_ceph_flags &= ~CEPH_I_COMPLETE; 2015 ceph_dir_clear_complete(inode);
2015 ci->i_release_count++; 2016 ci->i_release_count++;
2016 spin_unlock(&inode->i_lock); 2017 spin_unlock(&inode->i_lock);
2017 2018
@@ -2518,7 +2519,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2518 goto fail_nopagelist; 2519 goto fail_nopagelist;
2519 ceph_pagelist_init(pagelist); 2520 ceph_pagelist_init(pagelist);
2520 2521
2521 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS); 2522 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);
2522 if (!reply) 2523 if (!reply)
2523 goto fail_nomsg; 2524 goto fail_nomsg;
2524 2525
@@ -2831,7 +2832,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2831 dnamelen = dentry->d_name.len; 2832 dnamelen = dentry->d_name.len;
2832 len += dnamelen; 2833 len += dnamelen;
2833 2834
2834 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS); 2835 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
2835 if (!msg) 2836 if (!msg)
2836 return; 2837 return;
2837 lease = msg->front.iov_base; 2838 lease = msg->front.iov_base;
@@ -3153,7 +3154,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3153/* 3154/*
3154 * true if all sessions are closed, or we force unmount 3155 * true if all sessions are closed, or we force unmount
3155 */ 3156 */
3156bool done_closing_sessions(struct ceph_mds_client *mdsc) 3157static bool done_closing_sessions(struct ceph_mds_client *mdsc)
3157{ 3158{
3158 int i, n = 0; 3159 int i, n = 0;
3159 3160
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 88bacaf385d..a90846fac75 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -114,6 +114,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
114enum { 114enum {
115 Opt_wsize, 115 Opt_wsize,
116 Opt_rsize, 116 Opt_rsize,
117 Opt_rasize,
117 Opt_caps_wanted_delay_min, 118 Opt_caps_wanted_delay_min,
118 Opt_caps_wanted_delay_max, 119 Opt_caps_wanted_delay_max,
119 Opt_cap_release_safety, 120 Opt_cap_release_safety,
@@ -136,6 +137,7 @@ enum {
136static match_table_t fsopt_tokens = { 137static match_table_t fsopt_tokens = {
137 {Opt_wsize, "wsize=%d"}, 138 {Opt_wsize, "wsize=%d"},
138 {Opt_rsize, "rsize=%d"}, 139 {Opt_rsize, "rsize=%d"},
140 {Opt_rasize, "rasize=%d"},
139 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, 141 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
140 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, 142 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
141 {Opt_cap_release_safety, "cap_release_safety=%d"}, 143 {Opt_cap_release_safety, "cap_release_safety=%d"},
@@ -196,6 +198,9 @@ static int parse_fsopt_token(char *c, void *private)
196 case Opt_rsize: 198 case Opt_rsize:
197 fsopt->rsize = intval; 199 fsopt->rsize = intval;
198 break; 200 break;
201 case Opt_rasize:
202 fsopt->rasize = intval;
203 break;
199 case Opt_caps_wanted_delay_min: 204 case Opt_caps_wanted_delay_min:
200 fsopt->caps_wanted_delay_min = intval; 205 fsopt->caps_wanted_delay_min = intval;
201 break; 206 break;
@@ -289,28 +294,29 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
289 294
290 dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name); 295 dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
291 296
292 fsopt->sb_flags = flags; 297 fsopt->sb_flags = flags;
293 fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; 298 fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
294 299
295 fsopt->rsize = CEPH_RSIZE_DEFAULT; 300 fsopt->rsize = CEPH_RSIZE_DEFAULT;
296 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 301 fsopt->rasize = CEPH_RASIZE_DEFAULT;
302 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
297 fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; 303 fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
298 fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; 304 fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
299 fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; 305 fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
300 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; 306 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
301 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; 307 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
302 fsopt->congestion_kb = default_congestion_kb(); 308 fsopt->congestion_kb = default_congestion_kb();
303 309
304 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ 310 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
305 err = -EINVAL; 311 err = -EINVAL;
306 if (!dev_name) 312 if (!dev_name)
307 goto out; 313 goto out;
308 *path = strstr(dev_name, ":/"); 314 *path = strstr(dev_name, ":/");
309 if (*path == NULL) { 315 if (*path == NULL) {
310 pr_err("device name is missing path (no :/ in %s)\n", 316 pr_err("device name is missing path (no :/ in %s)\n",
311 dev_name); 317 dev_name);
312 goto out; 318 goto out;
313 } 319 }
314 dev_name_end = *path; 320 dev_name_end = *path;
315 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); 321 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
316 322
@@ -376,6 +382,8 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
376 seq_printf(m, ",wsize=%d", fsopt->wsize); 382 seq_printf(m, ",wsize=%d", fsopt->wsize);
377 if (fsopt->rsize != CEPH_RSIZE_DEFAULT) 383 if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
378 seq_printf(m, ",rsize=%d", fsopt->rsize); 384 seq_printf(m, ",rsize=%d", fsopt->rsize);
385 if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
386 seq_printf(m, ",rasize=%d", fsopt->rsize);
379 if (fsopt->congestion_kb != default_congestion_kb()) 387 if (fsopt->congestion_kb != default_congestion_kb())
380 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); 388 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
381 if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) 389 if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
@@ -418,24 +426,27 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
418/* 426/*
419 * create a new fs client 427 * create a new fs client
420 */ 428 */
421struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, 429static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
422 struct ceph_options *opt) 430 struct ceph_options *opt)
423{ 431{
424 struct ceph_fs_client *fsc; 432 struct ceph_fs_client *fsc;
433 const unsigned supported_features =
434 CEPH_FEATURE_FLOCK |
435 CEPH_FEATURE_DIRLAYOUTHASH;
436 const unsigned required_features = 0;
425 int err = -ENOMEM; 437 int err = -ENOMEM;
426 438
427 fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); 439 fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
428 if (!fsc) 440 if (!fsc)
429 return ERR_PTR(-ENOMEM); 441 return ERR_PTR(-ENOMEM);
430 442
431 fsc->client = ceph_create_client(opt, fsc); 443 fsc->client = ceph_create_client(opt, fsc, supported_features,
444 required_features);
432 if (IS_ERR(fsc->client)) { 445 if (IS_ERR(fsc->client)) {
433 err = PTR_ERR(fsc->client); 446 err = PTR_ERR(fsc->client);
434 goto fail; 447 goto fail;
435 } 448 }
436 fsc->client->extra_mon_dispatch = extra_mon_dispatch; 449 fsc->client->extra_mon_dispatch = extra_mon_dispatch;
437 fsc->client->supported_features |= CEPH_FEATURE_FLOCK |
438 CEPH_FEATURE_DIRLAYOUTHASH;
439 fsc->client->monc.want_mdsmap = 1; 450 fsc->client->monc.want_mdsmap = 1;
440 451
441 fsc->mount_options = fsopt; 452 fsc->mount_options = fsopt;
@@ -491,7 +502,7 @@ fail:
491 return ERR_PTR(err); 502 return ERR_PTR(err);
492} 503}
493 504
494void destroy_fs_client(struct ceph_fs_client *fsc) 505static void destroy_fs_client(struct ceph_fs_client *fsc)
495{ 506{
496 dout("destroy_fs_client %p\n", fsc); 507 dout("destroy_fs_client %p\n", fsc);
497 508
@@ -774,10 +785,10 @@ static int ceph_register_bdi(struct super_block *sb,
774{ 785{
775 int err; 786 int err;
776 787
777 /* set ra_pages based on rsize mount option? */ 788 /* set ra_pages based on rasize mount option? */
778 if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) 789 if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE)
779 fsc->backing_dev_info.ra_pages = 790 fsc->backing_dev_info.ra_pages =
780 (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) 791 (fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1)
781 >> PAGE_SHIFT; 792 >> PAGE_SHIFT;
782 else 793 else
783 fsc->backing_dev_info.ra_pages = 794 fsc->backing_dev_info.ra_pages =
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index a23eed526f0..01bf189e08a 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -36,7 +36,8 @@
36#define ceph_test_mount_opt(fsc, opt) \ 36#define ceph_test_mount_opt(fsc, opt) \
37 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) 37 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
38 38
39#define CEPH_RSIZE_DEFAULT (512*1024) /* readahead */ 39#define CEPH_RSIZE_DEFAULT 0 /* max read size */
40#define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */
40#define CEPH_MAX_READDIR_DEFAULT 1024 41#define CEPH_MAX_READDIR_DEFAULT 1024
41#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) 42#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
42#define CEPH_SNAPDIRNAME_DEFAULT ".snap" 43#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
@@ -45,8 +46,9 @@ struct ceph_mount_options {
45 int flags; 46 int flags;
46 int sb_flags; 47 int sb_flags;
47 48
48 int wsize; 49 int wsize; /* max write size */
49 int rsize; /* max readahead */ 50 int rsize; /* max read size */
51 int rasize; /* max readahead */
50 int congestion_kb; /* max writeback in flight */ 52 int congestion_kb; /* max writeback in flight */
51 int caps_wanted_delay_min, caps_wanted_delay_max; 53 int caps_wanted_delay_min, caps_wanted_delay_max;
52 int cap_release_safety; 54 int cap_release_safety;
@@ -201,6 +203,7 @@ struct ceph_inode_xattr {
201 * Ceph dentry state 203 * Ceph dentry state
202 */ 204 */
203struct ceph_dentry_info { 205struct ceph_dentry_info {
206 unsigned long flags;
204 struct ceph_mds_session *lease_session; 207 struct ceph_mds_session *lease_session;
205 u32 lease_gen, lease_shared_gen; 208 u32 lease_gen, lease_shared_gen;
206 u32 lease_seq; 209 u32 lease_seq;
@@ -211,6 +214,18 @@ struct ceph_dentry_info {
211 u64 offset; 214 u64 offset;
212}; 215};
213 216
217/*
218 * dentry flags
219 *
220 * The locking for D_COMPLETE is a bit odd:
221 * - we can clear it at almost any time (see ceph_d_prune)
222 * - it is only meaningful if:
223 * - we hold dir inode i_lock
224 * - we hold dir FILE_SHARED caps
225 * - the dentry D_COMPLETE is set
226 */
227#define CEPH_D_COMPLETE 1 /* if set, d_u.d_subdirs is complete directory */
228
214struct ceph_inode_xattrs_info { 229struct ceph_inode_xattrs_info {
215 /* 230 /*
216 * (still encoded) xattr blob. we avoid the overhead of parsing 231 * (still encoded) xattr blob. we avoid the overhead of parsing
@@ -249,7 +264,7 @@ struct ceph_inode_info {
249 struct timespec i_rctime; 264 struct timespec i_rctime;
250 u64 i_rbytes, i_rfiles, i_rsubdirs; 265 u64 i_rbytes, i_rfiles, i_rsubdirs;
251 u64 i_files, i_subdirs; 266 u64 i_files, i_subdirs;
252 u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */ 267 u64 i_max_offset; /* largest readdir offset, set with D_COMPLETE */
253 268
254 struct rb_root i_fragtree; 269 struct rb_root i_fragtree;
255 struct mutex i_fragtree_mutex; 270 struct mutex i_fragtree_mutex;
@@ -344,9 +359,10 @@ static inline struct ceph_vino ceph_vino(struct inode *inode)
344 * x86_64+ino32 64 32 359 * x86_64+ino32 64 32
345 * x86_64 64 64 360 * x86_64 64 64
346 */ 361 */
347static inline u32 ceph_ino_to_ino32(ino_t ino) 362static inline u32 ceph_ino_to_ino32(__u64 vino)
348{ 363{
349 ino ^= ino >> (sizeof(ino) * 8 - 32); 364 u32 ino = vino & 0xffffffff;
365 ino ^= vino >> 32;
350 if (!ino) 366 if (!ino)
351 ino = 1; 367 ino = 1;
352 return ino; 368 return ino;
@@ -357,11 +373,11 @@ static inline u32 ceph_ino_to_ino32(ino_t ino)
357 */ 373 */
358static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) 374static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
359{ 375{
360 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
361#if BITS_PER_LONG == 32 376#if BITS_PER_LONG == 32
362 ino = ceph_ino_to_ino32(ino); 377 return ceph_ino_to_ino32(vino.ino);
378#else
379 return (ino_t)vino.ino;
363#endif 380#endif
364 return ino;
365} 381}
366 382
367/* 383/*
@@ -413,7 +429,6 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
413/* 429/*
414 * Ceph inode. 430 * Ceph inode.
415 */ 431 */
416#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
417#define CEPH_I_NODELAY 4 /* do not delay cap release */ 432#define CEPH_I_NODELAY 4 /* do not delay cap release */
418#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ 433#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
419#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ 434#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
@@ -471,6 +486,13 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
471} 486}
472 487
473/* 488/*
489 * set/clear directory D_COMPLETE flag
490 */
491void ceph_dir_set_complete(struct inode *inode);
492void ceph_dir_clear_complete(struct inode *inode);
493bool ceph_dir_test_complete(struct inode *inode);
494
495/*
474 * caps helpers 496 * caps helpers
475 */ 497 */
476static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) 498static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
diff --git a/fs/cifs/README b/fs/cifs/README
index c5c2c5e5f0f..895da1dc155 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -745,4 +745,18 @@ installed and something like the following lines should be added to the
745create cifs.spnego * * /usr/local/sbin/cifs.upcall %k 745create cifs.spnego * * /usr/local/sbin/cifs.upcall %k
746create dns_resolver * * /usr/local/sbin/cifs.upcall %k 746create dns_resolver * * /usr/local/sbin/cifs.upcall %k
747 747
748CIFS kernel module parameters
749=============================
750These module parameters can be specified or modified either during the time of
751module loading or during the runtime by using the interface
752 /proc/module/cifs/parameters/<param>
753
754i.e. echo "value" > /sys/module/cifs/parameters/<param>
755
7561. echo_retries - The number of echo attempts before giving up and
757 reconnecting to the server. The default is 5. The value 0
758 means never reconnect.
759
7602. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default.
761 [Y/y/1]. To disable use any of [N/n/0].
748 762
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 6d40656e1e2..84e8c072470 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -511,7 +511,7 @@ static const struct file_operations cifsFYI_proc_fops = {
511 511
512static int cifs_oplock_proc_show(struct seq_file *m, void *v) 512static int cifs_oplock_proc_show(struct seq_file *m, void *v)
513{ 513{
514 seq_printf(m, "%d\n", oplockEnabled); 514 seq_printf(m, "%d\n", enable_oplocks);
515 return 0; 515 return 0;
516} 516}
517 517
@@ -526,13 +526,16 @@ static ssize_t cifs_oplock_proc_write(struct file *file,
526 char c; 526 char c;
527 int rc; 527 int rc;
528 528
529 printk(KERN_WARNING "CIFS: The /proc/fs/cifs/OplockEnabled interface "
530 "will be removed in kernel version 3.4. Please migrate to "
531 "using the 'enable_oplocks' module parameter in cifs.ko.\n");
529 rc = get_user(c, buffer); 532 rc = get_user(c, buffer);
530 if (rc) 533 if (rc)
531 return rc; 534 return rc;
532 if (c == '0' || c == 'n' || c == 'N') 535 if (c == '0' || c == 'n' || c == 'N')
533 oplockEnabled = 0; 536 enable_oplocks = false;
534 else if (c == '1' || c == 'y' || c == 'Y') 537 else if (c == '1' || c == 'y' || c == 'Y')
535 oplockEnabled = 1; 538 enable_oplocks = true;
536 539
537 return count; 540 return count;
538} 541}
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 7260e11e21f..500d6585927 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -43,6 +43,8 @@
43#define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */ 43#define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */
44#define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */ 44#define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */
45#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */ 45#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */
46#define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */
47#define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */
46 48
47struct cifs_sb_info { 49struct cifs_sb_info {
48 struct rb_root tlink_tree; 50 struct rb_root tlink_tree;
@@ -55,6 +57,8 @@ struct cifs_sb_info {
55 atomic_t active; 57 atomic_t active;
56 uid_t mnt_uid; 58 uid_t mnt_uid;
57 gid_t mnt_gid; 59 gid_t mnt_gid;
60 uid_t mnt_backupuid;
61 gid_t mnt_backupgid;
58 mode_t mnt_file_mode; 62 mode_t mnt_file_mode;
59 mode_t mnt_dir_mode; 63 mode_t mnt_dir_mode;
60 unsigned int mnt_cifs_flags; 64 unsigned int mnt_cifs_flags;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index d0f59faefb7..72ddf23ef6f 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -91,9 +91,76 @@ cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc)
91 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); 91 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
92 spin_unlock(&sidgidlock); 92 spin_unlock(&sidgidlock);
93 93
94 root = &siduidtree;
95 spin_lock(&uidsidlock);
96 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
97 spin_unlock(&uidsidlock);
98
99 root = &sidgidtree;
100 spin_lock(&gidsidlock);
101 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
102 spin_unlock(&gidsidlock);
103
94 return nr_rem; 104 return nr_rem;
95} 105}
96 106
107static void
108sid_rb_insert(struct rb_root *root, unsigned long cid,
109 struct cifs_sid_id **psidid, char *typestr)
110{
111 char *strptr;
112 struct rb_node *node = root->rb_node;
113 struct rb_node *parent = NULL;
114 struct rb_node **linkto = &(root->rb_node);
115 struct cifs_sid_id *lsidid;
116
117 while (node) {
118 lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
119 parent = node;
120 if (cid > lsidid->id) {
121 linkto = &(node->rb_left);
122 node = node->rb_left;
123 }
124 if (cid < lsidid->id) {
125 linkto = &(node->rb_right);
126 node = node->rb_right;
127 }
128 }
129
130 (*psidid)->id = cid;
131 (*psidid)->time = jiffies - (SID_MAP_RETRY + 1);
132 (*psidid)->refcount = 0;
133
134 sprintf((*psidid)->sidstr, "%s", typestr);
135 strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr);
136 sprintf(strptr, "%ld", cid);
137
138 clear_bit(SID_ID_PENDING, &(*psidid)->state);
139 clear_bit(SID_ID_MAPPED, &(*psidid)->state);
140
141 rb_link_node(&(*psidid)->rbnode, parent, linkto);
142 rb_insert_color(&(*psidid)->rbnode, root);
143}
144
145static struct cifs_sid_id *
146sid_rb_search(struct rb_root *root, unsigned long cid)
147{
148 struct rb_node *node = root->rb_node;
149 struct cifs_sid_id *lsidid;
150
151 while (node) {
152 lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
153 if (cid > lsidid->id)
154 node = node->rb_left;
155 else if (cid < lsidid->id)
156 node = node->rb_right;
157 else /* node found */
158 return lsidid;
159 }
160
161 return NULL;
162}
163
97static struct shrinker cifs_shrinker = { 164static struct shrinker cifs_shrinker = {
98 .shrink = cifs_idmap_shrinker, 165 .shrink = cifs_idmap_shrinker,
99 .seeks = DEFAULT_SEEKS, 166 .seeks = DEFAULT_SEEKS,
@@ -110,6 +177,7 @@ cifs_idmap_key_instantiate(struct key *key, const void *data, size_t datalen)
110 177
111 memcpy(payload, data, datalen); 178 memcpy(payload, data, datalen);
112 key->payload.data = payload; 179 key->payload.data = payload;
180 key->datalen = datalen;
113 return 0; 181 return 0;
114} 182}
115 183
@@ -224,6 +292,120 @@ sidid_pending_wait(void *unused)
224} 292}
225 293
226static int 294static int
295id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid)
296{
297 int rc = 0;
298 struct key *sidkey;
299 const struct cred *saved_cred;
300 struct cifs_sid *lsid;
301 struct cifs_sid_id *psidid, *npsidid;
302 struct rb_root *cidtree;
303 spinlock_t *cidlock;
304
305 if (sidtype == SIDOWNER) {
306 cidlock = &siduidlock;
307 cidtree = &uidtree;
308 } else if (sidtype == SIDGROUP) {
309 cidlock = &sidgidlock;
310 cidtree = &gidtree;
311 } else
312 return -EINVAL;
313
314 spin_lock(cidlock);
315 psidid = sid_rb_search(cidtree, cid);
316
317 if (!psidid) { /* node does not exist, allocate one & attempt adding */
318 spin_unlock(cidlock);
319 npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL);
320 if (!npsidid)
321 return -ENOMEM;
322
323 npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL);
324 if (!npsidid->sidstr) {
325 kfree(npsidid);
326 return -ENOMEM;
327 }
328
329 spin_lock(cidlock);
330 psidid = sid_rb_search(cidtree, cid);
331 if (psidid) { /* node happened to get inserted meanwhile */
332 ++psidid->refcount;
333 spin_unlock(cidlock);
334 kfree(npsidid->sidstr);
335 kfree(npsidid);
336 } else {
337 psidid = npsidid;
338 sid_rb_insert(cidtree, cid, &psidid,
339 sidtype == SIDOWNER ? "oi:" : "gi:");
340 ++psidid->refcount;
341 spin_unlock(cidlock);
342 }
343 } else {
344 ++psidid->refcount;
345 spin_unlock(cidlock);
346 }
347
348 /*
349 * If we are here, it is safe to access psidid and its fields
350 * since a reference was taken earlier while holding the spinlock.
351 * A reference on the node is put without holding the spinlock
352 * and it is OK to do so in this case, shrinker will not erase
353 * this node until all references are put and we do not access
354 * any fields of the node after a reference is put .
355 */
356 if (test_bit(SID_ID_MAPPED, &psidid->state)) {
357 memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid));
358 psidid->time = jiffies; /* update ts for accessing */
359 goto id_sid_out;
360 }
361
362 if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) {
363 rc = -EINVAL;
364 goto id_sid_out;
365 }
366
367 if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) {
368 saved_cred = override_creds(root_cred);
369 sidkey = request_key(&cifs_idmap_key_type, psidid->sidstr, "");
370 if (IS_ERR(sidkey)) {
371 rc = -EINVAL;
372 cFYI(1, "%s: Can't map and id to a SID", __func__);
373 } else {
374 lsid = (struct cifs_sid *)sidkey->payload.data;
375 memcpy(&psidid->sid, lsid,
376 sidkey->datalen < sizeof(struct cifs_sid) ?
377 sidkey->datalen : sizeof(struct cifs_sid));
378 memcpy(ssid, &psidid->sid,
379 sidkey->datalen < sizeof(struct cifs_sid) ?
380 sidkey->datalen : sizeof(struct cifs_sid));
381 set_bit(SID_ID_MAPPED, &psidid->state);
382 key_put(sidkey);
383 kfree(psidid->sidstr);
384 }
385 psidid->time = jiffies; /* update ts for accessing */
386 revert_creds(saved_cred);
387 clear_bit(SID_ID_PENDING, &psidid->state);
388 wake_up_bit(&psidid->state, SID_ID_PENDING);
389 } else {
390 rc = wait_on_bit(&psidid->state, SID_ID_PENDING,
391 sidid_pending_wait, TASK_INTERRUPTIBLE);
392 if (rc) {
393 cFYI(1, "%s: sidid_pending_wait interrupted %d",
394 __func__, rc);
395 --psidid->refcount;
396 return rc;
397 }
398 if (test_bit(SID_ID_MAPPED, &psidid->state))
399 memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid));
400 else
401 rc = -EINVAL;
402 }
403id_sid_out:
404 --psidid->refcount;
405 return rc;
406}
407
408static int
227sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, 409sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
228 struct cifs_fattr *fattr, uint sidtype) 410 struct cifs_fattr *fattr, uint sidtype)
229{ 411{
@@ -383,6 +565,10 @@ init_cifs_idmap(void)
383 spin_lock_init(&sidgidlock); 565 spin_lock_init(&sidgidlock);
384 gidtree = RB_ROOT; 566 gidtree = RB_ROOT;
385 567
568 spin_lock_init(&uidsidlock);
569 siduidtree = RB_ROOT;
570 spin_lock_init(&gidsidlock);
571 sidgidtree = RB_ROOT;
386 register_shrinker(&cifs_shrinker); 572 register_shrinker(&cifs_shrinker);
387 573
388 cFYI(1, "cifs idmap keyring: %d\n", key_serial(keyring)); 574 cFYI(1, "cifs idmap keyring: %d\n", key_serial(keyring));
@@ -422,6 +608,18 @@ cifs_destroy_idmaptrees(void)
422 while ((node = rb_first(root))) 608 while ((node = rb_first(root)))
423 rb_erase(node, root); 609 rb_erase(node, root);
424 spin_unlock(&sidgidlock); 610 spin_unlock(&sidgidlock);
611
612 root = &siduidtree;
613 spin_lock(&uidsidlock);
614 while ((node = rb_first(root)))
615 rb_erase(node, root);
616 spin_unlock(&uidsidlock);
617
618 root = &sidgidtree;
619 spin_lock(&gidsidlock);
620 while ((node = rb_first(root)))
621 rb_erase(node, root);
622 spin_unlock(&gidsidlock);
425} 623}
426 624
427/* if the two SIDs (roughly equivalent to a UUID for a user or group) are 625/* if the two SIDs (roughly equivalent to a UUID for a user or group) are
@@ -706,7 +904,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
706 acl_size = sizeof(struct cifs_acl); 904 acl_size = sizeof(struct cifs_acl);
707 905
708 num_aces = le32_to_cpu(pdacl->num_aces); 906 num_aces = le32_to_cpu(pdacl->num_aces);
709 if (num_aces > 0) { 907 if (num_aces > 0) {
710 umode_t user_mask = S_IRWXU; 908 umode_t user_mask = S_IRWXU;
711 umode_t group_mask = S_IRWXG; 909 umode_t group_mask = S_IRWXG;
712 umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO; 910 umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO;
@@ -868,52 +1066,82 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
868 else 1066 else
869 cFYI(1, "no ACL"); /* BB grant all or default perms? */ 1067 cFYI(1, "no ACL"); /* BB grant all or default perms? */
870 1068
871/* cifscred->uid = owner_sid_ptr->rid;
872 cifscred->gid = group_sid_ptr->rid;
873 memcpy((void *)(&(cifscred->osid)), (void *)owner_sid_ptr,
874 sizeof(struct cifs_sid));
875 memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr,
876 sizeof(struct cifs_sid)); */
877
878 return rc; 1069 return rc;
879} 1070}
880 1071
881
882/* Convert permission bits from mode to equivalent CIFS ACL */ 1072/* Convert permission bits from mode to equivalent CIFS ACL */
883static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, 1073static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
884 struct inode *inode, __u64 nmode) 1074 __u32 secdesclen, __u64 nmode, uid_t uid, gid_t gid, int *aclflag)
885{ 1075{
886 int rc = 0; 1076 int rc = 0;
887 __u32 dacloffset; 1077 __u32 dacloffset;
888 __u32 ndacloffset; 1078 __u32 ndacloffset;
889 __u32 sidsoffset; 1079 __u32 sidsoffset;
890 struct cifs_sid *owner_sid_ptr, *group_sid_ptr; 1080 struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
1081 struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr;
891 struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */ 1082 struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */
892 struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */ 1083 struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */
893 1084
894 if ((inode == NULL) || (pntsd == NULL) || (pnntsd == NULL)) 1085 if (nmode != NO_CHANGE_64) { /* chmod */
895 return -EIO; 1086 owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
896
897 owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
898 le32_to_cpu(pntsd->osidoffset)); 1087 le32_to_cpu(pntsd->osidoffset));
899 group_sid_ptr = (struct cifs_sid *)((char *)pntsd + 1088 group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
900 le32_to_cpu(pntsd->gsidoffset)); 1089 le32_to_cpu(pntsd->gsidoffset));
901 1090 dacloffset = le32_to_cpu(pntsd->dacloffset);
902 dacloffset = le32_to_cpu(pntsd->dacloffset); 1091 dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
903 dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); 1092 ndacloffset = sizeof(struct cifs_ntsd);
904 1093 ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset);
905 ndacloffset = sizeof(struct cifs_ntsd); 1094 ndacl_ptr->revision = dacl_ptr->revision;
906 ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); 1095 ndacl_ptr->size = 0;
907 ndacl_ptr->revision = dacl_ptr->revision; 1096 ndacl_ptr->num_aces = 0;
908 ndacl_ptr->size = 0; 1097
909 ndacl_ptr->num_aces = 0; 1098 rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr,
910 1099 nmode);
911 rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, nmode); 1100 sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size);
912 1101 /* copy sec desc control portion & owner and group sids */
913 sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); 1102 copy_sec_desc(pntsd, pnntsd, sidsoffset);
914 1103 *aclflag = CIFS_ACL_DACL;
915 /* copy security descriptor control portion and owner and group sid */ 1104 } else {
916 copy_sec_desc(pntsd, pnntsd, sidsoffset); 1105 memcpy(pnntsd, pntsd, secdesclen);
1106 if (uid != NO_CHANGE_32) { /* chown */
1107 owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd +
1108 le32_to_cpu(pnntsd->osidoffset));
1109 nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid),
1110 GFP_KERNEL);
1111 if (!nowner_sid_ptr)
1112 return -ENOMEM;
1113 rc = id_to_sid(uid, SIDOWNER, nowner_sid_ptr);
1114 if (rc) {
1115 cFYI(1, "%s: Mapping error %d for owner id %d",
1116 __func__, rc, uid);
1117 kfree(nowner_sid_ptr);
1118 return rc;
1119 }
1120 memcpy(owner_sid_ptr, nowner_sid_ptr,
1121 sizeof(struct cifs_sid));
1122 kfree(nowner_sid_ptr);
1123 *aclflag = CIFS_ACL_OWNER;
1124 }
1125 if (gid != NO_CHANGE_32) { /* chgrp */
1126 group_sid_ptr = (struct cifs_sid *)((char *)pnntsd +
1127 le32_to_cpu(pnntsd->gsidoffset));
1128 ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid),
1129 GFP_KERNEL);
1130 if (!ngroup_sid_ptr)
1131 return -ENOMEM;
1132 rc = id_to_sid(gid, SIDGROUP, ngroup_sid_ptr);
1133 if (rc) {
1134 cFYI(1, "%s: Mapping error %d for group id %d",
1135 __func__, rc, gid);
1136 kfree(ngroup_sid_ptr);
1137 return rc;
1138 }
1139 memcpy(group_sid_ptr, ngroup_sid_ptr,
1140 sizeof(struct cifs_sid));
1141 kfree(ngroup_sid_ptr);
1142 *aclflag = CIFS_ACL_GROUP;
1143 }
1144 }
917 1145
918 return rc; 1146 return rc;
919} 1147}
@@ -945,7 +1173,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
945{ 1173{
946 struct cifs_ntsd *pntsd = NULL; 1174 struct cifs_ntsd *pntsd = NULL;
947 int oplock = 0; 1175 int oplock = 0;
948 int xid, rc; 1176 int xid, rc, create_options = 0;
949 __u16 fid; 1177 __u16 fid;
950 struct cifs_tcon *tcon; 1178 struct cifs_tcon *tcon;
951 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); 1179 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
@@ -956,9 +1184,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
956 tcon = tlink_tcon(tlink); 1184 tcon = tlink_tcon(tlink);
957 xid = GetXid(); 1185 xid = GetXid();
958 1186
959 rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0, 1187 if (backup_cred(cifs_sb))
960 &fid, &oplock, NULL, cifs_sb->local_nls, 1188 create_options |= CREATE_OPEN_BACKUP_INTENT;
961 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 1189
1190 rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL,
1191 create_options, &fid, &oplock, NULL, cifs_sb->local_nls,
1192 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
962 if (!rc) { 1193 if (!rc) {
963 rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen); 1194 rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
964 CIFSSMBClose(xid, tcon, fid); 1195 CIFSSMBClose(xid, tcon, fid);
@@ -991,13 +1222,15 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
991 return pntsd; 1222 return pntsd;
992} 1223}
993 1224
994static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, 1225 /* Set an ACL on the server */
995 struct cifs_ntsd *pnntsd, u32 acllen) 1226int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
1227 struct inode *inode, const char *path, int aclflag)
996{ 1228{
997 int oplock = 0; 1229 int oplock = 0;
998 int xid, rc; 1230 int xid, rc, access_flags, create_options = 0;
999 __u16 fid; 1231 __u16 fid;
1000 struct cifs_tcon *tcon; 1232 struct cifs_tcon *tcon;
1233 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1001 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); 1234 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
1002 1235
1003 if (IS_ERR(tlink)) 1236 if (IS_ERR(tlink))
@@ -1006,15 +1239,23 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
1006 tcon = tlink_tcon(tlink); 1239 tcon = tlink_tcon(tlink);
1007 xid = GetXid(); 1240 xid = GetXid();
1008 1241
1009 rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, 0, 1242 if (backup_cred(cifs_sb))
1010 &fid, &oplock, NULL, cifs_sb->local_nls, 1243 create_options |= CREATE_OPEN_BACKUP_INTENT;
1011 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 1244
1245 if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP)
1246 access_flags = WRITE_OWNER;
1247 else
1248 access_flags = WRITE_DAC;
1249
1250 rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, access_flags,
1251 create_options, &fid, &oplock, NULL, cifs_sb->local_nls,
1252 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
1012 if (rc) { 1253 if (rc) {
1013 cERROR(1, "Unable to open file to set ACL"); 1254 cERROR(1, "Unable to open file to set ACL");
1014 goto out; 1255 goto out;
1015 } 1256 }
1016 1257
1017 rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen); 1258 rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen, aclflag);
1018 cFYI(DBG2, "SetCIFSACL rc = %d", rc); 1259 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
1019 1260
1020 CIFSSMBClose(xid, tcon, fid); 1261 CIFSSMBClose(xid, tcon, fid);
@@ -1024,17 +1265,6 @@ out:
1024 return rc; 1265 return rc;
1025} 1266}
1026 1267
1027/* Set an ACL on the server */
1028int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
1029 struct inode *inode, const char *path)
1030{
1031 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1032
1033 cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
1034
1035 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
1036}
1037
1038/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ 1268/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
1039int 1269int
1040cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, 1270cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
@@ -1066,9 +1296,12 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
1066} 1296}
1067 1297
1068/* Convert mode bits to an ACL so we can update the ACL on the server */ 1298/* Convert mode bits to an ACL so we can update the ACL on the server */
1069int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode) 1299int
1300id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
1301 uid_t uid, gid_t gid)
1070{ 1302{
1071 int rc = 0; 1303 int rc = 0;
1304 int aclflag = CIFS_ACL_DACL; /* default flag to set */
1072 __u32 secdesclen = 0; 1305 __u32 secdesclen = 0;
1073 struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ 1306 struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
1074 struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ 1307 struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
@@ -1098,13 +1331,15 @@ int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode)
1098 return -ENOMEM; 1331 return -ENOMEM;
1099 } 1332 }
1100 1333
1101 rc = build_sec_desc(pntsd, pnntsd, inode, nmode); 1334 rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid,
1335 &aclflag);
1102 1336
1103 cFYI(DBG2, "build_sec_desc rc: %d", rc); 1337 cFYI(DBG2, "build_sec_desc rc: %d", rc);
1104 1338
1105 if (!rc) { 1339 if (!rc) {
1106 /* Set the security descriptor */ 1340 /* Set the security descriptor */
1107 rc = set_cifs_acl(pnntsd, secdesclen, inode, path); 1341 rc = set_cifs_acl(pnntsd, secdesclen, inode,
1342 path, aclflag);
1108 cFYI(DBG2, "set_cifs_acl rc: %d", rc); 1343 cFYI(DBG2, "set_cifs_acl rc: %d", rc);
1109 } 1344 }
1110 1345
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 30acd22147e..5d9b9acc5fc 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -37,83 +37,8 @@
37 * the sequence number before this function is called. Also, this function 37 * the sequence number before this function is called. Also, this function
38 * should be called with the server->srv_mutex held. 38 * should be called with the server->srv_mutex held.
39 */ 39 */
40static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, 40static int cifs_calc_signature(const struct kvec *iov, int n_vec,
41 struct TCP_Server_Info *server, char *signature) 41 struct TCP_Server_Info *server, char *signature)
42{
43 int rc;
44
45 if (cifs_pdu == NULL || signature == NULL || server == NULL)
46 return -EINVAL;
47
48 if (!server->secmech.sdescmd5) {
49 cERROR(1, "%s: Can't generate signature\n", __func__);
50 return -1;
51 }
52
53 rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
54 if (rc) {
55 cERROR(1, "%s: Could not init md5\n", __func__);
56 return rc;
57 }
58
59 rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
60 server->session_key.response, server->session_key.len);
61 if (rc) {
62 cERROR(1, "%s: Could not update with response\n", __func__);
63 return rc;
64 }
65
66 rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
67 cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length));
68 if (rc) {
69 cERROR(1, "%s: Could not update with payload\n", __func__);
70 return rc;
71 }
72
73 rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
74 if (rc)
75 cERROR(1, "%s: Could not generate md5 hash\n", __func__);
76
77 return rc;
78}
79
80/* must be called with server->srv_mutex held */
81int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
82 __u32 *pexpected_response_sequence_number)
83{
84 int rc = 0;
85 char smb_signature[20];
86
87 if ((cifs_pdu == NULL) || (server == NULL))
88 return -EINVAL;
89
90 if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) ||
91 server->tcpStatus == CifsNeedNegotiate)
92 return rc;
93
94 if (!server->session_estab) {
95 strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
96 return rc;
97 }
98
99 cifs_pdu->Signature.Sequence.SequenceNumber =
100 cpu_to_le32(server->sequence_number);
101 cifs_pdu->Signature.Sequence.Reserved = 0;
102
103 *pexpected_response_sequence_number = server->sequence_number++;
104 server->sequence_number++;
105
106 rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
107 if (rc)
108 memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
109 else
110 memcpy(cifs_pdu->Signature.SecuritySignature, smb_signature, 8);
111
112 return rc;
113}
114
115static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
116 struct TCP_Server_Info *server, char *signature)
117{ 42{
118 int i; 43 int i;
119 int rc; 44 int rc;
@@ -179,7 +104,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
179{ 104{
180 int rc = 0; 105 int rc = 0;
181 char smb_signature[20]; 106 char smb_signature[20];
182 struct smb_hdr *cifs_pdu = iov[0].iov_base; 107 struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base;
183 108
184 if ((cifs_pdu == NULL) || (server == NULL)) 109 if ((cifs_pdu == NULL) || (server == NULL))
185 return -EINVAL; 110 return -EINVAL;
@@ -189,7 +114,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
189 return rc; 114 return rc;
190 115
191 if (!server->session_estab) { 116 if (!server->session_estab) {
192 strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); 117 memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
193 return rc; 118 return rc;
194 } 119 }
195 120
@@ -200,7 +125,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
200 *pexpected_response_sequence_number = server->sequence_number++; 125 *pexpected_response_sequence_number = server->sequence_number++;
201 server->sequence_number++; 126 server->sequence_number++;
202 127
203 rc = cifs_calc_signature2(iov, n_vec, server, smb_signature); 128 rc = cifs_calc_signature(iov, n_vec, server, smb_signature);
204 if (rc) 129 if (rc)
205 memset(cifs_pdu->Signature.SecuritySignature, 0, 8); 130 memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
206 else 131 else
@@ -209,13 +134,27 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
209 return rc; 134 return rc;
210} 135}
211 136
212int cifs_verify_signature(struct smb_hdr *cifs_pdu, 137/* must be called with server->srv_mutex held */
138int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
139 __u32 *pexpected_response_sequence_number)
140{
141 struct kvec iov;
142
143 iov.iov_base = cifs_pdu;
144 iov.iov_len = be32_to_cpu(cifs_pdu->smb_buf_length) + 4;
145
146 return cifs_sign_smb2(&iov, 1, server,
147 pexpected_response_sequence_number);
148}
149
150int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov,
213 struct TCP_Server_Info *server, 151 struct TCP_Server_Info *server,
214 __u32 expected_sequence_number) 152 __u32 expected_sequence_number)
215{ 153{
216 unsigned int rc; 154 unsigned int rc;
217 char server_response_sig[8]; 155 char server_response_sig[8];
218 char what_we_think_sig_should_be[20]; 156 char what_we_think_sig_should_be[20];
157 struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base;
219 158
220 if (cifs_pdu == NULL || server == NULL) 159 if (cifs_pdu == NULL || server == NULL)
221 return -EINVAL; 160 return -EINVAL;
@@ -247,8 +186,8 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
247 cifs_pdu->Signature.Sequence.Reserved = 0; 186 cifs_pdu->Signature.Sequence.Reserved = 0;
248 187
249 mutex_lock(&server->srv_mutex); 188 mutex_lock(&server->srv_mutex);
250 rc = cifs_calculate_signature(cifs_pdu, server, 189 rc = cifs_calc_signature(iov, nr_iov, server,
251 what_we_think_sig_should_be); 190 what_we_think_sig_should_be);
252 mutex_unlock(&server->srv_mutex); 191 mutex_unlock(&server->srv_mutex);
253 192
254 if (rc) 193 if (rc)
@@ -265,7 +204,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
265} 204}
266 205
267/* first calculate 24 bytes ntlm response and then 16 byte session key */ 206/* first calculate 24 bytes ntlm response and then 16 byte session key */
268int setup_ntlm_response(struct cifs_ses *ses) 207int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp)
269{ 208{
270 int rc = 0; 209 int rc = 0;
271 unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; 210 unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
@@ -282,14 +221,14 @@ int setup_ntlm_response(struct cifs_ses *ses)
282 ses->auth_key.len = temp_len; 221 ses->auth_key.len = temp_len;
283 222
284 rc = SMBNTencrypt(ses->password, ses->server->cryptkey, 223 rc = SMBNTencrypt(ses->password, ses->server->cryptkey,
285 ses->auth_key.response + CIFS_SESS_KEY_SIZE); 224 ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp);
286 if (rc) { 225 if (rc) {
287 cFYI(1, "%s Can't generate NTLM response, error: %d", 226 cFYI(1, "%s Can't generate NTLM response, error: %d",
288 __func__, rc); 227 __func__, rc);
289 return rc; 228 return rc;
290 } 229 }
291 230
292 rc = E_md4hash(ses->password, temp_key); 231 rc = E_md4hash(ses->password, temp_key, nls_cp);
293 if (rc) { 232 if (rc) {
294 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); 233 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
295 return rc; 234 return rc;
@@ -465,7 +404,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
465 } 404 }
466 405
467 /* calculate md4 hash of password */ 406 /* calculate md4 hash of password */
468 E_md4hash(ses->password, nt_hash); 407 E_md4hash(ses->password, nt_hash, nls_cp);
469 408
470 rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, 409 rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
471 CIFS_NTHASH_SIZE); 410 CIFS_NTHASH_SIZE);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 54b8f1e7da9..8f1fe324162 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -53,7 +53,7 @@
53int cifsFYI = 0; 53int cifsFYI = 0;
54int cifsERROR = 1; 54int cifsERROR = 1;
55int traceSMB = 0; 55int traceSMB = 0;
56unsigned int oplockEnabled = 1; 56bool enable_oplocks = true;
57unsigned int linuxExtEnabled = 1; 57unsigned int linuxExtEnabled = 1;
58unsigned int lookupCacheEnabled = 1; 58unsigned int lookupCacheEnabled = 1;
59unsigned int multiuser_mount = 0; 59unsigned int multiuser_mount = 0;
@@ -74,7 +74,7 @@ module_param(cifs_min_small, int, 0);
74MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " 74MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
75 "Range: 2 to 256"); 75 "Range: 2 to 256");
76unsigned int cifs_max_pending = CIFS_MAX_REQ; 76unsigned int cifs_max_pending = CIFS_MAX_REQ;
77module_param(cifs_max_pending, int, 0); 77module_param(cifs_max_pending, int, 0444);
78MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " 78MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
79 "Default: 50 Range: 2 to 256"); 79 "Default: 50 Range: 2 to 256");
80unsigned short echo_retries = 5; 80unsigned short echo_retries = 5;
@@ -82,6 +82,10 @@ module_param(echo_retries, ushort, 0644);
82MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and " 82MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and "
83 "reconnecting server. Default: 5. 0 means " 83 "reconnecting server. Default: 5. 0 means "
84 "never reconnect."); 84 "never reconnect.");
85module_param(enable_oplocks, bool, 0644);
86MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:"
87 "y/Y/1");
88
85extern mempool_t *cifs_sm_req_poolp; 89extern mempool_t *cifs_sm_req_poolp;
86extern mempool_t *cifs_req_poolp; 90extern mempool_t *cifs_req_poolp;
87extern mempool_t *cifs_mid_poolp; 91extern mempool_t *cifs_mid_poolp;
@@ -132,12 +136,12 @@ cifs_read_super(struct super_block *sb)
132 else 136 else
133 sb->s_d_op = &cifs_dentry_ops; 137 sb->s_d_op = &cifs_dentry_ops;
134 138
135#ifdef CIFS_NFSD_EXPORT 139#ifdef CONFIG_CIFS_NFSD_EXPORT
136 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 140 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
137 cFYI(1, "export ops supported"); 141 cFYI(1, "export ops supported");
138 sb->s_export_op = &cifs_export_ops; 142 sb->s_export_op = &cifs_export_ops;
139 } 143 }
140#endif /* CIFS_NFSD_EXPORT */ 144#endif /* CONFIG_CIFS_NFSD_EXPORT */
141 145
142 return 0; 146 return 0;
143 147
@@ -432,6 +436,12 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
432 seq_printf(s, ",mfsymlinks"); 436 seq_printf(s, ",mfsymlinks");
433 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) 437 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)
434 seq_printf(s, ",fsc"); 438 seq_printf(s, ",fsc");
439 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)
440 seq_printf(s, ",nostrictsync");
441 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
442 seq_printf(s, ",noperm");
443 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
444 seq_printf(s, ",strictcache");
435 445
436 seq_printf(s, ",rsize=%d", cifs_sb->rsize); 446 seq_printf(s, ",rsize=%d", cifs_sb->rsize);
437 seq_printf(s, ",wsize=%d", cifs_sb->wsize); 447 seq_printf(s, ",wsize=%d", cifs_sb->wsize);
@@ -530,7 +540,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
530 char *full_path = NULL; 540 char *full_path = NULL;
531 char *s, *p; 541 char *s, *p;
532 char sep; 542 char sep;
533 int xid;
534 543
535 full_path = cifs_build_path_to_root(vol, cifs_sb, 544 full_path = cifs_build_path_to_root(vol, cifs_sb,
536 cifs_sb_master_tcon(cifs_sb)); 545 cifs_sb_master_tcon(cifs_sb));
@@ -539,7 +548,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
539 548
540 cFYI(1, "Get root dentry for %s", full_path); 549 cFYI(1, "Get root dentry for %s", full_path);
541 550
542 xid = GetXid();
543 sep = CIFS_DIR_SEP(cifs_sb); 551 sep = CIFS_DIR_SEP(cifs_sb);
544 dentry = dget(sb->s_root); 552 dentry = dget(sb->s_root);
545 p = s = full_path; 553 p = s = full_path;
@@ -570,7 +578,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
570 dput(dentry); 578 dput(dentry);
571 dentry = child; 579 dentry = child;
572 } while (!IS_ERR(dentry)); 580 } while (!IS_ERR(dentry));
573 _FreeXid(xid);
574 kfree(full_path); 581 kfree(full_path);
575 return dentry; 582 return dentry;
576} 583}
@@ -723,7 +730,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
723 if (rc < 0) 730 if (rc < 0)
724 return (loff_t)rc; 731 return (loff_t)rc;
725 } 732 }
726 return generic_file_llseek_unlocked(file, offset, origin); 733 return generic_file_llseek(file, offset, origin);
727} 734}
728 735
729static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) 736static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
@@ -942,7 +949,8 @@ cifs_init_once(void *inode)
942 struct cifsInodeInfo *cifsi = inode; 949 struct cifsInodeInfo *cifsi = inode;
943 950
944 inode_init_once(&cifsi->vfs_inode); 951 inode_init_once(&cifsi->vfs_inode);
945 INIT_LIST_HEAD(&cifsi->lockList); 952 INIT_LIST_HEAD(&cifsi->llist);
953 mutex_init(&cifsi->lock_mutex);
946} 954}
947 955
948static int 956static int
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 95da8027983..30ff56005d8 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -121,9 +121,9 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t);
121extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); 121extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
122extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); 122extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
123 123
124#ifdef CIFS_NFSD_EXPORT 124#ifdef CONFIG_CIFS_NFSD_EXPORT
125extern const struct export_operations cifs_export_ops; 125extern const struct export_operations cifs_export_ops;
126#endif /* CIFS_NFSD_EXPORT */ 126#endif /* CONFIG_CIFS_NFSD_EXPORT */
127 127
128#define CIFS_VERSION "1.75" 128#define CIFS_VERSION "1.76"
129#endif /* _CIFSFS_H */ 129#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 95dad9d14cf..8238aa13e01 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -167,6 +167,8 @@ struct smb_vol {
167 uid_t cred_uid; 167 uid_t cred_uid;
168 uid_t linux_uid; 168 uid_t linux_uid;
169 gid_t linux_gid; 169 gid_t linux_gid;
170 uid_t backupuid;
171 gid_t backupgid;
170 mode_t file_mode; 172 mode_t file_mode;
171 mode_t dir_mode; 173 mode_t dir_mode;
172 unsigned secFlg; 174 unsigned secFlg;
@@ -179,6 +181,8 @@ struct smb_vol {
179 bool noperm:1; 181 bool noperm:1;
180 bool no_psx_acl:1; /* set if posix acl support should be disabled */ 182 bool no_psx_acl:1; /* set if posix acl support should be disabled */
181 bool cifs_acl:1; 183 bool cifs_acl:1;
184 bool backupuid_specified; /* mount option backupuid is specified */
185 bool backupgid_specified; /* mount option backupgid is specified */
182 bool no_xattr:1; /* set if xattr (EA) support should be disabled*/ 186 bool no_xattr:1; /* set if xattr (EA) support should be disabled*/
183 bool server_ino:1; /* use inode numbers from server ie UniqueId */ 187 bool server_ino:1; /* use inode numbers from server ie UniqueId */
184 bool direct_io:1; 188 bool direct_io:1;
@@ -219,7 +223,8 @@ struct smb_vol {
219 CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \ 223 CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \
220 CIFS_MOUNT_NOPOSIXBRL | CIFS_MOUNT_NOSSYNC | \ 224 CIFS_MOUNT_NOPOSIXBRL | CIFS_MOUNT_NOSSYNC | \
221 CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \ 225 CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \
222 CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO) 226 CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \
227 CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID)
223 228
224#define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \ 229#define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \
225 MS_NODEV | MS_SYNCHRONOUS) 230 MS_NODEV | MS_SYNCHRONOUS)
@@ -286,7 +291,13 @@ struct TCP_Server_Info {
286 bool sec_kerberosu2u; /* supports U2U Kerberos */ 291 bool sec_kerberosu2u; /* supports U2U Kerberos */
287 bool sec_kerberos; /* supports plain Kerberos */ 292 bool sec_kerberos; /* supports plain Kerberos */
288 bool sec_mskerberos; /* supports legacy MS Kerberos */ 293 bool sec_mskerberos; /* supports legacy MS Kerberos */
294 bool large_buf; /* is current buffer large? */
289 struct delayed_work echo; /* echo ping workqueue job */ 295 struct delayed_work echo; /* echo ping workqueue job */
296 struct kvec *iov; /* reusable kvec array for receives */
297 unsigned int nr_iov; /* number of kvecs in array */
298 char *smallbuf; /* pointer to current "small" buffer */
299 char *bigbuf; /* pointer to current "big" buffer */
300 unsigned int total_read; /* total amount of data read in this pass */
290#ifdef CONFIG_CIFS_FSCACHE 301#ifdef CONFIG_CIFS_FSCACHE
291 struct fscache_cookie *fscache; /* client index cache cookie */ 302 struct fscache_cookie *fscache; /* client index cache cookie */
292#endif 303#endif
@@ -485,9 +496,13 @@ extern struct cifs_tcon *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb);
485 */ 496 */
486struct cifsLockInfo { 497struct cifsLockInfo {
487 struct list_head llist; /* pointer to next cifsLockInfo */ 498 struct list_head llist; /* pointer to next cifsLockInfo */
499 struct list_head blist; /* pointer to locks blocked on this */
500 wait_queue_head_t block_q;
488 __u64 offset; 501 __u64 offset;
489 __u64 length; 502 __u64 length;
503 __u32 pid;
490 __u8 type; 504 __u8 type;
505 __u16 netfid;
491}; 506};
492 507
493/* 508/*
@@ -520,8 +535,6 @@ struct cifsFileInfo {
520 struct dentry *dentry; 535 struct dentry *dentry;
521 unsigned int f_flags; 536 unsigned int f_flags;
522 struct tcon_link *tlink; 537 struct tcon_link *tlink;
523 struct mutex lock_mutex;
524 struct list_head llist; /* list of byte range locks we have. */
525 bool invalidHandle:1; /* file closed via session abend */ 538 bool invalidHandle:1; /* file closed via session abend */
526 bool oplock_break_cancelled:1; 539 bool oplock_break_cancelled:1;
527 int count; /* refcount protected by cifs_file_list_lock */ 540 int count; /* refcount protected by cifs_file_list_lock */
@@ -554,7 +567,9 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
554 */ 567 */
555 568
556struct cifsInodeInfo { 569struct cifsInodeInfo {
557 struct list_head lockList; 570 struct list_head llist; /* brlocks for this inode */
571 bool can_cache_brlcks;
572 struct mutex lock_mutex; /* protect two fields above */
558 /* BB add in lists for dirty pages i.e. write caching info for oplock */ 573 /* BB add in lists for dirty pages i.e. write caching info for oplock */
559 struct list_head openFileList; 574 struct list_head openFileList;
560 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ 575 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
@@ -643,8 +658,24 @@ static inline void cifs_stats_bytes_read(struct cifs_tcon *tcon,
643struct mid_q_entry; 658struct mid_q_entry;
644 659
645/* 660/*
646 * This is the prototype for the mid callback function. When creating one, 661 * This is the prototype for the mid receive function. This function is for
647 * take special care to avoid deadlocks. Things to bear in mind: 662 * receiving the rest of the SMB frame, starting with the WordCount (which is
663 * just after the MID in struct smb_hdr). Note:
664 *
665 * - This will be called by cifsd, with no locks held.
666 * - The mid will still be on the pending_mid_q.
667 * - mid->resp_buf will point to the current buffer.
668 *
669 * Returns zero on a successful receive, or an error. The receive state in
670 * the TCP_Server_Info will also be updated.
671 */
672typedef int (mid_receive_t)(struct TCP_Server_Info *server,
673 struct mid_q_entry *mid);
674
675/*
676 * This is the prototype for the mid callback function. This is called once the
677 * mid has been received off of the socket. When creating one, take special
678 * care to avoid deadlocks. Things to bear in mind:
648 * 679 *
649 * - it will be called by cifsd, with no locks held 680 * - it will be called by cifsd, with no locks held
650 * - the mid will be removed from any lists 681 * - the mid will be removed from any lists
@@ -662,9 +693,10 @@ struct mid_q_entry {
662 unsigned long when_sent; /* time when smb send finished */ 693 unsigned long when_sent; /* time when smb send finished */
663 unsigned long when_received; /* when demux complete (taken off wire) */ 694 unsigned long when_received; /* when demux complete (taken off wire) */
664#endif 695#endif
696 mid_receive_t *receive; /* call receive callback */
665 mid_callback_t *callback; /* call completion callback */ 697 mid_callback_t *callback; /* call completion callback */
666 void *callback_data; /* general purpose pointer for callback */ 698 void *callback_data; /* general purpose pointer for callback */
667 struct smb_hdr *resp_buf; /* response buffer */ 699 struct smb_hdr *resp_buf; /* pointer to received SMB header */
668 int midState; /* wish this were enum but can not pass to wait_event */ 700 int midState; /* wish this were enum but can not pass to wait_event */
669 __u8 command; /* smb command code */ 701 __u8 command; /* smb command code */
670 bool largeBuf:1; /* if valid response, is pointer to large buf */ 702 bool largeBuf:1; /* if valid response, is pointer to large buf */
@@ -964,7 +996,8 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions
964 to be established on existing mount if we 996 to be established on existing mount if we
965 have the uid/password or Kerberos credential 997 have the uid/password or Kerberos credential
966 or equivalent for current user */ 998 or equivalent for current user */
967GLOBAL_EXTERN unsigned int oplockEnabled; 999/* enable or disable oplocks */
1000GLOBAL_EXTERN bool enable_oplocks;
968GLOBAL_EXTERN unsigned int lookupCacheEnabled; 1001GLOBAL_EXTERN unsigned int lookupCacheEnabled;
969GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent 1002GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent
970 with more secure ntlmssp2 challenge/resp */ 1003 with more secure ntlmssp2 challenge/resp */
@@ -978,10 +1011,16 @@ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
978/* reconnect after this many failed echo attempts */ 1011/* reconnect after this many failed echo attempts */
979GLOBAL_EXTERN unsigned short echo_retries; 1012GLOBAL_EXTERN unsigned short echo_retries;
980 1013
1014#ifdef CONFIG_CIFS_ACL
981GLOBAL_EXTERN struct rb_root uidtree; 1015GLOBAL_EXTERN struct rb_root uidtree;
982GLOBAL_EXTERN struct rb_root gidtree; 1016GLOBAL_EXTERN struct rb_root gidtree;
983GLOBAL_EXTERN spinlock_t siduidlock; 1017GLOBAL_EXTERN spinlock_t siduidlock;
984GLOBAL_EXTERN spinlock_t sidgidlock; 1018GLOBAL_EXTERN spinlock_t sidgidlock;
1019GLOBAL_EXTERN struct rb_root siduidtree;
1020GLOBAL_EXTERN struct rb_root sidgidtree;
1021GLOBAL_EXTERN spinlock_t uidsidlock;
1022GLOBAL_EXTERN spinlock_t gidsidlock;
1023#endif /* CONFIG_CIFS_ACL */
985 1024
986void cifs_oplock_break(struct work_struct *work); 1025void cifs_oplock_break(struct work_struct *work);
987 1026
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index de3aa285de0..3fb03e2c8e8 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1089,9 +1089,7 @@ typedef struct smb_com_read_rsp {
1089 __le16 DataLengthHigh; 1089 __le16 DataLengthHigh;
1090 __u64 Reserved2; 1090 __u64 Reserved2;
1091 __u16 ByteCount; 1091 __u16 ByteCount;
1092 __u8 Pad; /* BB check for whether padded to DWORD 1092 /* read response data immediately follows */
1093 boundary and optimum performance here */
1094 char Data[1];
1095} __attribute__((packed)) READ_RSP; 1093} __attribute__((packed)) READ_RSP;
1096 1094
1097typedef struct locking_andx_range { 1095typedef struct locking_andx_range {
@@ -1913,6 +1911,10 @@ typedef struct whoami_rsp_data { /* Query level 0x202 */
1913 1911
1914/* SETFSInfo Levels */ 1912/* SETFSInfo Levels */
1915#define SMB_SET_CIFS_UNIX_INFO 0x200 1913#define SMB_SET_CIFS_UNIX_INFO 0x200
1914/* level 0x203 is defined above in list of QFS info levels */
1915/* #define SMB_REQUEST_TRANSPORT_ENCRYPTION 0x203 */
1916
1917/* Level 0x200 request structure follows */
1916typedef struct smb_com_transaction2_setfsi_req { 1918typedef struct smb_com_transaction2_setfsi_req {
1917 struct smb_hdr hdr; /* wct = 15 */ 1919 struct smb_hdr hdr; /* wct = 15 */
1918 __le16 TotalParameterCount; 1920 __le16 TotalParameterCount;
@@ -1940,13 +1942,39 @@ typedef struct smb_com_transaction2_setfsi_req {
1940 __le64 ClientUnixCap; /* Data end */ 1942 __le64 ClientUnixCap; /* Data end */
1941} __attribute__((packed)) TRANSACTION2_SETFSI_REQ; 1943} __attribute__((packed)) TRANSACTION2_SETFSI_REQ;
1942 1944
1945/* level 0x203 request structure follows */
1946typedef struct smb_com_transaction2_setfs_enc_req {
1947 struct smb_hdr hdr; /* wct = 15 */
1948 __le16 TotalParameterCount;
1949 __le16 TotalDataCount;
1950 __le16 MaxParameterCount;
1951 __le16 MaxDataCount;
1952 __u8 MaxSetupCount;
1953 __u8 Reserved;
1954 __le16 Flags;
1955 __le32 Timeout;
1956 __u16 Reserved2;
1957 __le16 ParameterCount; /* 4 */
1958 __le16 ParameterOffset;
1959 __le16 DataCount; /* 12 */
1960 __le16 DataOffset;
1961 __u8 SetupCount; /* one */
1962 __u8 Reserved3;
1963 __le16 SubCommand; /* TRANS2_SET_FS_INFORMATION */
1964 __le16 ByteCount;
1965 __u8 Pad;
1966 __u16 Reserved4; /* Parameters start. */
1967 __le16 InformationLevel;/* Parameters end. */
1968 /* NTLMSSP Blob, Data start. */
1969} __attribute__((packed)) TRANSACTION2_SETFSI_ENC_REQ;
1970
1971/* response for setfsinfo levels 0x200 and 0x203 */
1943typedef struct smb_com_transaction2_setfsi_rsp { 1972typedef struct smb_com_transaction2_setfsi_rsp {
1944 struct smb_hdr hdr; /* wct = 10 */ 1973 struct smb_hdr hdr; /* wct = 10 */
1945 struct trans2_resp t2; 1974 struct trans2_resp t2;
1946 __u16 ByteCount; 1975 __u16 ByteCount;
1947} __attribute__((packed)) TRANSACTION2_SETFSI_RSP; 1976} __attribute__((packed)) TRANSACTION2_SETFSI_RSP;
1948 1977
1949
1950typedef struct smb_com_transaction2_get_dfs_refer_req { 1978typedef struct smb_com_transaction2_get_dfs_refer_req {
1951 struct smb_hdr hdr; /* wct = 15 */ 1979 struct smb_hdr hdr; /* wct = 15 */
1952 __le16 TotalParameterCount; 1980 __le16 TotalParameterCount;
@@ -2098,13 +2126,13 @@ typedef struct {
2098#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and 2126#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and
2099 QFS PROXY call */ 2127 QFS PROXY call */
2100#ifdef CONFIG_CIFS_POSIX 2128#ifdef CONFIG_CIFS_POSIX
2101/* Can not set pathnames cap yet until we send new posix create SMB since 2129/* presumably don't need the 0x20 POSIX_PATH_OPS_CAP since we never send
2102 otherwise server can treat such handles opened with older ntcreatex 2130 LockingX instead of posix locking call on unix sess (and we do not expect
2103 (by a new client which knows how to send posix path ops) 2131 LockingX to use different (ie Windows) semantics than posix locking on
2104 as non-posix handles (can affect write behavior with byte range locks. 2132 the same session (if WINE needs to do this later, we can add this cap
2105 We can add back in POSIX_PATH_OPS cap when Posix Create/Mkdir finished */ 2133 back in later */
2106/* #define CIFS_UNIX_CAP_MASK 0x000000fb */ 2134/* #define CIFS_UNIX_CAP_MASK 0x000000fb */
2107#define CIFS_UNIX_CAP_MASK 0x000000db 2135#define CIFS_UNIX_CAP_MASK 0x000003db
2108#else 2136#else
2109#define CIFS_UNIX_CAP_MASK 0x00000013 2137#define CIFS_UNIX_CAP_MASK 0x00000013
2110#endif /* CONFIG_CIFS_POSIX */ 2138#endif /* CONFIG_CIFS_POSIX */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 8df28e925e5..6f4e243e0f6 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -69,8 +69,9 @@ extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
69 struct TCP_Server_Info *server); 69 struct TCP_Server_Info *server);
70extern void DeleteMidQEntry(struct mid_q_entry *midEntry); 70extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
71extern int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, 71extern int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
72 unsigned int nvec, mid_callback_t *callback, 72 unsigned int nvec, mid_receive_t *receive,
73 void *cbdata, bool ignore_pend); 73 mid_callback_t *callback, void *cbdata,
74 bool ignore_pend);
74extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *, 75extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *,
75 struct smb_hdr * /* input */ , 76 struct smb_hdr * /* input */ ,
76 struct smb_hdr * /* out */ , 77 struct smb_hdr * /* out */ ,
@@ -90,6 +91,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid,
90extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length); 91extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
91extern bool is_valid_oplock_break(struct smb_hdr *smb, 92extern bool is_valid_oplock_break(struct smb_hdr *smb,
92 struct TCP_Server_Info *); 93 struct TCP_Server_Info *);
94extern bool backup_cred(struct cifs_sb_info *);
93extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); 95extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
94extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, 96extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
95 unsigned int bytes_written); 97 unsigned int bytes_written);
@@ -145,12 +147,19 @@ extern int cifs_get_inode_info_unix(struct inode **pinode,
145extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, 147extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
146 struct cifs_fattr *fattr, struct inode *inode, 148 struct cifs_fattr *fattr, struct inode *inode,
147 const char *path, const __u16 *pfid); 149 const char *path, const __u16 *pfid);
148extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64); 150extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64,
151 uid_t, gid_t);
149extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, 152extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
150 const char *, u32 *); 153 const char *, u32 *);
151extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, 154extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
152 const char *); 155 const char *, int);
153 156
157extern void dequeue_mid(struct mid_q_entry *mid, bool malformed);
158extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
159 unsigned int to_read);
160extern int cifs_readv_from_socket(struct TCP_Server_Info *server,
161 struct kvec *iov_orig, unsigned int nr_segs,
162 unsigned int to_read);
154extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, 163extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
155 struct cifs_sb_info *cifs_sb); 164 struct cifs_sb_info *cifs_sb);
156extern int cifs_match_super(struct super_block *, void *); 165extern int cifs_match_super(struct super_block *, void *);
@@ -359,14 +368,17 @@ extern int CIFSGetSrvInodeNumber(const int xid, struct cifs_tcon *tcon,
359 const struct nls_table *nls_codepage, 368 const struct nls_table *nls_codepage,
360 int remap_special_chars); 369 int remap_special_chars);
361 370
371extern int cifs_lockv(const int xid, struct cifs_tcon *tcon, const __u16 netfid,
372 const __u8 lock_type, const __u32 num_unlock,
373 const __u32 num_lock, LOCKING_ANDX_RANGE *buf);
362extern int CIFSSMBLock(const int xid, struct cifs_tcon *tcon, 374extern int CIFSSMBLock(const int xid, struct cifs_tcon *tcon,
363 const __u16 netfid, const __u64 len, 375 const __u16 netfid, const __u32 netpid, const __u64 len,
364 const __u64 offset, const __u32 numUnlock, 376 const __u64 offset, const __u32 numUnlock,
365 const __u32 numLock, const __u8 lockType, 377 const __u32 numLock, const __u8 lockType,
366 const bool waitFlag, const __u8 oplock_level); 378 const bool waitFlag, const __u8 oplock_level);
367extern int CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, 379extern int CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon,
368 const __u16 smb_file_id, const int get_flag, 380 const __u16 smb_file_id, const __u32 netpid,
369 const __u64 len, struct file_lock *, 381 const int get_flag, const __u64 len, struct file_lock *,
370 const __u16 lock_type, const bool waitFlag); 382 const __u16 lock_type, const bool waitFlag);
371extern int CIFSSMBTDis(const int xid, struct cifs_tcon *tcon); 383extern int CIFSSMBTDis(const int xid, struct cifs_tcon *tcon);
372extern int CIFSSMBEcho(struct TCP_Server_Info *server); 384extern int CIFSSMBEcho(struct TCP_Server_Info *server);
@@ -380,11 +392,12 @@ extern void tconInfoFree(struct cifs_tcon *);
380extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *); 392extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
381extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, 393extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
382 __u32 *); 394 __u32 *);
383extern int cifs_verify_signature(struct smb_hdr *, 395extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov,
384 struct TCP_Server_Info *server, 396 struct TCP_Server_Info *server,
385 __u32 expected_sequence_number); 397 __u32 expected_sequence_number);
386extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); 398extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *,
387extern int setup_ntlm_response(struct cifs_ses *); 399 const struct nls_table *);
400extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *);
388extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); 401extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
389extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); 402extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
390extern void cifs_crypto_shash_release(struct TCP_Server_Info *); 403extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
@@ -419,7 +432,7 @@ extern int CIFSSMBSetEA(const int xid, struct cifs_tcon *tcon,
419extern int CIFSSMBGetCIFSACL(const int xid, struct cifs_tcon *tcon, 432extern int CIFSSMBGetCIFSACL(const int xid, struct cifs_tcon *tcon,
420 __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen); 433 __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen);
421extern int CIFSSMBSetCIFSACL(const int, struct cifs_tcon *, __u16, 434extern int CIFSSMBSetCIFSACL(const int, struct cifs_tcon *, __u16,
422 struct cifs_ntsd *, __u32); 435 struct cifs_ntsd *, __u32, int);
423extern int CIFSSMBGetPosixACL(const int xid, struct cifs_tcon *tcon, 436extern int CIFSSMBGetPosixACL(const int xid, struct cifs_tcon *tcon,
424 const unsigned char *searchName, 437 const unsigned char *searchName,
425 char *acl_inf, const int buflen, const int acl_type, 438 char *acl_inf, const int buflen, const int acl_type,
@@ -436,10 +449,29 @@ extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
436 const unsigned char *path, 449 const unsigned char *path,
437 struct cifs_sb_info *cifs_sb, int xid); 450 struct cifs_sb_info *cifs_sb, int xid);
438extern int mdfour(unsigned char *, unsigned char *, int); 451extern int mdfour(unsigned char *, unsigned char *, int);
439extern int E_md4hash(const unsigned char *passwd, unsigned char *p16); 452extern int E_md4hash(const unsigned char *passwd, unsigned char *p16,
453 const struct nls_table *codepage);
440extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, 454extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
441 unsigned char *p24); 455 unsigned char *p24);
442 456
457/* asynchronous read support */
458struct cifs_readdata {
459 struct cifsFileInfo *cfile;
460 struct address_space *mapping;
461 __u64 offset;
462 unsigned int bytes;
463 pid_t pid;
464 int result;
465 struct list_head pages;
466 struct work_struct work;
467 unsigned int nr_iov;
468 struct kvec iov[1];
469};
470
471struct cifs_readdata *cifs_readdata_alloc(unsigned int nr_pages);
472void cifs_readdata_free(struct cifs_readdata *rdata);
473int cifs_async_readv(struct cifs_readdata *rdata);
474
443/* asynchronous write support */ 475/* asynchronous write support */
444struct cifs_writedata { 476struct cifs_writedata {
445 struct kref refcount; 477 struct kref refcount;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index a80f7bd97b9..6600aa2d2ef 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -33,6 +33,8 @@
33#include <linux/slab.h> 33#include <linux/slab.h>
34#include <linux/posix_acl_xattr.h> 34#include <linux/posix_acl_xattr.h>
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/swap.h>
37#include <linux/task_io_accounting_ops.h>
36#include <asm/uaccess.h> 38#include <asm/uaccess.h>
37#include "cifspdu.h" 39#include "cifspdu.h"
38#include "cifsglob.h" 40#include "cifsglob.h"
@@ -40,6 +42,7 @@
40#include "cifsproto.h" 42#include "cifsproto.h"
41#include "cifs_unicode.h" 43#include "cifs_unicode.h"
42#include "cifs_debug.h" 44#include "cifs_debug.h"
45#include "fscache.h"
43 46
44#ifdef CONFIG_CIFS_POSIX 47#ifdef CONFIG_CIFS_POSIX
45static struct { 48static struct {
@@ -83,6 +86,9 @@ static struct {
83#endif /* CONFIG_CIFS_WEAK_PW_HASH */ 86#endif /* CONFIG_CIFS_WEAK_PW_HASH */
84#endif /* CIFS_POSIX */ 87#endif /* CIFS_POSIX */
85 88
89/* Forward declarations */
90static void cifs_readv_complete(struct work_struct *work);
91
86/* Mark as invalid, all open files on tree connections since they 92/* Mark as invalid, all open files on tree connections since they
87 were closed when session to server was lost */ 93 were closed when session to server was lost */
88static void mark_open_files_invalid(struct cifs_tcon *pTcon) 94static void mark_open_files_invalid(struct cifs_tcon *pTcon)
@@ -453,8 +459,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
453 } 459 }
454 server->sec_mode = (__u8)le16_to_cpu(rsp->SecurityMode); 460 server->sec_mode = (__u8)le16_to_cpu(rsp->SecurityMode);
455 server->maxReq = le16_to_cpu(rsp->MaxMpxCount); 461 server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
456 server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), 462 server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
457 (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
458 server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); 463 server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
459 /* even though we do not use raw we might as well set this 464 /* even though we do not use raw we might as well set this
460 accurately, in case we ever find a need for it */ 465 accurately, in case we ever find a need for it */
@@ -561,8 +566,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
561 little endian */ 566 little endian */
562 server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount); 567 server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount);
563 /* probably no need to store and check maxvcs */ 568 /* probably no need to store and check maxvcs */
564 server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize), 569 server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize);
565 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
566 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); 570 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
567 cFYI(DBG2, "Max buf = %d", ses->server->maxBuf); 571 cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
568 server->capabilities = le32_to_cpu(pSMBr->Capabilities); 572 server->capabilities = le32_to_cpu(pSMBr->Capabilities);
@@ -739,7 +743,8 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
739 iov.iov_base = smb; 743 iov.iov_base = smb;
740 iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; 744 iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4;
741 745
742 rc = cifs_call_async(server, &iov, 1, cifs_echo_callback, server, true); 746 rc = cifs_call_async(server, &iov, 1, NULL, cifs_echo_callback,
747 server, true);
743 if (rc) 748 if (rc)
744 cFYI(1, "Echo request failed: %d", rc); 749 cFYI(1, "Echo request failed: %d", rc);
745 750
@@ -1376,6 +1381,359 @@ openRetry:
1376 return rc; 1381 return rc;
1377} 1382}
1378 1383
1384struct cifs_readdata *
1385cifs_readdata_alloc(unsigned int nr_pages)
1386{
1387 struct cifs_readdata *rdata;
1388
1389 /* readdata + 1 kvec for each page */
1390 rdata = kzalloc(sizeof(*rdata) +
1391 sizeof(struct kvec) * nr_pages, GFP_KERNEL);
1392 if (rdata != NULL) {
1393 INIT_WORK(&rdata->work, cifs_readv_complete);
1394 INIT_LIST_HEAD(&rdata->pages);
1395 }
1396 return rdata;
1397}
1398
1399void
1400cifs_readdata_free(struct cifs_readdata *rdata)
1401{
1402 cifsFileInfo_put(rdata->cfile);
1403 kfree(rdata);
1404}
1405
1406/*
1407 * Discard any remaining data in the current SMB. To do this, we borrow the
1408 * current bigbuf.
1409 */
1410static int
1411cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
1412{
1413 READ_RSP *rsp = (READ_RSP *)server->smallbuf;
1414 unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length);
1415 int remaining = rfclen + 4 - server->total_read;
1416 struct cifs_readdata *rdata = mid->callback_data;
1417
1418 while (remaining > 0) {
1419 int length;
1420
1421 length = cifs_read_from_socket(server, server->bigbuf,
1422 min_t(unsigned int, remaining,
1423 CIFSMaxBufSize + MAX_CIFS_HDR_SIZE));
1424 if (length < 0)
1425 return length;
1426 server->total_read += length;
1427 remaining -= length;
1428 }
1429
1430 dequeue_mid(mid, rdata->result);
1431 return 0;
1432}
1433
1434static int
1435cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
1436{
1437 int length, len;
1438 unsigned int data_offset, remaining, data_len;
1439 struct cifs_readdata *rdata = mid->callback_data;
1440 READ_RSP *rsp = (READ_RSP *)server->smallbuf;
1441 unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length) + 4;
1442 u64 eof;
1443 pgoff_t eof_index;
1444 struct page *page, *tpage;
1445
1446 cFYI(1, "%s: mid=%u offset=%llu bytes=%u", __func__,
1447 mid->mid, rdata->offset, rdata->bytes);
1448
1449 /*
1450 * read the rest of READ_RSP header (sans Data array), or whatever we
1451 * can if there's not enough data. At this point, we've read down to
1452 * the Mid.
1453 */
1454 len = min_t(unsigned int, rfclen, sizeof(*rsp)) -
1455 sizeof(struct smb_hdr) + 1;
1456
1457 rdata->iov[0].iov_base = server->smallbuf + sizeof(struct smb_hdr) - 1;
1458 rdata->iov[0].iov_len = len;
1459
1460 length = cifs_readv_from_socket(server, rdata->iov, 1, len);
1461 if (length < 0)
1462 return length;
1463 server->total_read += length;
1464
1465 /* Was the SMB read successful? */
1466 rdata->result = map_smb_to_linux_error(&rsp->hdr, false);
1467 if (rdata->result != 0) {
1468 cFYI(1, "%s: server returned error %d", __func__,
1469 rdata->result);
1470 return cifs_readv_discard(server, mid);
1471 }
1472
1473 /* Is there enough to get to the rest of the READ_RSP header? */
1474 if (server->total_read < sizeof(READ_RSP)) {
1475 cFYI(1, "%s: server returned short header. got=%u expected=%zu",
1476 __func__, server->total_read, sizeof(READ_RSP));
1477 rdata->result = -EIO;
1478 return cifs_readv_discard(server, mid);
1479 }
1480
1481 data_offset = le16_to_cpu(rsp->DataOffset) + 4;
1482 if (data_offset < server->total_read) {
1483 /*
1484 * win2k8 sometimes sends an offset of 0 when the read
1485 * is beyond the EOF. Treat it as if the data starts just after
1486 * the header.
1487 */
1488 cFYI(1, "%s: data offset (%u) inside read response header",
1489 __func__, data_offset);
1490 data_offset = server->total_read;
1491 } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) {
1492 /* data_offset is beyond the end of smallbuf */
1493 cFYI(1, "%s: data offset (%u) beyond end of smallbuf",
1494 __func__, data_offset);
1495 rdata->result = -EIO;
1496 return cifs_readv_discard(server, mid);
1497 }
1498
1499 cFYI(1, "%s: total_read=%u data_offset=%u", __func__,
1500 server->total_read, data_offset);
1501
1502 len = data_offset - server->total_read;
1503 if (len > 0) {
1504 /* read any junk before data into the rest of smallbuf */
1505 rdata->iov[0].iov_base = server->smallbuf + server->total_read;
1506 rdata->iov[0].iov_len = len;
1507 length = cifs_readv_from_socket(server, rdata->iov, 1, len);
1508 if (length < 0)
1509 return length;
1510 server->total_read += length;
1511 }
1512
1513 /* set up first iov for signature check */
1514 rdata->iov[0].iov_base = server->smallbuf;
1515 rdata->iov[0].iov_len = server->total_read;
1516 cFYI(1, "0: iov_base=%p iov_len=%zu",
1517 rdata->iov[0].iov_base, rdata->iov[0].iov_len);
1518
1519 /* how much data is in the response? */
1520 data_len = le16_to_cpu(rsp->DataLengthHigh) << 16;
1521 data_len += le16_to_cpu(rsp->DataLength);
1522 if (data_offset + data_len > rfclen) {
1523 /* data_len is corrupt -- discard frame */
1524 rdata->result = -EIO;
1525 return cifs_readv_discard(server, mid);
1526 }
1527
1528 /* marshal up the page array */
1529 len = 0;
1530 remaining = data_len;
1531 rdata->nr_iov = 1;
1532
1533 /* determine the eof that the server (probably) has */
1534 eof = CIFS_I(rdata->mapping->host)->server_eof;
1535 eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
1536 cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index);
1537
1538 list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
1539 if (remaining >= PAGE_CACHE_SIZE) {
1540 /* enough data to fill the page */
1541 rdata->iov[rdata->nr_iov].iov_base = kmap(page);
1542 rdata->iov[rdata->nr_iov].iov_len = PAGE_CACHE_SIZE;
1543 cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
1544 rdata->nr_iov, page->index,
1545 rdata->iov[rdata->nr_iov].iov_base,
1546 rdata->iov[rdata->nr_iov].iov_len);
1547 ++rdata->nr_iov;
1548 len += PAGE_CACHE_SIZE;
1549 remaining -= PAGE_CACHE_SIZE;
1550 } else if (remaining > 0) {
1551 /* enough for partial page, fill and zero the rest */
1552 rdata->iov[rdata->nr_iov].iov_base = kmap(page);
1553 rdata->iov[rdata->nr_iov].iov_len = remaining;
1554 cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
1555 rdata->nr_iov, page->index,
1556 rdata->iov[rdata->nr_iov].iov_base,
1557 rdata->iov[rdata->nr_iov].iov_len);
1558 memset(rdata->iov[rdata->nr_iov].iov_base + remaining,
1559 '\0', PAGE_CACHE_SIZE - remaining);
1560 ++rdata->nr_iov;
1561 len += remaining;
1562 remaining = 0;
1563 } else if (page->index > eof_index) {
1564 /*
1565 * The VFS will not try to do readahead past the
1566 * i_size, but it's possible that we have outstanding
1567 * writes with gaps in the middle and the i_size hasn't
1568 * caught up yet. Populate those with zeroed out pages
1569 * to prevent the VFS from repeatedly attempting to
1570 * fill them until the writes are flushed.
1571 */
1572 zero_user(page, 0, PAGE_CACHE_SIZE);
1573 list_del(&page->lru);
1574 lru_cache_add_file(page);
1575 flush_dcache_page(page);
1576 SetPageUptodate(page);
1577 unlock_page(page);
1578 page_cache_release(page);
1579 } else {
1580 /* no need to hold page hostage */
1581 list_del(&page->lru);
1582 lru_cache_add_file(page);
1583 unlock_page(page);
1584 page_cache_release(page);
1585 }
1586 }
1587
1588 /* issue the read if we have any iovecs left to fill */
1589 if (rdata->nr_iov > 1) {
1590 length = cifs_readv_from_socket(server, &rdata->iov[1],
1591 rdata->nr_iov - 1, len);
1592 if (length < 0)
1593 return length;
1594 server->total_read += length;
1595 } else {
1596 length = 0;
1597 }
1598
1599 rdata->bytes = length;
1600
1601 cFYI(1, "total_read=%u rfclen=%u remaining=%u", server->total_read,
1602 rfclen, remaining);
1603
1604 /* discard anything left over */
1605 if (server->total_read < rfclen)
1606 return cifs_readv_discard(server, mid);
1607
1608 dequeue_mid(mid, false);
1609 return length;
1610}
1611
1612static void
1613cifs_readv_complete(struct work_struct *work)
1614{
1615 struct cifs_readdata *rdata = container_of(work,
1616 struct cifs_readdata, work);
1617 struct page *page, *tpage;
1618
1619 list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
1620 list_del(&page->lru);
1621 lru_cache_add_file(page);
1622
1623 if (rdata->result == 0) {
1624 kunmap(page);
1625 flush_dcache_page(page);
1626 SetPageUptodate(page);
1627 }
1628
1629 unlock_page(page);
1630
1631 if (rdata->result == 0)
1632 cifs_readpage_to_fscache(rdata->mapping->host, page);
1633
1634 page_cache_release(page);
1635 }
1636 cifs_readdata_free(rdata);
1637}
1638
1639static void
1640cifs_readv_callback(struct mid_q_entry *mid)
1641{
1642 struct cifs_readdata *rdata = mid->callback_data;
1643 struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
1644 struct TCP_Server_Info *server = tcon->ses->server;
1645
1646 cFYI(1, "%s: mid=%u state=%d result=%d bytes=%u", __func__,
1647 mid->mid, mid->midState, rdata->result, rdata->bytes);
1648
1649 switch (mid->midState) {
1650 case MID_RESPONSE_RECEIVED:
1651 /* result already set, check signature */
1652 if (server->sec_mode &
1653 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
1654 if (cifs_verify_signature(rdata->iov, rdata->nr_iov,
1655 server, mid->sequence_number + 1))
1656 cERROR(1, "Unexpected SMB signature");
1657 }
1658 /* FIXME: should this be counted toward the initiating task? */
1659 task_io_account_read(rdata->bytes);
1660 cifs_stats_bytes_read(tcon, rdata->bytes);
1661 break;
1662 case MID_REQUEST_SUBMITTED:
1663 case MID_RETRY_NEEDED:
1664 rdata->result = -EAGAIN;
1665 break;
1666 default:
1667 rdata->result = -EIO;
1668 }
1669
1670 queue_work(system_nrt_wq, &rdata->work);
1671 DeleteMidQEntry(mid);
1672 atomic_dec(&server->inFlight);
1673 wake_up(&server->request_q);
1674}
1675
1676/* cifs_async_readv - send an async write, and set up mid to handle result */
1677int
1678cifs_async_readv(struct cifs_readdata *rdata)
1679{
1680 int rc;
1681 READ_REQ *smb = NULL;
1682 int wct;
1683 struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
1684
1685 cFYI(1, "%s: offset=%llu bytes=%u", __func__,
1686 rdata->offset, rdata->bytes);
1687
1688 if (tcon->ses->capabilities & CAP_LARGE_FILES)
1689 wct = 12;
1690 else {
1691 wct = 10; /* old style read */
1692 if ((rdata->offset >> 32) > 0) {
1693 /* can not handle this big offset for old */
1694 return -EIO;
1695 }
1696 }
1697
1698 rc = small_smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **)&smb);
1699 if (rc)
1700 return rc;
1701
1702 smb->hdr.Pid = cpu_to_le16((__u16)rdata->pid);
1703 smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16));
1704
1705 smb->AndXCommand = 0xFF; /* none */
1706 smb->Fid = rdata->cfile->netfid;
1707 smb->OffsetLow = cpu_to_le32(rdata->offset & 0xFFFFFFFF);
1708 if (wct == 12)
1709 smb->OffsetHigh = cpu_to_le32(rdata->offset >> 32);
1710 smb->Remaining = 0;
1711 smb->MaxCount = cpu_to_le16(rdata->bytes & 0xFFFF);
1712 smb->MaxCountHigh = cpu_to_le32(rdata->bytes >> 16);
1713 if (wct == 12)
1714 smb->ByteCount = 0;
1715 else {
1716 /* old style read */
1717 struct smb_com_readx_req *smbr =
1718 (struct smb_com_readx_req *)smb;
1719 smbr->ByteCount = 0;
1720 }
1721
1722 /* 4 for RFC1001 length + 1 for BCC */
1723 rdata->iov[0].iov_base = smb;
1724 rdata->iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4;
1725
1726 rc = cifs_call_async(tcon->ses->server, rdata->iov, 1,
1727 cifs_readv_receive, cifs_readv_callback,
1728 rdata, false);
1729
1730 if (rc == 0)
1731 cifs_stats_inc(&tcon->num_reads);
1732
1733 cifs_small_buf_release(smb);
1734 return rc;
1735}
1736
1379int 1737int
1380CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, 1738CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes,
1381 char **buf, int *pbuf_type) 1739 char **buf, int *pbuf_type)
@@ -1836,7 +2194,7 @@ cifs_async_writev(struct cifs_writedata *wdata)
1836 2194
1837 kref_get(&wdata->refcount); 2195 kref_get(&wdata->refcount);
1838 rc = cifs_call_async(tcon->ses->server, iov, wdata->nr_pages + 1, 2196 rc = cifs_call_async(tcon->ses->server, iov, wdata->nr_pages + 1,
1839 cifs_writev_callback, wdata, false); 2197 NULL, cifs_writev_callback, wdata, false);
1840 2198
1841 if (rc == 0) 2199 if (rc == 0)
1842 cifs_stats_inc(&tcon->num_writes); 2200 cifs_stats_inc(&tcon->num_writes);
@@ -1962,10 +2320,50 @@ CIFSSMBWrite2(const int xid, struct cifs_io_parms *io_parms,
1962 return rc; 2320 return rc;
1963} 2321}
1964 2322
/*
 * cifs_lockv - send a LOCKING_ANDX request carrying an array of lock ranges
 * @xid: transaction id for this operation
 * @tcon: tree connection the file handle belongs to
 * @netfid: server file handle (already little-endian)
 * @lock_type: LOCKING_ANDX lock type flags
 * @num_unlock: number of unlock ranges at the start of @buf
 * @num_lock: number of lock ranges following the unlock ranges in @buf
 * @buf: caller-supplied array of (num_unlock + num_lock) ranges
 *
 * The fixed request header and the caller's range array are sent as two
 * separate iovecs to avoid copying @buf. Returns 0 on success or a
 * negative error code.
 */
int cifs_lockv(const int xid, struct cifs_tcon *tcon, const __u16 netfid,
	       const __u8 lock_type, const __u32 num_unlock,
	       const __u32 num_lock, LOCKING_ANDX_RANGE *buf)
{
	int rc = 0;
	LOCK_REQ *pSMB = NULL;
	struct kvec iov[2];
	int resp_buf_type;
	__u16 count;

	cFYI(1, "cifs_lockv num lock %d num unlock %d", num_lock, num_unlock);

	rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
	if (rc)
		return rc;

	pSMB->Timeout = 0;
	pSMB->NumberOfLocks = cpu_to_le16(num_lock);
	pSMB->NumberOfUnlocks = cpu_to_le16(num_unlock);
	pSMB->LockType = lock_type;
	pSMB->AndXCommand = 0xFF; /* none */
	pSMB->Fid = netfid; /* netfid stays le */

	/* account for the trailing range array in the RFC1001 length/BCC */
	count = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE);
	inc_rfc1001_len(pSMB, count);
	pSMB->ByteCount = cpu_to_le16(count);

	/*
	 * iov[0]: fixed header only — subtract the range array bytes that
	 * inc_rfc1001_len added, since they are sent via iov[1] instead.
	 */
	iov[0].iov_base = (char *)pSMB;
	iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4 -
			 (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE);
	iov[1].iov_base = (char *)buf;
	iov[1].iov_len = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE);

	cifs_stats_inc(&tcon->num_locks);
	/* CIFS_NO_RESP: we don't need the response buffer back */
	rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP);
	if (rc)
		cFYI(1, "Send error in cifs_lockv = %d", rc);

	return rc;
}
1965 2363
1966int 2364int
1967CIFSSMBLock(const int xid, struct cifs_tcon *tcon, 2365CIFSSMBLock(const int xid, struct cifs_tcon *tcon,
1968 const __u16 smb_file_id, const __u64 len, 2366 const __u16 smb_file_id, const __u32 netpid, const __u64 len,
1969 const __u64 offset, const __u32 numUnlock, 2367 const __u64 offset, const __u32 numUnlock,
1970 const __u32 numLock, const __u8 lockType, 2368 const __u32 numLock, const __u8 lockType,
1971 const bool waitFlag, const __u8 oplock_level) 2369 const bool waitFlag, const __u8 oplock_level)
@@ -2001,7 +2399,7 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon,
2001 pSMB->Fid = smb_file_id; /* netfid stays le */ 2399 pSMB->Fid = smb_file_id; /* netfid stays le */
2002 2400
2003 if ((numLock != 0) || (numUnlock != 0)) { 2401 if ((numLock != 0) || (numUnlock != 0)) {
2004 pSMB->Locks[0].Pid = cpu_to_le16(current->tgid); 2402 pSMB->Locks[0].Pid = cpu_to_le16(netpid);
2005 /* BB where to store pid high? */ 2403 /* BB where to store pid high? */
2006 pSMB->Locks[0].LengthLow = cpu_to_le32((u32)len); 2404 pSMB->Locks[0].LengthLow = cpu_to_le32((u32)len);
2007 pSMB->Locks[0].LengthHigh = cpu_to_le32((u32)(len>>32)); 2405 pSMB->Locks[0].LengthHigh = cpu_to_le32((u32)(len>>32));
@@ -2035,9 +2433,9 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon,
2035 2433
2036int 2434int
2037CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, 2435CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon,
2038 const __u16 smb_file_id, const int get_flag, const __u64 len, 2436 const __u16 smb_file_id, const __u32 netpid, const int get_flag,
2039 struct file_lock *pLockData, const __u16 lock_type, 2437 const __u64 len, struct file_lock *pLockData,
2040 const bool waitFlag) 2438 const __u16 lock_type, const bool waitFlag)
2041{ 2439{
2042 struct smb_com_transaction2_sfi_req *pSMB = NULL; 2440 struct smb_com_transaction2_sfi_req *pSMB = NULL;
2043 struct smb_com_transaction2_sfi_rsp *pSMBr = NULL; 2441 struct smb_com_transaction2_sfi_rsp *pSMBr = NULL;
@@ -2095,7 +2493,7 @@ CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon,
2095 } else 2493 } else
2096 pSMB->Timeout = 0; 2494 pSMB->Timeout = 0;
2097 2495
2098 parm_data->pid = cpu_to_le32(current->tgid); 2496 parm_data->pid = cpu_to_le32(netpid);
2099 parm_data->start = cpu_to_le64(pLockData->fl_start); 2497 parm_data->start = cpu_to_le64(pLockData->fl_start);
2100 parm_data->length = cpu_to_le64(len); /* normalize negative numbers */ 2498 parm_data->length = cpu_to_le64(len); /* normalize negative numbers */
2101 2499
@@ -2812,8 +3210,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifs_tcon *tcon,
2812 pSMB->TotalDataCount = 0; 3210 pSMB->TotalDataCount = 0;
2813 pSMB->MaxParameterCount = cpu_to_le32(2); 3211 pSMB->MaxParameterCount = cpu_to_le32(2);
2814 /* BB find exact data count max from sess structure BB */ 3212 /* BB find exact data count max from sess structure BB */
2815 pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - 3213 pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
2816 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
2817 pSMB->MaxSetupCount = 4; 3214 pSMB->MaxSetupCount = 4;
2818 pSMB->Reserved = 0; 3215 pSMB->Reserved = 0;
2819 pSMB->ParameterOffset = 0; 3216 pSMB->ParameterOffset = 0;
@@ -3306,8 +3703,7 @@ smb_init_nttransact(const __u16 sub_command, const int setup_count,
3306 pSMB->Reserved = 0; 3703 pSMB->Reserved = 0;
3307 pSMB->TotalParameterCount = cpu_to_le32(parm_len); 3704 pSMB->TotalParameterCount = cpu_to_le32(parm_len);
3308 pSMB->TotalDataCount = 0; 3705 pSMB->TotalDataCount = 0;
3309 pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - 3706 pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
3310 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
3311 pSMB->ParameterCount = pSMB->TotalParameterCount; 3707 pSMB->ParameterCount = pSMB->TotalParameterCount;
3312 pSMB->DataCount = pSMB->TotalDataCount; 3708 pSMB->DataCount = pSMB->TotalDataCount;
3313 temp_offset = offsetof(struct smb_com_ntransact_req, Parms) + 3709 temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
@@ -3467,7 +3863,7 @@ qsec_out:
3467 3863
3468int 3864int
3469CIFSSMBSetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid, 3865CIFSSMBSetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid,
3470 struct cifs_ntsd *pntsd, __u32 acllen) 3866 struct cifs_ntsd *pntsd, __u32 acllen, int aclflag)
3471{ 3867{
3472 __u16 byte_count, param_count, data_count, param_offset, data_offset; 3868 __u16 byte_count, param_count, data_count, param_offset, data_offset;
3473 int rc = 0; 3869 int rc = 0;
@@ -3504,7 +3900,7 @@ setCifsAclRetry:
3504 3900
3505 pSMB->Fid = fid; /* file handle always le */ 3901 pSMB->Fid = fid; /* file handle always le */
3506 pSMB->Reserved2 = 0; 3902 pSMB->Reserved2 = 0;
3507 pSMB->AclFlags = cpu_to_le32(CIFS_ACL_DACL); 3903 pSMB->AclFlags = cpu_to_le32(aclflag);
3508 3904
3509 if (pntsd && acllen) { 3905 if (pntsd && acllen) {
3510 memcpy((char *) &pSMBr->hdr.Protocol + data_offset, 3906 memcpy((char *) &pSMBr->hdr.Protocol + data_offset,
@@ -3977,8 +4373,7 @@ findFirstRetry:
3977 params = 12 + name_len /* includes null */ ; 4373 params = 12 + name_len /* includes null */ ;
3978 pSMB->TotalDataCount = 0; /* no EAs */ 4374 pSMB->TotalDataCount = 0; /* no EAs */
3979 pSMB->MaxParameterCount = cpu_to_le16(10); 4375 pSMB->MaxParameterCount = cpu_to_le16(10);
3980 pSMB->MaxDataCount = cpu_to_le16((tcon->ses->server->maxBuf - 4376 pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00);
3981 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
3982 pSMB->MaxSetupCount = 0; 4377 pSMB->MaxSetupCount = 0;
3983 pSMB->Reserved = 0; 4378 pSMB->Reserved = 0;
3984 pSMB->Flags = 0; 4379 pSMB->Flags = 0;
@@ -4052,8 +4447,7 @@ findFirstRetry:
4052 psrch_inf->index_of_last_entry = 2 /* skip . and .. */ + 4447 psrch_inf->index_of_last_entry = 2 /* skip . and .. */ +
4053 psrch_inf->entries_in_buffer; 4448 psrch_inf->entries_in_buffer;
4054 lnoff = le16_to_cpu(parms->LastNameOffset); 4449 lnoff = le16_to_cpu(parms->LastNameOffset);
4055 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < 4450 if (CIFSMaxBufSize < lnoff) {
4056 lnoff) {
4057 cERROR(1, "ignoring corrupt resume name"); 4451 cERROR(1, "ignoring corrupt resume name");
4058 psrch_inf->last_entry = NULL; 4452 psrch_inf->last_entry = NULL;
4059 return rc; 4453 return rc;
@@ -4097,9 +4491,7 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon,
4097 byte_count = 0; 4491 byte_count = 0;
4098 pSMB->TotalDataCount = 0; /* no EAs */ 4492 pSMB->TotalDataCount = 0; /* no EAs */
4099 pSMB->MaxParameterCount = cpu_to_le16(8); 4493 pSMB->MaxParameterCount = cpu_to_le16(8);
4100 pSMB->MaxDataCount = 4494 pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00);
4101 cpu_to_le16((tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) &
4102 0xFFFFFF00);
4103 pSMB->MaxSetupCount = 0; 4495 pSMB->MaxSetupCount = 0;
4104 pSMB->Reserved = 0; 4496 pSMB->Reserved = 0;
4105 pSMB->Flags = 0; 4497 pSMB->Flags = 0;
@@ -4181,8 +4573,7 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon,
4181 psrch_inf->index_of_last_entry += 4573 psrch_inf->index_of_last_entry +=
4182 psrch_inf->entries_in_buffer; 4574 psrch_inf->entries_in_buffer;
4183 lnoff = le16_to_cpu(parms->LastNameOffset); 4575 lnoff = le16_to_cpu(parms->LastNameOffset);
4184 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < 4576 if (CIFSMaxBufSize < lnoff) {
4185 lnoff) {
4186 cERROR(1, "ignoring corrupt resume name"); 4577 cERROR(1, "ignoring corrupt resume name");
4187 psrch_inf->last_entry = NULL; 4578 psrch_inf->last_entry = NULL;
4188 return rc; 4579 return rc;
@@ -5840,7 +6231,7 @@ QAllEAsRetry:
5840 6231
5841 if (ea_name) { 6232 if (ea_name) {
5842 if (ea_name_len == name_len && 6233 if (ea_name_len == name_len &&
5843 strncmp(ea_name, temp_ptr, name_len) == 0) { 6234 memcmp(ea_name, temp_ptr, name_len) == 0) {
5844 temp_ptr += name_len + 1; 6235 temp_ptr += name_len + 1;
5845 rc = value_len; 6236 rc = value_len;
5846 if (buf_size == 0) 6237 if (buf_size == 0)
@@ -6035,12 +6426,7 @@ int CIFSSMBNotify(const int xid, struct cifs_tcon *tcon,
6035 pSMB->TotalParameterCount = 0 ; 6426 pSMB->TotalParameterCount = 0 ;
6036 pSMB->TotalDataCount = 0; 6427 pSMB->TotalDataCount = 0;
6037 pSMB->MaxParameterCount = cpu_to_le32(2); 6428 pSMB->MaxParameterCount = cpu_to_le32(2);
6038 /* BB find exact data count max from sess structure BB */ 6429 pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
6039 pSMB->MaxDataCount = 0; /* same in little endian or be */
6040/* BB VERIFY verify which is correct for above BB */
6041 pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
6042 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
6043
6044 pSMB->MaxSetupCount = 4; 6430 pSMB->MaxSetupCount = 4;
6045 pSMB->Reserved = 0; 6431 pSMB->Reserved = 0;
6046 pSMB->ParameterOffset = 0; 6432 pSMB->ParameterOffset = 0;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 62abf9fd6ff..d6a972df033 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -37,6 +37,7 @@
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38#include <asm/processor.h> 38#include <asm/processor.h>
39#include <linux/inet.h> 39#include <linux/inet.h>
40#include <linux/module.h>
40#include <net/ipv6.h> 41#include <net/ipv6.h>
41#include "cifspdu.h" 42#include "cifspdu.h"
42#include "cifsglob.h" 43#include "cifsglob.h"
@@ -181,7 +182,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
181 -EINVAL = invalid transact2 182 -EINVAL = invalid transact2
182 183
183 */ 184 */
184static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) 185static int check2ndT2(struct smb_hdr *pSMB)
185{ 186{
186 struct smb_t2_rsp *pSMBt; 187 struct smb_t2_rsp *pSMBt;
187 int remaining; 188 int remaining;
@@ -214,9 +215,9 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
214 215
215 cFYI(1, "missing %d bytes from transact2, check next response", 216 cFYI(1, "missing %d bytes from transact2, check next response",
216 remaining); 217 remaining);
217 if (total_data_size > maxBufSize) { 218 if (total_data_size > CIFSMaxBufSize) {
218 cERROR(1, "TotalDataSize %d is over maximum buffer %d", 219 cERROR(1, "TotalDataSize %d is over maximum buffer %d",
219 total_data_size, maxBufSize); 220 total_data_size, CIFSMaxBufSize);
220 return -EINVAL; 221 return -EINVAL;
221 } 222 }
222 return remaining; 223 return remaining;
@@ -320,27 +321,24 @@ requeue_echo:
320} 321}
321 322
322static bool 323static bool
323allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size, 324allocate_buffers(struct TCP_Server_Info *server)
324 bool is_large_buf)
325{ 325{
326 char *bbuf = *bigbuf, *sbuf = *smallbuf; 326 if (!server->bigbuf) {
327 327 server->bigbuf = (char *)cifs_buf_get();
328 if (bbuf == NULL) { 328 if (!server->bigbuf) {
329 bbuf = (char *)cifs_buf_get();
330 if (!bbuf) {
331 cERROR(1, "No memory for large SMB response"); 329 cERROR(1, "No memory for large SMB response");
332 msleep(3000); 330 msleep(3000);
333 /* retry will check if exiting */ 331 /* retry will check if exiting */
334 return false; 332 return false;
335 } 333 }
336 } else if (is_large_buf) { 334 } else if (server->large_buf) {
337 /* we are reusing a dirty large buf, clear its start */ 335 /* we are reusing a dirty large buf, clear its start */
338 memset(bbuf, 0, size); 336 memset(server->bigbuf, 0, sizeof(struct smb_hdr));
339 } 337 }
340 338
341 if (sbuf == NULL) { 339 if (!server->smallbuf) {
342 sbuf = (char *)cifs_small_buf_get(); 340 server->smallbuf = (char *)cifs_small_buf_get();
343 if (!sbuf) { 341 if (!server->smallbuf) {
344 cERROR(1, "No memory for SMB response"); 342 cERROR(1, "No memory for SMB response");
345 msleep(1000); 343 msleep(1000);
346 /* retry will check if exiting */ 344 /* retry will check if exiting */
@@ -349,36 +347,116 @@ allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size,
349 /* beginning of smb buffer is cleared in our buf_get */ 347 /* beginning of smb buffer is cleared in our buf_get */
350 } else { 348 } else {
351 /* if existing small buf clear beginning */ 349 /* if existing small buf clear beginning */
352 memset(sbuf, 0, size); 350 memset(server->smallbuf, 0, sizeof(struct smb_hdr));
353 } 351 }
354 352
355 *bigbuf = bbuf;
356 *smallbuf = sbuf;
357
358 return true; 353 return true;
359} 354}
360 355
361static int 356static bool
362read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg, 357server_unresponsive(struct TCP_Server_Info *server)
363 struct kvec *iov, unsigned int to_read, 358{
364 unsigned int *ptotal_read, bool is_header_read) 359 if (echo_retries > 0 && server->tcpStatus == CifsGood &&
360 time_after(jiffies, server->lstrp +
361 (echo_retries * SMB_ECHO_INTERVAL))) {
362 cERROR(1, "Server %s has not responded in %d seconds. "
363 "Reconnecting...", server->hostname,
364 (echo_retries * SMB_ECHO_INTERVAL / HZ));
365 cifs_reconnect(server);
366 wake_up(&server->response_q);
367 return true;
368 }
369
370 return false;
371}
372
373/*
374 * kvec_array_init - clone a kvec array, and advance into it
375 * @new: pointer to memory for cloned array
376 * @iov: pointer to original array
377 * @nr_segs: number of members in original array
378 * @bytes: number of bytes to advance into the cloned array
379 *
380 * This function will copy the array provided in iov to a section of memory
381 * and advance the specified number of bytes into the new array. It returns
382 * the number of segments in the new array. "new" must be at least as big as
383 * the original iov array.
384 */
385static unsigned int
386kvec_array_init(struct kvec *new, struct kvec *iov, unsigned int nr_segs,
387 size_t bytes)
388{
389 size_t base = 0;
390
391 while (bytes || !iov->iov_len) {
392 int copy = min(bytes, iov->iov_len);
393
394 bytes -= copy;
395 base += copy;
396 if (iov->iov_len == base) {
397 iov++;
398 nr_segs--;
399 base = 0;
400 }
401 }
402 memcpy(new, iov, sizeof(*iov) * nr_segs);
403 new->iov_base += base;
404 new->iov_len -= base;
405 return nr_segs;
406}
407
408static struct kvec *
409get_server_iovec(struct TCP_Server_Info *server, unsigned int nr_segs)
410{
411 struct kvec *new_iov;
412
413 if (server->iov && nr_segs <= server->nr_iov)
414 return server->iov;
415
416 /* not big enough -- allocate a new one and release the old */
417 new_iov = kmalloc(sizeof(*new_iov) * nr_segs, GFP_NOFS);
418 if (new_iov) {
419 kfree(server->iov);
420 server->iov = new_iov;
421 server->nr_iov = nr_segs;
422 }
423 return new_iov;
424}
425
426int
427cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
428 unsigned int nr_segs, unsigned int to_read)
365{ 429{
366 int length, rc = 0; 430 int length = 0;
367 unsigned int total_read; 431 int total_read;
368 char *buf = iov->iov_base; 432 unsigned int segs;
433 struct msghdr smb_msg;
434 struct kvec *iov;
435
436 iov = get_server_iovec(server, nr_segs);
437 if (!iov)
438 return -ENOMEM;
439
440 smb_msg.msg_control = NULL;
441 smb_msg.msg_controllen = 0;
442
443 for (total_read = 0; to_read; total_read += length, to_read -= length) {
444 if (server_unresponsive(server)) {
445 total_read = -EAGAIN;
446 break;
447 }
448
449 segs = kvec_array_init(iov, iov_orig, nr_segs, total_read);
450
451 length = kernel_recvmsg(server->ssocket, &smb_msg,
452 iov, segs, to_read, 0);
369 453
370 for (total_read = 0; total_read < to_read; total_read += length) {
371 length = kernel_recvmsg(server->ssocket, smb_msg, iov, 1,
372 to_read - total_read, 0);
373 if (server->tcpStatus == CifsExiting) { 454 if (server->tcpStatus == CifsExiting) {
374 /* then will exit */ 455 total_read = -ESHUTDOWN;
375 rc = 2;
376 break; 456 break;
377 } else if (server->tcpStatus == CifsNeedReconnect) { 457 } else if (server->tcpStatus == CifsNeedReconnect) {
378 cifs_reconnect(server); 458 cifs_reconnect(server);
379 /* Reconnect wakes up rspns q */ 459 total_read = -EAGAIN;
380 /* Now we will reread sock */
381 rc = 1;
382 break; 460 break;
383 } else if (length == -ERESTARTSYS || 461 } else if (length == -ERESTARTSYS ||
384 length == -EAGAIN || 462 length == -EAGAIN ||
@@ -390,56 +468,54 @@ read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg,
390 */ 468 */
391 usleep_range(1000, 2000); 469 usleep_range(1000, 2000);
392 length = 0; 470 length = 0;
393 if (!is_header_read) 471 continue;
394 continue;
395 /* Special handling for header read */
396 if (total_read) {
397 iov->iov_base = (to_read - total_read) +
398 buf;
399 iov->iov_len = to_read - total_read;
400 smb_msg->msg_control = NULL;
401 smb_msg->msg_controllen = 0;
402 rc = 3;
403 } else
404 rc = 1;
405 break;
406 } else if (length <= 0) { 472 } else if (length <= 0) {
407 cERROR(1, "Received no data, expecting %d", 473 cFYI(1, "Received no data or error: expecting %d "
408 to_read - total_read); 474 "got %d", to_read, length);
409 cifs_reconnect(server); 475 cifs_reconnect(server);
410 rc = 1; 476 total_read = -EAGAIN;
411 break; 477 break;
412 } 478 }
413 } 479 }
480 return total_read;
481}
414 482
415 *ptotal_read = total_read; 483int
416 return rc; 484cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
485 unsigned int to_read)
486{
487 struct kvec iov;
488
489 iov.iov_base = buf;
490 iov.iov_len = to_read;
491
492 return cifs_readv_from_socket(server, &iov, 1, to_read);
417} 493}
418 494
419static bool 495static bool
420check_rfc1002_header(struct TCP_Server_Info *server, char *buf) 496is_smb_response(struct TCP_Server_Info *server, unsigned char type)
421{ 497{
422 char temp = *buf;
423 unsigned int pdu_length = be32_to_cpu(
424 ((struct smb_hdr *)buf)->smb_buf_length);
425
426 /* 498 /*
427 * The first byte big endian of the length field, 499 * The first byte big endian of the length field,
428 * is actually not part of the length but the type 500 * is actually not part of the length but the type
429 * with the most common, zero, as regular data. 501 * with the most common, zero, as regular data.
430 */ 502 */
431 if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) { 503 switch (type) {
432 return false; 504 case RFC1002_SESSION_MESSAGE:
433 } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) { 505 /* Regular SMB response */
434 cFYI(1, "Good RFC 1002 session rsp"); 506 return true;
435 return false; 507 case RFC1002_SESSION_KEEP_ALIVE:
436 } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) { 508 cFYI(1, "RFC 1002 session keep alive");
509 break;
510 case RFC1002_POSITIVE_SESSION_RESPONSE:
511 cFYI(1, "RFC 1002 positive session response");
512 break;
513 case RFC1002_NEGATIVE_SESSION_RESPONSE:
437 /* 514 /*
438 * We get this from Windows 98 instead of an error on 515 * We get this from Windows 98 instead of an error on
439 * SMB negprot response. 516 * SMB negprot response.
440 */ 517 */
441 cFYI(1, "Negative RFC1002 Session Response Error 0x%x)", 518 cFYI(1, "RFC 1002 negative session response");
442 pdu_length);
443 /* give server a second to clean up */ 519 /* give server a second to clean up */
444 msleep(1000); 520 msleep(1000);
445 /* 521 /*
@@ -448,87 +524,89 @@ check_rfc1002_header(struct TCP_Server_Info *server, char *buf)
448 * is since we do not begin with RFC1001 session 524 * is since we do not begin with RFC1001 session
449 * initialize frame). 525 * initialize frame).
450 */ 526 */
451 cifs_set_port((struct sockaddr *) 527 cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT);
452 &server->dstaddr, CIFS_PORT);
453 cifs_reconnect(server); 528 cifs_reconnect(server);
454 wake_up(&server->response_q); 529 wake_up(&server->response_q);
455 return false; 530 break;
456 } else if (temp != (char) 0) { 531 default:
457 cERROR(1, "Unknown RFC 1002 frame"); 532 cERROR(1, "RFC 1002 unknown response type 0x%x", type);
458 cifs_dump_mem(" Received Data: ", buf, 4);
459 cifs_reconnect(server);
460 return false;
461 }
462
463 /* else we have an SMB response */
464 if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) ||
465 (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) {
466 cERROR(1, "Invalid size SMB length %d pdu_length %d",
467 4, pdu_length+4);
468 cifs_reconnect(server); 533 cifs_reconnect(server);
469 wake_up(&server->response_q);
470 return false;
471 } 534 }
472 535
473 return true; 536 return false;
474} 537}
475 538
476static struct mid_q_entry * 539static struct mid_q_entry *
477find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, 540find_mid(struct TCP_Server_Info *server, struct smb_hdr *buf)
478 int *length, bool is_large_buf, bool *is_multi_rsp, char **bigbuf)
479{ 541{
480 struct mid_q_entry *mid = NULL, *tmp_mid, *ret = NULL; 542 struct mid_q_entry *mid;
481 543
482 spin_lock(&GlobalMid_Lock); 544 spin_lock(&GlobalMid_Lock);
483 list_for_each_entry_safe(mid, tmp_mid, &server->pending_mid_q, qhead) { 545 list_for_each_entry(mid, &server->pending_mid_q, qhead) {
484 if (mid->mid != buf->Mid || 546 if (mid->mid == buf->Mid &&
485 mid->midState != MID_REQUEST_SUBMITTED || 547 mid->midState == MID_REQUEST_SUBMITTED &&
486 mid->command != buf->Command) 548 mid->command == buf->Command) {
487 continue; 549 spin_unlock(&GlobalMid_Lock);
488 550 return mid;
489 if (*length == 0 && check2ndT2(buf, server->maxBuf) > 0) {
490 /* We have a multipart transact2 resp */
491 *is_multi_rsp = true;
492 if (mid->resp_buf) {
493 /* merge response - fix up 1st*/
494 *length = coalesce_t2(buf, mid->resp_buf);
495 if (*length > 0) {
496 *length = 0;
497 mid->multiRsp = true;
498 break;
499 }
500 /* All parts received or packet is malformed. */
501 mid->multiEnd = true;
502 goto multi_t2_fnd;
503 }
504 if (!is_large_buf) {
505 /*FIXME: switch to already allocated largebuf?*/
506 cERROR(1, "1st trans2 resp needs bigbuf");
507 } else {
508 /* Have first buffer */
509 mid->resp_buf = buf;
510 mid->largeBuf = true;
511 *bigbuf = NULL;
512 }
513 break;
514 } 551 }
515 mid->resp_buf = buf; 552 }
516 mid->largeBuf = is_large_buf; 553 spin_unlock(&GlobalMid_Lock);
517multi_t2_fnd: 554 return NULL;
518 if (*length == 0) 555}
519 mid->midState = MID_RESPONSE_RECEIVED; 556
520 else 557void
521 mid->midState = MID_RESPONSE_MALFORMED; 558dequeue_mid(struct mid_q_entry *mid, bool malformed)
559{
522#ifdef CONFIG_CIFS_STATS2 560#ifdef CONFIG_CIFS_STATS2
523 mid->when_received = jiffies; 561 mid->when_received = jiffies;
524#endif 562#endif
525 list_del_init(&mid->qhead); 563 spin_lock(&GlobalMid_Lock);
526 ret = mid; 564 if (!malformed)
527 break; 565 mid->midState = MID_RESPONSE_RECEIVED;
528 } 566 else
567 mid->midState = MID_RESPONSE_MALFORMED;
568 list_del_init(&mid->qhead);
529 spin_unlock(&GlobalMid_Lock); 569 spin_unlock(&GlobalMid_Lock);
570}
530 571
531 return ret; 572static void
573handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server,
574 struct smb_hdr *buf, int malformed)
575{
576 if (malformed == 0 && check2ndT2(buf) > 0) {
577 mid->multiRsp = true;
578 if (mid->resp_buf) {
579 /* merge response - fix up 1st*/
580 malformed = coalesce_t2(buf, mid->resp_buf);
581 if (malformed > 0)
582 return;
583
584 /* All parts received or packet is malformed. */
585 mid->multiEnd = true;
586 return dequeue_mid(mid, malformed);
587 }
588 if (!server->large_buf) {
589 /*FIXME: switch to already allocated largebuf?*/
590 cERROR(1, "1st trans2 resp needs bigbuf");
591 } else {
592 /* Have first buffer */
593 mid->resp_buf = buf;
594 mid->largeBuf = true;
595 server->bigbuf = NULL;
596 }
597 return;
598 }
599 mid->resp_buf = buf;
600 mid->largeBuf = server->large_buf;
601 /* Was previous buf put in mpx struct for multi-rsp? */
602 if (!mid->multiRsp) {
603 /* smb buffer will be freed by user thread */
604 if (server->large_buf)
605 server->bigbuf = NULL;
606 else
607 server->smallbuf = NULL;
608 }
609 dequeue_mid(mid, malformed);
532} 610}
533 611
534static void clean_demultiplex_info(struct TCP_Server_Info *server) 612static void clean_demultiplex_info(struct TCP_Server_Info *server)
@@ -618,6 +696,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
618 } 696 }
619 697
620 kfree(server->hostname); 698 kfree(server->hostname);
699 kfree(server->iov);
621 kfree(server); 700 kfree(server);
622 701
623 length = atomic_dec_return(&tcpSesAllocCount); 702 length = atomic_dec_return(&tcpSesAllocCount);
@@ -627,20 +706,70 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
627} 706}
628 707
629static int 708static int
709standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
710{
711 int length;
712 char *buf = server->smallbuf;
713 struct smb_hdr *smb_buffer = (struct smb_hdr *)buf;
714 unsigned int pdu_length = be32_to_cpu(smb_buffer->smb_buf_length);
715
716 /* make sure this will fit in a large buffer */
717 if (pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
718 cERROR(1, "SMB response too long (%u bytes)",
719 pdu_length);
720 cifs_reconnect(server);
721 wake_up(&server->response_q);
722 return -EAGAIN;
723 }
724
725 /* switch to large buffer if too big for a small one */
726 if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) {
727 server->large_buf = true;
728 memcpy(server->bigbuf, server->smallbuf, server->total_read);
729 buf = server->bigbuf;
730 smb_buffer = (struct smb_hdr *)buf;
731 }
732
733 /* now read the rest */
734 length = cifs_read_from_socket(server,
735 buf + sizeof(struct smb_hdr) - 1,
736 pdu_length - sizeof(struct smb_hdr) + 1 + 4);
737 if (length < 0)
738 return length;
739 server->total_read += length;
740
741 dump_smb(smb_buffer, server->total_read);
742
743 /*
744 * We know that we received enough to get to the MID as we
745 * checked the pdu_length earlier. Now check to see
746 * if the rest of the header is OK. We borrow the length
747 * var for the rest of the loop to avoid a new stack var.
748 *
749 * 48 bytes is enough to display the header and a little bit
750 * into the payload for debugging purposes.
751 */
752 length = checkSMB(smb_buffer, smb_buffer->Mid, server->total_read);
753 if (length != 0)
754 cifs_dump_mem("Bad SMB: ", buf,
755 min_t(unsigned int, server->total_read, 48));
756
757 if (mid)
758 handle_mid(mid, server, smb_buffer, length);
759
760 return length;
761}
762
763static int
630cifs_demultiplex_thread(void *p) 764cifs_demultiplex_thread(void *p)
631{ 765{
632 int length; 766 int length;
633 struct TCP_Server_Info *server = p; 767 struct TCP_Server_Info *server = p;
634 unsigned int pdu_length, total_read; 768 unsigned int pdu_length;
635 char *buf = NULL, *bigbuf = NULL, *smallbuf = NULL; 769 char *buf = NULL;
636 struct smb_hdr *smb_buffer = NULL; 770 struct smb_hdr *smb_buffer = NULL;
637 struct msghdr smb_msg;
638 struct kvec iov;
639 struct task_struct *task_to_wake = NULL; 771 struct task_struct *task_to_wake = NULL;
640 struct mid_q_entry *mid_entry; 772 struct mid_q_entry *mid_entry;
641 bool isLargeBuf = false;
642 bool isMultiRsp = false;
643 int rc;
644 773
645 current->flags |= PF_MEMALLOC; 774 current->flags |= PF_MEMALLOC;
646 cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); 775 cFYI(1, "Demultiplex PID: %d", task_pid_nr(current));
@@ -655,111 +784,65 @@ cifs_demultiplex_thread(void *p)
655 if (try_to_freeze()) 784 if (try_to_freeze())
656 continue; 785 continue;
657 786
658 if (!allocate_buffers(&bigbuf, &smallbuf, 787 if (!allocate_buffers(server))
659 sizeof(struct smb_hdr), isLargeBuf))
660 continue; 788 continue;
661 789
662 isLargeBuf = false; 790 server->large_buf = false;
663 isMultiRsp = false; 791 smb_buffer = (struct smb_hdr *)server->smallbuf;
664 smb_buffer = (struct smb_hdr *)smallbuf; 792 buf = server->smallbuf;
665 buf = smallbuf;
666 iov.iov_base = buf;
667 iov.iov_len = 4;
668 smb_msg.msg_control = NULL;
669 smb_msg.msg_controllen = 0;
670 pdu_length = 4; /* enough to get RFC1001 header */ 793 pdu_length = 4; /* enough to get RFC1001 header */
671 794
672incomplete_rcv: 795 length = cifs_read_from_socket(server, buf, pdu_length);
673 if (echo_retries > 0 && server->tcpStatus == CifsGood && 796 if (length < 0)
674 time_after(jiffies, server->lstrp +
675 (echo_retries * SMB_ECHO_INTERVAL))) {
676 cERROR(1, "Server %s has not responded in %d seconds. "
677 "Reconnecting...", server->hostname,
678 (echo_retries * SMB_ECHO_INTERVAL / HZ));
679 cifs_reconnect(server);
680 wake_up(&server->response_q);
681 continue;
682 }
683
684 rc = read_from_socket(server, &smb_msg, &iov, pdu_length,
685 &total_read, true /* header read */);
686 if (rc == 3)
687 goto incomplete_rcv;
688 else if (rc == 2)
689 break;
690 else if (rc == 1)
691 continue; 797 continue;
798 server->total_read = length;
692 799
693 /* 800 /*
694 * The right amount was read from socket - 4 bytes, 801 * The right amount was read from socket - 4 bytes,
695 * so we can now interpret the length field. 802 * so we can now interpret the length field.
696 */ 803 */
697
698 /*
699 * Note that RFC 1001 length is big endian on the wire,
700 * but we convert it here so it is always manipulated
701 * as host byte order.
702 */
703 pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); 804 pdu_length = be32_to_cpu(smb_buffer->smb_buf_length);
704 805
705 cFYI(1, "rfc1002 length 0x%x", pdu_length+4); 806 cFYI(1, "RFC1002 header 0x%x", pdu_length);
706 if (!check_rfc1002_header(server, buf)) 807 if (!is_smb_response(server, buf[0]))
707 continue; 808 continue;
708 809
709 /* else length ok */ 810 /* make sure we have enough to get to the MID */
710 if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { 811 if (pdu_length < sizeof(struct smb_hdr) - 1 - 4) {
711 isLargeBuf = true; 812 cERROR(1, "SMB response too short (%u bytes)",
712 memcpy(bigbuf, smallbuf, 4); 813 pdu_length);
713 smb_buffer = (struct smb_hdr *)bigbuf; 814 cifs_reconnect(server);
714 buf = bigbuf; 815 wake_up(&server->response_q);
816 continue;
715 } 817 }
716 818
717 iov.iov_base = 4 + buf; 819 /* read down to the MID */
718 iov.iov_len = pdu_length; 820 length = cifs_read_from_socket(server, buf + 4,
719 rc = read_from_socket(server, &smb_msg, &iov, pdu_length, 821 sizeof(struct smb_hdr) - 1 - 4);
720 &total_read, false); 822 if (length < 0)
721 if (rc == 2)
722 break;
723 else if (rc == 1)
724 continue; 823 continue;
824 server->total_read += length;
725 825
726 total_read += 4; /* account for rfc1002 hdr */ 826 mid_entry = find_mid(server, smb_buffer);
727 827
728 dump_smb(smb_buffer, total_read); 828 if (!mid_entry || !mid_entry->receive)
829 length = standard_receive3(server, mid_entry);
830 else
831 length = mid_entry->receive(server, mid_entry);
729 832
730 /* 833 if (length < 0)
731 * We know that we received enough to get to the MID as we 834 continue;
732 * checked the pdu_length earlier. Now check to see
733 * if the rest of the header is OK. We borrow the length
734 * var for the rest of the loop to avoid a new stack var.
735 *
736 * 48 bytes is enough to display the header and a little bit
737 * into the payload for debugging purposes.
738 */
739 length = checkSMB(smb_buffer, smb_buffer->Mid, total_read);
740 if (length != 0)
741 cifs_dump_mem("Bad SMB: ", buf,
742 min_t(unsigned int, total_read, 48));
743 835
744 server->lstrp = jiffies; 836 if (server->large_buf) {
837 buf = server->bigbuf;
838 smb_buffer = (struct smb_hdr *)buf;
839 }
745 840
746 mid_entry = find_cifs_mid(server, smb_buffer, &length, 841 server->lstrp = jiffies;
747 isLargeBuf, &isMultiRsp, &bigbuf);
748 if (mid_entry != NULL) { 842 if (mid_entry != NULL) {
749 mid_entry->callback(mid_entry); 843 if (!mid_entry->multiRsp || mid_entry->multiEnd)
750 /* Was previous buf put in mpx struct for multi-rsp? */ 844 mid_entry->callback(mid_entry);
751 if (!isMultiRsp) { 845 } else if (!is_valid_oplock_break(smb_buffer, server)) {
752 /* smb buffer will be freed by user thread */
753 if (isLargeBuf)
754 bigbuf = NULL;
755 else
756 smallbuf = NULL;
757 }
758 } else if (length != 0) {
759 /* response sanity checks failed */
760 continue;
761 } else if (!is_valid_oplock_break(smb_buffer, server) &&
762 !isMultiRsp) {
763 cERROR(1, "No task to wake, unknown frame received! " 846 cERROR(1, "No task to wake, unknown frame received! "
764 "NumMids %d", atomic_read(&midCount)); 847 "NumMids %d", atomic_read(&midCount));
765 cifs_dump_mem("Received Data is: ", buf, 848 cifs_dump_mem("Received Data is: ", buf,
@@ -773,9 +856,9 @@ incomplete_rcv:
773 } /* end while !EXITING */ 856 } /* end while !EXITING */
774 857
775 /* buffer usually freed in free_mid - need to free it here on exit */ 858 /* buffer usually freed in free_mid - need to free it here on exit */
776 cifs_buf_release(bigbuf); 859 cifs_buf_release(server->bigbuf);
777 if (smallbuf) /* no sense logging a debug message if NULL */ 860 if (server->smallbuf) /* no sense logging a debug message if NULL */
778 cifs_small_buf_release(smallbuf); 861 cifs_small_buf_release(server->smallbuf);
779 862
780 task_to_wake = xchg(&server->tsk, NULL); 863 task_to_wake = xchg(&server->tsk, NULL);
781 clean_demultiplex_info(server); 864 clean_demultiplex_info(server);
@@ -827,6 +910,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
827{ 910{
828 char *value, *data, *end; 911 char *value, *data, *end;
829 char *mountdata_copy = NULL, *options; 912 char *mountdata_copy = NULL, *options;
913 int err;
830 unsigned int temp_len, i, j; 914 unsigned int temp_len, i, j;
831 char separator[2]; 915 char separator[2];
832 short int override_uid = -1; 916 short int override_uid = -1;
@@ -883,6 +967,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
883 cFYI(1, "Null separator not allowed"); 967 cFYI(1, "Null separator not allowed");
884 } 968 }
885 } 969 }
970 vol->backupuid_specified = false; /* no backup intent for a user */
971 vol->backupgid_specified = false; /* no backup intent for a group */
886 972
887 while ((data = strsep(&options, separator)) != NULL) { 973 while ((data = strsep(&options, separator)) != NULL) {
888 if (!*data) 974 if (!*data)
@@ -1442,6 +1528,22 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1442 vol->mfsymlinks = true; 1528 vol->mfsymlinks = true;
1443 } else if (strnicmp(data, "multiuser", 8) == 0) { 1529 } else if (strnicmp(data, "multiuser", 8) == 0) {
1444 vol->multiuser = true; 1530 vol->multiuser = true;
1531 } else if (!strnicmp(data, "backupuid", 9) && value && *value) {
1532 err = kstrtouint(value, 0, &vol->backupuid);
1533 if (err < 0) {
1534 cERROR(1, "%s: Invalid backupuid value",
1535 __func__);
1536 goto cifs_parse_mount_err;
1537 }
1538 vol->backupuid_specified = true;
1539 } else if (!strnicmp(data, "backupgid", 9) && value && *value) {
1540 err = kstrtouint(value, 0, &vol->backupgid);
1541 if (err < 0) {
1542 cERROR(1, "%s: Invalid backupgid value",
1543 __func__);
1544 goto cifs_parse_mount_err;
1545 }
1546 vol->backupgid_specified = true;
1445 } else 1547 } else
1446 printk(KERN_WARNING "CIFS: Unknown mount option %s\n", 1548 printk(KERN_WARNING "CIFS: Unknown mount option %s\n",
1447 data); 1549 data);
@@ -2209,16 +2311,16 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
2209 (new->mnt_cifs_flags & CIFS_MOUNT_MASK)) 2311 (new->mnt_cifs_flags & CIFS_MOUNT_MASK))
2210 return 0; 2312 return 0;
2211 2313
2212 if (old->rsize != new->rsize)
2213 return 0;
2214
2215 /* 2314 /*
2216 * We want to share sb only if we don't specify wsize or specified wsize 2315 * We want to share sb only if we don't specify an r/wsize or
2217 * is greater or equal than existing one. 2316 * specified r/wsize is greater than or equal to existing one.
2218 */ 2317 */
2219 if (new->wsize && new->wsize < old->wsize) 2318 if (new->wsize && new->wsize < old->wsize)
2220 return 0; 2319 return 0;
2221 2320
2321 if (new->rsize && new->rsize < old->rsize)
2322 return 0;
2323
2222 if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid) 2324 if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid)
2223 return 0; 2325 return 0;
2224 2326
@@ -2656,14 +2758,6 @@ void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon,
2656 CIFS_MOUNT_POSIX_PATHS; 2758 CIFS_MOUNT_POSIX_PATHS;
2657 } 2759 }
2658 2760
2659 if (cifs_sb && (cifs_sb->rsize > 127 * 1024)) {
2660 if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) {
2661 cifs_sb->rsize = 127 * 1024;
2662 cFYI(DBG2, "larger reads not supported by srv");
2663 }
2664 }
2665
2666
2667 cFYI(1, "Negotiate caps 0x%x", (int)cap); 2761 cFYI(1, "Negotiate caps 0x%x", (int)cap);
2668#ifdef CONFIG_CIFS_DEBUG2 2762#ifdef CONFIG_CIFS_DEBUG2
2669 if (cap & CIFS_UNIX_FCNTL_CAP) 2763 if (cap & CIFS_UNIX_FCNTL_CAP)
@@ -2708,31 +2802,19 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
2708 spin_lock_init(&cifs_sb->tlink_tree_lock); 2802 spin_lock_init(&cifs_sb->tlink_tree_lock);
2709 cifs_sb->tlink_tree = RB_ROOT; 2803 cifs_sb->tlink_tree = RB_ROOT;
2710 2804
2711 if (pvolume_info->rsize > CIFSMaxBufSize) {
2712 cERROR(1, "rsize %d too large, using MaxBufSize",
2713 pvolume_info->rsize);
2714 cifs_sb->rsize = CIFSMaxBufSize;
2715 } else if ((pvolume_info->rsize) &&
2716 (pvolume_info->rsize <= CIFSMaxBufSize))
2717 cifs_sb->rsize = pvolume_info->rsize;
2718 else /* default */
2719 cifs_sb->rsize = CIFSMaxBufSize;
2720
2721 if (cifs_sb->rsize < 2048) {
2722 cifs_sb->rsize = 2048;
2723 /* Windows ME may prefer this */
2724 cFYI(1, "readsize set to minimum: 2048");
2725 }
2726
2727 /* 2805 /*
2728 * Temporarily set wsize for matching superblock. If we end up using 2806 * Temporarily set r/wsize for matching superblock. If we end up using
2729 * new sb then cifs_negotiate_wsize will later negotiate it downward 2807 * new sb then client will later negotiate it downward if needed.
2730 * if needed.
2731 */ 2808 */
2809 cifs_sb->rsize = pvolume_info->rsize;
2732 cifs_sb->wsize = pvolume_info->wsize; 2810 cifs_sb->wsize = pvolume_info->wsize;
2733 2811
2734 cifs_sb->mnt_uid = pvolume_info->linux_uid; 2812 cifs_sb->mnt_uid = pvolume_info->linux_uid;
2735 cifs_sb->mnt_gid = pvolume_info->linux_gid; 2813 cifs_sb->mnt_gid = pvolume_info->linux_gid;
2814 if (pvolume_info->backupuid_specified)
2815 cifs_sb->mnt_backupuid = pvolume_info->backupuid;
2816 if (pvolume_info->backupgid_specified)
2817 cifs_sb->mnt_backupgid = pvolume_info->backupgid;
2736 cifs_sb->mnt_file_mode = pvolume_info->file_mode; 2818 cifs_sb->mnt_file_mode = pvolume_info->file_mode;
2737 cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; 2819 cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
2738 cFYI(1, "file mode: 0x%x dir mode: 0x%x", 2820 cFYI(1, "file mode: 0x%x dir mode: 0x%x",
@@ -2763,6 +2845,10 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
2763 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD; 2845 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD;
2764 if (pvolume_info->cifs_acl) 2846 if (pvolume_info->cifs_acl)
2765 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL; 2847 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
2848 if (pvolume_info->backupuid_specified)
2849 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPUID;
2850 if (pvolume_info->backupgid_specified)
2851 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPGID;
2766 if (pvolume_info->override_uid) 2852 if (pvolume_info->override_uid)
2767 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID; 2853 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID;
2768 if (pvolume_info->override_gid) 2854 if (pvolume_info->override_gid)
@@ -2795,29 +2881,41 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
2795} 2881}
2796 2882
2797/* 2883/*
2798 * When the server supports very large writes via POSIX extensions, we can 2884 * When the server supports very large reads and writes via POSIX extensions,
2799 * allow up to 2^24-1, minus the size of a WRITE_AND_X header, not including 2885 * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not
2800 * the RFC1001 length. 2886 * including the RFC1001 length.
2801 * 2887 *
2802 * Note that this might make for "interesting" allocation problems during 2888 * Note that this might make for "interesting" allocation problems during
2803 * writeback however as we have to allocate an array of pointers for the 2889 * writeback however as we have to allocate an array of pointers for the
2804 * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096. 2890 * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096.
2891 *
2892 * For reads, there is a similar problem as we need to allocate an array
2893 * of kvecs to handle the receive, though that should only need to be done
2894 * once.
2805 */ 2895 */
2806#define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4) 2896#define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4)
2897#define CIFS_MAX_RSIZE ((1<<24) - sizeof(READ_RSP) + 4)
2807 2898
2808/* 2899/*
2809 * When the server doesn't allow large posix writes, only allow a wsize of 2900 * When the server doesn't allow large posix writes, only allow a rsize/wsize
2810 * 128k minus the size of the WRITE_AND_X header. That allows for a write up 2901 * of 2^17-1 minus the size of the call header. That allows for a read or
2811 * to the maximum size described by RFC1002. 2902 * write up to the maximum size described by RFC1002.
2812 */ 2903 */
2813#define CIFS_MAX_RFC1002_WSIZE (128 * 1024 - sizeof(WRITE_REQ) + 4) 2904#define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4)
2905#define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4)
2814 2906
2815/* 2907/*
2816 * The default wsize is 1M. find_get_pages seems to return a maximum of 256 2908 * The default wsize is 1M. find_get_pages seems to return a maximum of 256
2817 * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill 2909 * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill
2818 * a single wsize request with a single call. 2910 * a single wsize request with a single call.
2819 */ 2911 */
2820#define CIFS_DEFAULT_WSIZE (1024 * 1024) 2912#define CIFS_DEFAULT_IOSIZE (1024 * 1024)
2913
2914/*
2915 * Windows only supports a max of 60k reads. Default to that when posix
2916 * extensions aren't in force.
2917 */
2918#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024)
2821 2919
2822static unsigned int 2920static unsigned int
2823cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) 2921cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
@@ -2825,7 +2923,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
2825 __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); 2923 __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
2826 struct TCP_Server_Info *server = tcon->ses->server; 2924 struct TCP_Server_Info *server = tcon->ses->server;
2827 unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize : 2925 unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize :
2828 CIFS_DEFAULT_WSIZE; 2926 CIFS_DEFAULT_IOSIZE;
2829 2927
2830 /* can server support 24-bit write sizes? (via UNIX extensions) */ 2928 /* can server support 24-bit write sizes? (via UNIX extensions) */
2831 if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) 2929 if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
@@ -2848,6 +2946,50 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
2848 return wsize; 2946 return wsize;
2849} 2947}
2850 2948
2949static unsigned int
2950cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
2951{
2952 __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
2953 struct TCP_Server_Info *server = tcon->ses->server;
2954 unsigned int rsize, defsize;
2955
2956 /*
2957 * Set default value...
2958 *
2959 * HACK alert! Ancient servers have very small buffers. Even though
2960 * MS-CIFS indicates that servers are only limited by the client's
2961 * bufsize for reads, testing against win98se shows that it throws
2962 * INVALID_PARAMETER errors if you try to request too large a read.
2963 *
2964 * If the server advertises a MaxBufferSize of less than one page,
2965 * assume that it also can't satisfy reads larger than that either.
2966 *
2967 * FIXME: Is there a better heuristic for this?
2968 */
2969 if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_READ_CAP))
2970 defsize = CIFS_DEFAULT_IOSIZE;
2971 else if (server->capabilities & CAP_LARGE_READ_X)
2972 defsize = CIFS_DEFAULT_NON_POSIX_RSIZE;
2973 else if (server->maxBuf >= PAGE_CACHE_SIZE)
2974 defsize = CIFSMaxBufSize;
2975 else
2976 defsize = server->maxBuf - sizeof(READ_RSP);
2977
2978 rsize = pvolume_info->rsize ? pvolume_info->rsize : defsize;
2979
2980 /*
2981 * no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to
2982 * the client's MaxBufferSize.
2983 */
2984 if (!(server->capabilities & CAP_LARGE_READ_X))
2985 rsize = min_t(unsigned int, CIFSMaxBufSize, rsize);
2986
2987 /* hard limit of CIFS_MAX_RSIZE */
2988 rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE);
2989
2990 return rsize;
2991}
2992
2851static int 2993static int
2852is_path_accessible(int xid, struct cifs_tcon *tcon, 2994is_path_accessible(int xid, struct cifs_tcon *tcon,
2853 struct cifs_sb_info *cifs_sb, const char *full_path) 2995 struct cifs_sb_info *cifs_sb, const char *full_path)
@@ -3041,6 +3183,22 @@ cifs_get_volume_info(char *mount_data, const char *devname)
3041 return volume_info; 3183 return volume_info;
3042} 3184}
3043 3185
3186/* make sure ra_pages is a multiple of rsize */
3187static inline unsigned int
3188cifs_ra_pages(struct cifs_sb_info *cifs_sb)
3189{
3190 unsigned int reads;
3191 unsigned int rsize_pages = cifs_sb->rsize / PAGE_CACHE_SIZE;
3192
3193 if (rsize_pages >= default_backing_dev_info.ra_pages)
3194 return default_backing_dev_info.ra_pages;
3195 else if (rsize_pages == 0)
3196 return rsize_pages;
3197
3198 reads = default_backing_dev_info.ra_pages / rsize_pages;
3199 return reads * rsize_pages;
3200}
3201
3044int 3202int
3045cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) 3203cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
3046{ 3204{
@@ -3059,8 +3217,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
3059 if (rc) 3217 if (rc)
3060 return rc; 3218 return rc;
3061 3219
3062 cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages;
3063
3064#ifdef CONFIG_CIFS_DFS_UPCALL 3220#ifdef CONFIG_CIFS_DFS_UPCALL
3065try_mount_again: 3221try_mount_again:
3066 /* cleanup activities if we're chasing a referral */ 3222 /* cleanup activities if we're chasing a referral */
@@ -3125,15 +3281,11 @@ try_mount_again:
3125 CIFSSMBQFSAttributeInfo(xid, tcon); 3281 CIFSSMBQFSAttributeInfo(xid, tcon);
3126 } 3282 }
3127 3283
3128 if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
3129 cifs_sb->rsize = 1024 * 127;
3130 cFYI(DBG2, "no very large read support, rsize now 127K");
3131 }
3132 if (!(tcon->ses->capabilities & CAP_LARGE_READ_X))
3133 cifs_sb->rsize = min(cifs_sb->rsize,
3134 (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
3135
3136 cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info); 3284 cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info);
3285 cifs_sb->rsize = cifs_negotiate_rsize(tcon, volume_info);
3286
3287 /* tune readahead according to rsize */
3288 cifs_sb->bdi.ra_pages = cifs_ra_pages(cifs_sb);
3137 3289
3138remote_path_check: 3290remote_path_check:
3139#ifdef CONFIG_CIFS_DFS_UPCALL 3291#ifdef CONFIG_CIFS_DFS_UPCALL
@@ -3301,7 +3453,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
3301 else 3453 else
3302#endif /* CIFS_WEAK_PW_HASH */ 3454#endif /* CIFS_WEAK_PW_HASH */
3303 rc = SMBNTencrypt(tcon->password, ses->server->cryptkey, 3455 rc = SMBNTencrypt(tcon->password, ses->server->cryptkey,
3304 bcc_ptr); 3456 bcc_ptr, nls_codepage);
3305 3457
3306 bcc_ptr += CIFS_AUTH_RESP_SIZE; 3458 bcc_ptr += CIFS_AUTH_RESP_SIZE;
3307 if (ses->capabilities & CAP_UNICODE) { 3459 if (ses->capabilities & CAP_UNICODE) {
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 72d448bf96c..d7eeb9d3ed6 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -171,7 +171,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
171 } 171 }
172 tcon = tlink_tcon(tlink); 172 tcon = tlink_tcon(tlink);
173 173
174 if (oplockEnabled) 174 if (enable_oplocks)
175 oplock = REQ_OPLOCK; 175 oplock = REQ_OPLOCK;
176 176
177 if (nd) 177 if (nd)
@@ -244,6 +244,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
244 if (!tcon->unix_ext && (mode & S_IWUGO) == 0) 244 if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
245 create_options |= CREATE_OPTION_READONLY; 245 create_options |= CREATE_OPTION_READONLY;
246 246
247 if (backup_cred(cifs_sb))
248 create_options |= CREATE_OPEN_BACKUP_INTENT;
249
247 if (tcon->ses->capabilities & CAP_NT_SMBS) 250 if (tcon->ses->capabilities & CAP_NT_SMBS)
248 rc = CIFSSMBOpen(xid, tcon, full_path, disposition, 251 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
249 desiredAccess, create_options, 252 desiredAccess, create_options,
@@ -357,6 +360,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
357{ 360{
358 int rc = -EPERM; 361 int rc = -EPERM;
359 int xid; 362 int xid;
363 int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
360 struct cifs_sb_info *cifs_sb; 364 struct cifs_sb_info *cifs_sb;
361 struct tcon_link *tlink; 365 struct tcon_link *tlink;
362 struct cifs_tcon *pTcon; 366 struct cifs_tcon *pTcon;
@@ -431,9 +435,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
431 return rc; 435 return rc;
432 } 436 }
433 437
434 /* FIXME: would WRITE_OWNER | WRITE_DAC be better? */ 438 if (backup_cred(cifs_sb))
439 create_options |= CREATE_OPEN_BACKUP_INTENT;
440
435 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE, 441 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE,
436 GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, 442 GENERIC_WRITE, create_options,
437 &fileHandle, &oplock, buf, cifs_sb->local_nls, 443 &fileHandle, &oplock, buf, cifs_sb->local_nls,
438 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 444 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
439 if (rc) 445 if (rc)
@@ -642,8 +648,16 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
642 if (direntry->d_inode) { 648 if (direntry->d_inode) {
643 if (cifs_revalidate_dentry(direntry)) 649 if (cifs_revalidate_dentry(direntry))
644 return 0; 650 return 0;
645 else 651 else {
652 /*
653 * Forcibly invalidate automounting directory inodes
654 * (remote DFS directories) so to have them
655 * instantiated again for automount
656 */
657 if (IS_AUTOMOUNT(direntry->d_inode))
658 return 0;
646 return 1; 659 return 1;
660 }
647 } 661 }
648 662
649 /* 663 /*
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 55d87ac5200..9c7ecdccf2f 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -45,7 +45,7 @@
45#include "cifs_debug.h" 45#include "cifs_debug.h"
46#include "cifsfs.h" 46#include "cifsfs.h"
47 47
48#ifdef CIFS_NFSD_EXPORT 48#ifdef CONFIG_CIFS_NFSD_EXPORT
49static struct dentry *cifs_get_parent(struct dentry *dentry) 49static struct dentry *cifs_get_parent(struct dentry *dentry)
50{ 50{
51 /* BB need to add code here eventually to enable export via NFSD */ 51 /* BB need to add code here eventually to enable export via NFSD */
@@ -63,5 +63,5 @@ const struct export_operations cifs_export_ops = {
63 .encode_fs = */ 63 .encode_fs = */
64}; 64};
65 65
66#endif /* CIFS_NFSD_EXPORT */ 66#endif /* CONFIG_CIFS_NFSD_EXPORT */
67 67
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 9f41a10523a..cf0b1539b32 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -32,6 +32,7 @@
32#include <linux/delay.h> 32#include <linux/delay.h>
33#include <linux/mount.h> 33#include <linux/mount.h>
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <linux/swap.h>
35#include <asm/div64.h> 36#include <asm/div64.h>
36#include "cifsfs.h" 37#include "cifsfs.h"
37#include "cifspdu.h" 38#include "cifspdu.h"
@@ -174,6 +175,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
174 int rc; 175 int rc;
175 int desiredAccess; 176 int desiredAccess;
176 int disposition; 177 int disposition;
178 int create_options = CREATE_NOT_DIR;
177 FILE_ALL_INFO *buf; 179 FILE_ALL_INFO *buf;
178 180
179 desiredAccess = cifs_convert_flags(f_flags); 181 desiredAccess = cifs_convert_flags(f_flags);
@@ -210,9 +212,12 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
210 if (!buf) 212 if (!buf)
211 return -ENOMEM; 213 return -ENOMEM;
212 214
215 if (backup_cred(cifs_sb))
216 create_options |= CREATE_OPEN_BACKUP_INTENT;
217
213 if (tcon->ses->capabilities & CAP_NT_SMBS) 218 if (tcon->ses->capabilities & CAP_NT_SMBS)
214 rc = CIFSSMBOpen(xid, tcon, full_path, disposition, 219 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
215 desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf, 220 desiredAccess, create_options, pnetfid, poplock, buf,
216 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags 221 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
217 & CIFS_MOUNT_MAP_SPECIAL_CHR); 222 & CIFS_MOUNT_MAP_SPECIAL_CHR);
218 else 223 else
@@ -258,8 +263,6 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
258 pCifsFile->invalidHandle = false; 263 pCifsFile->invalidHandle = false;
259 pCifsFile->tlink = cifs_get_tlink(tlink); 264 pCifsFile->tlink = cifs_get_tlink(tlink);
260 mutex_init(&pCifsFile->fh_mutex); 265 mutex_init(&pCifsFile->fh_mutex);
261 mutex_init(&pCifsFile->lock_mutex);
262 INIT_LIST_HEAD(&pCifsFile->llist);
263 INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break); 266 INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
264 267
265 spin_lock(&cifs_file_list_lock); 268 spin_lock(&cifs_file_list_lock);
@@ -272,11 +275,14 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
272 spin_unlock(&cifs_file_list_lock); 275 spin_unlock(&cifs_file_list_lock);
273 276
274 cifs_set_oplock_level(pCifsInode, oplock); 277 cifs_set_oplock_level(pCifsInode, oplock);
278 pCifsInode->can_cache_brlcks = pCifsInode->clientCanCacheAll;
275 279
276 file->private_data = pCifsFile; 280 file->private_data = pCifsFile;
277 return pCifsFile; 281 return pCifsFile;
278} 282}
279 283
284static void cifs_del_lock_waiters(struct cifsLockInfo *lock);
285
280/* 286/*
281 * Release a reference on the file private data. This may involve closing 287 * Release a reference on the file private data. This may involve closing
282 * the filehandle out on the server. Must be called without holding 288 * the filehandle out on the server. Must be called without holding
@@ -327,12 +333,15 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
327 /* Delete any outstanding lock records. We'll lose them when the file 333 /* Delete any outstanding lock records. We'll lose them when the file
328 * is closed anyway. 334 * is closed anyway.
329 */ 335 */
330 mutex_lock(&cifs_file->lock_mutex); 336 mutex_lock(&cifsi->lock_mutex);
331 list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) { 337 list_for_each_entry_safe(li, tmp, &cifsi->llist, llist) {
338 if (li->netfid != cifs_file->netfid)
339 continue;
332 list_del(&li->llist); 340 list_del(&li->llist);
341 cifs_del_lock_waiters(li);
333 kfree(li); 342 kfree(li);
334 } 343 }
335 mutex_unlock(&cifs_file->lock_mutex); 344 mutex_unlock(&cifsi->lock_mutex);
336 345
337 cifs_put_tlink(cifs_file->tlink); 346 cifs_put_tlink(cifs_file->tlink);
338 dput(cifs_file->dentry); 347 dput(cifs_file->dentry);
@@ -371,7 +380,7 @@ int cifs_open(struct inode *inode, struct file *file)
371 cFYI(1, "inode = 0x%p file flags are 0x%x for %s", 380 cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
372 inode, file->f_flags, full_path); 381 inode, file->f_flags, full_path);
373 382
374 if (oplockEnabled) 383 if (enable_oplocks)
375 oplock = REQ_OPLOCK; 384 oplock = REQ_OPLOCK;
376 else 385 else
377 oplock = 0; 386 oplock = 0;
@@ -465,6 +474,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
465 char *full_path = NULL; 474 char *full_path = NULL;
466 int desiredAccess; 475 int desiredAccess;
467 int disposition = FILE_OPEN; 476 int disposition = FILE_OPEN;
477 int create_options = CREATE_NOT_DIR;
468 __u16 netfid; 478 __u16 netfid;
469 479
470 xid = GetXid(); 480 xid = GetXid();
@@ -495,7 +505,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
495 cFYI(1, "inode = 0x%p file flags 0x%x for %s", 505 cFYI(1, "inode = 0x%p file flags 0x%x for %s",
496 inode, pCifsFile->f_flags, full_path); 506 inode, pCifsFile->f_flags, full_path);
497 507
498 if (oplockEnabled) 508 if (enable_oplocks)
499 oplock = REQ_OPLOCK; 509 oplock = REQ_OPLOCK;
500 else 510 else
501 oplock = 0; 511 oplock = 0;
@@ -524,6 +534,9 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
524 534
525 desiredAccess = cifs_convert_flags(pCifsFile->f_flags); 535 desiredAccess = cifs_convert_flags(pCifsFile->f_flags);
526 536
537 if (backup_cred(cifs_sb))
538 create_options |= CREATE_OPEN_BACKUP_INTENT;
539
527 /* Can not refresh inode by passing in file_info buf to be returned 540 /* Can not refresh inode by passing in file_info buf to be returned
528 by SMBOpen and then calling get_inode_info with returned buf 541 by SMBOpen and then calling get_inode_info with returned buf
529 since file might have write behind data that needs to be flushed 542 since file might have write behind data that needs to be flushed
@@ -531,7 +544,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
531 that inode was not dirty locally we could do this */ 544 that inode was not dirty locally we could do this */
532 545
533 rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess, 546 rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess,
534 CREATE_NOT_DIR, &netfid, &oplock, NULL, 547 create_options, &netfid, &oplock, NULL,
535 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 548 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
536 CIFS_MOUNT_MAP_SPECIAL_CHR); 549 CIFS_MOUNT_MAP_SPECIAL_CHR);
537 if (rc) { 550 if (rc) {
@@ -631,219 +644,687 @@ int cifs_closedir(struct inode *inode, struct file *file)
631 return rc; 644 return rc;
632} 645}
633 646
634static int store_file_lock(struct cifsFileInfo *fid, __u64 len, 647static struct cifsLockInfo *
635 __u64 offset, __u8 lockType) 648cifs_lock_init(__u64 offset, __u64 length, __u8 type, __u16 netfid)
636{ 649{
637 struct cifsLockInfo *li = 650 struct cifsLockInfo *lock =
638 kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); 651 kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL);
639 if (li == NULL) 652 if (!lock)
640 return -ENOMEM; 653 return lock;
641 li->offset = offset; 654 lock->offset = offset;
642 li->length = len; 655 lock->length = length;
643 li->type = lockType; 656 lock->type = type;
644 mutex_lock(&fid->lock_mutex); 657 lock->netfid = netfid;
645 list_add(&li->llist, &fid->llist); 658 lock->pid = current->tgid;
646 mutex_unlock(&fid->lock_mutex); 659 INIT_LIST_HEAD(&lock->blist);
647 return 0; 660 init_waitqueue_head(&lock->block_q);
661 return lock;
648} 662}
649 663
650int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) 664static void
665cifs_del_lock_waiters(struct cifsLockInfo *lock)
651{ 666{
652 int rc, xid; 667 struct cifsLockInfo *li, *tmp;
653 __u32 numLock = 0; 668 list_for_each_entry_safe(li, tmp, &lock->blist, blist) {
654 __u32 numUnlock = 0; 669 list_del_init(&li->blist);
655 __u64 length; 670 wake_up(&li->block_q);
656 bool wait_flag = false; 671 }
657 struct cifs_sb_info *cifs_sb; 672}
673
674static bool
675__cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset,
676 __u64 length, __u8 type, __u16 netfid,
677 struct cifsLockInfo **conf_lock)
678{
679 struct cifsLockInfo *li, *tmp;
680
681 list_for_each_entry_safe(li, tmp, &cinode->llist, llist) {
682 if (offset + length <= li->offset ||
683 offset >= li->offset + li->length)
684 continue;
685 else if ((type & LOCKING_ANDX_SHARED_LOCK) &&
686 ((netfid == li->netfid && current->tgid == li->pid) ||
687 type == li->type))
688 continue;
689 else {
690 *conf_lock = li;
691 return true;
692 }
693 }
694 return false;
695}
696
697static bool
698cifs_find_lock_conflict(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock,
699 struct cifsLockInfo **conf_lock)
700{
701 return __cifs_find_lock_conflict(cinode, lock->offset, lock->length,
702 lock->type, lock->netfid, conf_lock);
703}
704
705static int
706cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length,
707 __u8 type, __u16 netfid, struct file_lock *flock)
708{
709 int rc = 0;
710 struct cifsLockInfo *conf_lock;
711 bool exist;
712
713 mutex_lock(&cinode->lock_mutex);
714
715 exist = __cifs_find_lock_conflict(cinode, offset, length, type, netfid,
716 &conf_lock);
717 if (exist) {
718 flock->fl_start = conf_lock->offset;
719 flock->fl_end = conf_lock->offset + conf_lock->length - 1;
720 flock->fl_pid = conf_lock->pid;
721 if (conf_lock->type & LOCKING_ANDX_SHARED_LOCK)
722 flock->fl_type = F_RDLCK;
723 else
724 flock->fl_type = F_WRLCK;
725 } else if (!cinode->can_cache_brlcks)
726 rc = 1;
727 else
728 flock->fl_type = F_UNLCK;
729
730 mutex_unlock(&cinode->lock_mutex);
731 return rc;
732}
733
734static void
735cifs_lock_add(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock)
736{
737 mutex_lock(&cinode->lock_mutex);
738 list_add_tail(&lock->llist, &cinode->llist);
739 mutex_unlock(&cinode->lock_mutex);
740}
741
742static int
743cifs_lock_add_if(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock,
744 bool wait)
745{
746 struct cifsLockInfo *conf_lock;
747 bool exist;
748 int rc = 0;
749
750try_again:
751 exist = false;
752 mutex_lock(&cinode->lock_mutex);
753
754 exist = cifs_find_lock_conflict(cinode, lock, &conf_lock);
755 if (!exist && cinode->can_cache_brlcks) {
756 list_add_tail(&lock->llist, &cinode->llist);
757 mutex_unlock(&cinode->lock_mutex);
758 return rc;
759 }
760
761 if (!exist)
762 rc = 1;
763 else if (!wait)
764 rc = -EACCES;
765 else {
766 list_add_tail(&lock->blist, &conf_lock->blist);
767 mutex_unlock(&cinode->lock_mutex);
768 rc = wait_event_interruptible(lock->block_q,
769 (lock->blist.prev == &lock->blist) &&
770 (lock->blist.next == &lock->blist));
771 if (!rc)
772 goto try_again;
773 mutex_lock(&cinode->lock_mutex);
774 list_del_init(&lock->blist);
775 }
776
777 mutex_unlock(&cinode->lock_mutex);
778 return rc;
779}
780
781static int
782cifs_posix_lock_test(struct file *file, struct file_lock *flock)
783{
784 int rc = 0;
785 struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
786 unsigned char saved_type = flock->fl_type;
787
788 if ((flock->fl_flags & FL_POSIX) == 0)
789 return 1;
790
791 mutex_lock(&cinode->lock_mutex);
792 posix_test_lock(file, flock);
793
794 if (flock->fl_type == F_UNLCK && !cinode->can_cache_brlcks) {
795 flock->fl_type = saved_type;
796 rc = 1;
797 }
798
799 mutex_unlock(&cinode->lock_mutex);
800 return rc;
801}
802
803static int
804cifs_posix_lock_set(struct file *file, struct file_lock *flock)
805{
806 struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
807 int rc = 1;
808
809 if ((flock->fl_flags & FL_POSIX) == 0)
810 return rc;
811
812 mutex_lock(&cinode->lock_mutex);
813 if (!cinode->can_cache_brlcks) {
814 mutex_unlock(&cinode->lock_mutex);
815 return rc;
816 }
817 rc = posix_lock_file_wait(file, flock);
818 mutex_unlock(&cinode->lock_mutex);
819 return rc;
820}
821
822static int
823cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
824{
825 int xid, rc = 0, stored_rc;
826 struct cifsLockInfo *li, *tmp;
658 struct cifs_tcon *tcon; 827 struct cifs_tcon *tcon;
659 __u16 netfid; 828 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
660 __u8 lockType = LOCKING_ANDX_LARGE_FILES; 829 unsigned int num, max_num;
661 bool posix_locking = 0; 830 LOCKING_ANDX_RANGE *buf, *cur;
831 int types[] = {LOCKING_ANDX_LARGE_FILES,
832 LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES};
833 int i;
662 834
663 length = 1 + pfLock->fl_end - pfLock->fl_start;
664 rc = -EACCES;
665 xid = GetXid(); 835 xid = GetXid();
836 tcon = tlink_tcon(cfile->tlink);
666 837
667 cFYI(1, "Lock parm: 0x%x flockflags: " 838 mutex_lock(&cinode->lock_mutex);
668 "0x%x flocktype: 0x%x start: %lld end: %lld", 839 if (!cinode->can_cache_brlcks) {
669 cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start, 840 mutex_unlock(&cinode->lock_mutex);
670 pfLock->fl_end); 841 FreeXid(xid);
842 return rc;
843 }
671 844
672 if (pfLock->fl_flags & FL_POSIX) 845 max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) /
846 sizeof(LOCKING_ANDX_RANGE);
847 buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
848 if (!buf) {
849 mutex_unlock(&cinode->lock_mutex);
850 FreeXid(xid);
851 return rc;
852 }
853
854 for (i = 0; i < 2; i++) {
855 cur = buf;
856 num = 0;
857 list_for_each_entry_safe(li, tmp, &cinode->llist, llist) {
858 if (li->type != types[i])
859 continue;
860 cur->Pid = cpu_to_le16(li->pid);
861 cur->LengthLow = cpu_to_le32((u32)li->length);
862 cur->LengthHigh = cpu_to_le32((u32)(li->length>>32));
863 cur->OffsetLow = cpu_to_le32((u32)li->offset);
864 cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32));
865 if (++num == max_num) {
866 stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
867 li->type, 0, num, buf);
868 if (stored_rc)
869 rc = stored_rc;
870 cur = buf;
871 num = 0;
872 } else
873 cur++;
874 }
875
876 if (num) {
877 stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
878 types[i], 0, num, buf);
879 if (stored_rc)
880 rc = stored_rc;
881 }
882 }
883
884 cinode->can_cache_brlcks = false;
885 mutex_unlock(&cinode->lock_mutex);
886
887 kfree(buf);
888 FreeXid(xid);
889 return rc;
890}
891
/* copied from fs/locks.c with a name change */
/*
 * Iterate over every file_lock chained on @inode->i_flock.  @lockp is a
 * struct file_lock ** cursor: each iteration leaves it pointing at the
 * link that holds the current lock, so the body reads the lock as
 * *lockp.  The callers in this file wrap the walk in lock_flocks() /
 * unlock_flocks() to keep the list stable.
 */
#define cifs_for_each_lock(inode, lockp) \
	for (lockp = &inode->i_flock; *lockp != NULL; \
	     lockp = &(*lockp)->fl_next)
896
897static int
898cifs_push_posix_locks(struct cifsFileInfo *cfile)
899{
900 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
901 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
902 struct file_lock *flock, **before;
903 struct cifsLockInfo *lck, *tmp;
904 int rc = 0, xid, type;
905 __u64 length;
906 struct list_head locks_to_send;
907
908 xid = GetXid();
909
910 mutex_lock(&cinode->lock_mutex);
911 if (!cinode->can_cache_brlcks) {
912 mutex_unlock(&cinode->lock_mutex);
913 FreeXid(xid);
914 return rc;
915 }
916
917 INIT_LIST_HEAD(&locks_to_send);
918
919 lock_flocks();
920 cifs_for_each_lock(cfile->dentry->d_inode, before) {
921 flock = *before;
922 length = 1 + flock->fl_end - flock->fl_start;
923 if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK)
924 type = CIFS_RDLCK;
925 else
926 type = CIFS_WRLCK;
927
928 lck = cifs_lock_init(flock->fl_start, length, type,
929 cfile->netfid);
930 if (!lck) {
931 rc = -ENOMEM;
932 goto send_locks;
933 }
934 lck->pid = flock->fl_pid;
935
936 list_add_tail(&lck->llist, &locks_to_send);
937 }
938
939send_locks:
940 unlock_flocks();
941
942 list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
943 struct file_lock tmp_lock;
944 int stored_rc;
945
946 tmp_lock.fl_start = lck->offset;
947 stored_rc = CIFSSMBPosixLock(xid, tcon, lck->netfid, lck->pid,
948 0, lck->length, &tmp_lock,
949 lck->type, 0);
950 if (stored_rc)
951 rc = stored_rc;
952 list_del(&lck->llist);
953 kfree(lck);
954 }
955
956 cinode->can_cache_brlcks = false;
957 mutex_unlock(&cinode->lock_mutex);
958
959 FreeXid(xid);
960 return rc;
961}
962
963static int
964cifs_push_locks(struct cifsFileInfo *cfile)
965{
966 struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
967 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
968
969 if ((tcon->ses->capabilities & CAP_UNIX) &&
970 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
971 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
972 return cifs_push_posix_locks(cfile);
973
974 return cifs_push_mandatory_locks(cfile);
975}
976
977static void
978cifs_read_flock(struct file_lock *flock, __u8 *type, int *lock, int *unlock,
979 bool *wait_flag)
980{
981 if (flock->fl_flags & FL_POSIX)
673 cFYI(1, "Posix"); 982 cFYI(1, "Posix");
674 if (pfLock->fl_flags & FL_FLOCK) 983 if (flock->fl_flags & FL_FLOCK)
675 cFYI(1, "Flock"); 984 cFYI(1, "Flock");
676 if (pfLock->fl_flags & FL_SLEEP) { 985 if (flock->fl_flags & FL_SLEEP) {
677 cFYI(1, "Blocking lock"); 986 cFYI(1, "Blocking lock");
678 wait_flag = true; 987 *wait_flag = true;
679 } 988 }
680 if (pfLock->fl_flags & FL_ACCESS) 989 if (flock->fl_flags & FL_ACCESS)
681 cFYI(1, "Process suspended by mandatory locking - " 990 cFYI(1, "Process suspended by mandatory locking - "
682 "not implemented yet"); 991 "not implemented yet");
683 if (pfLock->fl_flags & FL_LEASE) 992 if (flock->fl_flags & FL_LEASE)
684 cFYI(1, "Lease on file - not implemented yet"); 993 cFYI(1, "Lease on file - not implemented yet");
685 if (pfLock->fl_flags & 994 if (flock->fl_flags &
686 (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE))) 995 (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE)))
687 cFYI(1, "Unknown lock flags 0x%x", pfLock->fl_flags); 996 cFYI(1, "Unknown lock flags 0x%x", flock->fl_flags);
688 997
689 if (pfLock->fl_type == F_WRLCK) { 998 *type = LOCKING_ANDX_LARGE_FILES;
999 if (flock->fl_type == F_WRLCK) {
690 cFYI(1, "F_WRLCK "); 1000 cFYI(1, "F_WRLCK ");
691 numLock = 1; 1001 *lock = 1;
692 } else if (pfLock->fl_type == F_UNLCK) { 1002 } else if (flock->fl_type == F_UNLCK) {
693 cFYI(1, "F_UNLCK"); 1003 cFYI(1, "F_UNLCK");
694 numUnlock = 1; 1004 *unlock = 1;
695 /* Check if unlock includes more than 1005 /* Check if unlock includes more than one lock range */
696 one lock range */ 1006 } else if (flock->fl_type == F_RDLCK) {
697 } else if (pfLock->fl_type == F_RDLCK) {
698 cFYI(1, "F_RDLCK"); 1007 cFYI(1, "F_RDLCK");
699 lockType |= LOCKING_ANDX_SHARED_LOCK; 1008 *type |= LOCKING_ANDX_SHARED_LOCK;
700 numLock = 1; 1009 *lock = 1;
701 } else if (pfLock->fl_type == F_EXLCK) { 1010 } else if (flock->fl_type == F_EXLCK) {
702 cFYI(1, "F_EXLCK"); 1011 cFYI(1, "F_EXLCK");
703 numLock = 1; 1012 *lock = 1;
704 } else if (pfLock->fl_type == F_SHLCK) { 1013 } else if (flock->fl_type == F_SHLCK) {
705 cFYI(1, "F_SHLCK"); 1014 cFYI(1, "F_SHLCK");
706 lockType |= LOCKING_ANDX_SHARED_LOCK; 1015 *type |= LOCKING_ANDX_SHARED_LOCK;
707 numLock = 1; 1016 *lock = 1;
708 } else 1017 } else
709 cFYI(1, "Unknown type of lock"); 1018 cFYI(1, "Unknown type of lock");
1019}
710 1020
711 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1021static int
712 tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink); 1022cifs_getlk(struct file *file, struct file_lock *flock, __u8 type,
713 netfid = ((struct cifsFileInfo *)file->private_data)->netfid; 1023 bool wait_flag, bool posix_lck, int xid)
1024{
1025 int rc = 0;
1026 __u64 length = 1 + flock->fl_end - flock->fl_start;
1027 struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
1028 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
1029 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
1030 __u16 netfid = cfile->netfid;
714 1031
715 if ((tcon->ses->capabilities & CAP_UNIX) && 1032 if (posix_lck) {
716 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && 1033 int posix_lock_type;
717 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) 1034
718 posix_locking = 1; 1035 rc = cifs_posix_lock_test(file, flock);
719 /* BB add code here to normalize offset and length to 1036 if (!rc)
720 account for negative length which we can not accept over the
721 wire */
722 if (IS_GETLK(cmd)) {
723 if (posix_locking) {
724 int posix_lock_type;
725 if (lockType & LOCKING_ANDX_SHARED_LOCK)
726 posix_lock_type = CIFS_RDLCK;
727 else
728 posix_lock_type = CIFS_WRLCK;
729 rc = CIFSSMBPosixLock(xid, tcon, netfid, 1 /* get */,
730 length, pfLock, posix_lock_type,
731 wait_flag);
732 FreeXid(xid);
733 return rc; 1037 return rc;
734 }
735 1038
736 /* BB we could chain these into one lock request BB */ 1039 if (type & LOCKING_ANDX_SHARED_LOCK)
737 rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, 1040 posix_lock_type = CIFS_RDLCK;
738 0, 1, lockType, 0 /* wait flag */, 0); 1041 else
739 if (rc == 0) { 1042 posix_lock_type = CIFS_WRLCK;
740 rc = CIFSSMBLock(xid, tcon, netfid, length, 1043 rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid,
741 pfLock->fl_start, 1 /* numUnlock */ , 1044 1 /* get */, length, flock,
742 0 /* numLock */ , lockType, 1045 posix_lock_type, wait_flag);
743 0 /* wait flag */, 0); 1046 return rc;
744 pfLock->fl_type = F_UNLCK; 1047 }
745 if (rc != 0)
746 cERROR(1, "Error unlocking previously locked "
747 "range %d during test of lock", rc);
748 rc = 0;
749 1048
750 } else { 1049 rc = cifs_lock_test(cinode, flock->fl_start, length, type, netfid,
751 /* if rc == ERR_SHARING_VIOLATION ? */ 1050 flock);
752 rc = 0; 1051 if (!rc)
1052 return rc;
1053
1054 /* BB we could chain these into one lock request BB */
1055 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
1056 flock->fl_start, 0, 1, type, 0, 0);
1057 if (rc == 0) {
1058 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid,
1059 length, flock->fl_start, 1, 0,
1060 type, 0, 0);
1061 flock->fl_type = F_UNLCK;
1062 if (rc != 0)
1063 cERROR(1, "Error unlocking previously locked "
1064 "range %d during test of lock", rc);
1065 return 0;
1066 }
1067
1068 if (type & LOCKING_ANDX_SHARED_LOCK) {
1069 flock->fl_type = F_WRLCK;
1070 return 0;
1071 }
1072
1073 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
1074 flock->fl_start, 0, 1,
1075 type | LOCKING_ANDX_SHARED_LOCK, 0, 0);
1076 if (rc == 0) {
1077 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid,
1078 length, flock->fl_start, 1, 0,
1079 type | LOCKING_ANDX_SHARED_LOCK,
1080 0, 0);
1081 flock->fl_type = F_RDLCK;
1082 if (rc != 0)
1083 cERROR(1, "Error unlocking previously locked "
1084 "range %d during test of lock", rc);
1085 } else
1086 flock->fl_type = F_WRLCK;
1087
1088 return 0;
1089}
1090
1091static void
1092cifs_move_llist(struct list_head *source, struct list_head *dest)
1093{
1094 struct list_head *li, *tmp;
1095 list_for_each_safe(li, tmp, source)
1096 list_move(li, dest);
1097}
753 1098
754 if (lockType & LOCKING_ANDX_SHARED_LOCK) { 1099static void
755 pfLock->fl_type = F_WRLCK; 1100cifs_free_llist(struct list_head *llist)
1101{
1102 struct cifsLockInfo *li, *tmp;
1103 list_for_each_entry_safe(li, tmp, llist, llist) {
1104 cifs_del_lock_waiters(li);
1105 list_del(&li->llist);
1106 kfree(li);
1107 }
1108}
1109
1110static int
1111cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
1112{
1113 int rc = 0, stored_rc;
1114 int types[] = {LOCKING_ANDX_LARGE_FILES,
1115 LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES};
1116 unsigned int i;
1117 unsigned int max_num, num;
1118 LOCKING_ANDX_RANGE *buf, *cur;
1119 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
1120 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
1121 struct cifsLockInfo *li, *tmp;
1122 __u64 length = 1 + flock->fl_end - flock->fl_start;
1123 struct list_head tmp_llist;
1124
1125 INIT_LIST_HEAD(&tmp_llist);
1126
1127 max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) /
1128 sizeof(LOCKING_ANDX_RANGE);
1129 buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
1130 if (!buf)
1131 return -ENOMEM;
1132
1133 mutex_lock(&cinode->lock_mutex);
1134 for (i = 0; i < 2; i++) {
1135 cur = buf;
1136 num = 0;
1137 list_for_each_entry_safe(li, tmp, &cinode->llist, llist) {
1138 if (flock->fl_start > li->offset ||
1139 (flock->fl_start + length) <
1140 (li->offset + li->length))
1141 continue;
1142 if (current->tgid != li->pid)
1143 continue;
1144 if (cfile->netfid != li->netfid)
1145 continue;
1146 if (types[i] != li->type)
1147 continue;
1148 if (!cinode->can_cache_brlcks) {
1149 cur->Pid = cpu_to_le16(li->pid);
1150 cur->LengthLow = cpu_to_le32((u32)li->length);
1151 cur->LengthHigh =
1152 cpu_to_le32((u32)(li->length>>32));
1153 cur->OffsetLow = cpu_to_le32((u32)li->offset);
1154 cur->OffsetHigh =
1155 cpu_to_le32((u32)(li->offset>>32));
1156 /*
1157 * We need to save a lock here to let us add
1158 * it again to the inode list if the unlock
1159 * range request fails on the server.
1160 */
1161 list_move(&li->llist, &tmp_llist);
1162 if (++num == max_num) {
1163 stored_rc = cifs_lockv(xid, tcon,
1164 cfile->netfid,
1165 li->type, num,
1166 0, buf);
1167 if (stored_rc) {
1168 /*
1169 * We failed on the unlock range
1170 * request - add all locks from
1171 * the tmp list to the head of
1172 * the inode list.
1173 */
1174 cifs_move_llist(&tmp_llist,
1175 &cinode->llist);
1176 rc = stored_rc;
1177 } else
1178 /*
1179 * The unlock range request
1180 * succeed - free the tmp list.
1181 */
1182 cifs_free_llist(&tmp_llist);
1183 cur = buf;
1184 num = 0;
1185 } else
1186 cur++;
756 } else { 1187 } else {
757 rc = CIFSSMBLock(xid, tcon, netfid, length, 1188 /*
758 pfLock->fl_start, 0, 1, 1189 * We can cache brlock requests - simply remove
759 lockType | LOCKING_ANDX_SHARED_LOCK, 1190 * a lock from the inode list.
760 0 /* wait flag */, 0); 1191 */
761 if (rc == 0) { 1192 list_del(&li->llist);
762 rc = CIFSSMBLock(xid, tcon, netfid, 1193 cifs_del_lock_waiters(li);
763 length, pfLock->fl_start, 1, 0, 1194 kfree(li);
764 lockType |
765 LOCKING_ANDX_SHARED_LOCK,
766 0 /* wait flag */, 0);
767 pfLock->fl_type = F_RDLCK;
768 if (rc != 0)
769 cERROR(1, "Error unlocking "
770 "previously locked range %d "
771 "during test of lock", rc);
772 rc = 0;
773 } else {
774 pfLock->fl_type = F_WRLCK;
775 rc = 0;
776 }
777 } 1195 }
778 } 1196 }
779 1197 if (num) {
780 FreeXid(xid); 1198 stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
781 return rc; 1199 types[i], num, 0, buf);
1200 if (stored_rc) {
1201 cifs_move_llist(&tmp_llist, &cinode->llist);
1202 rc = stored_rc;
1203 } else
1204 cifs_free_llist(&tmp_llist);
1205 }
782 } 1206 }
783 1207
784 if (!numLock && !numUnlock) { 1208 mutex_unlock(&cinode->lock_mutex);
785 /* if no lock or unlock then nothing 1209 kfree(buf);
786 to do since we do not know what it is */ 1210 return rc;
787 FreeXid(xid); 1211}
788 return -EOPNOTSUPP; 1212
789 } 1213static int
1214cifs_setlk(struct file *file, struct file_lock *flock, __u8 type,
1215 bool wait_flag, bool posix_lck, int lock, int unlock, int xid)
1216{
1217 int rc = 0;
1218 __u64 length = 1 + flock->fl_end - flock->fl_start;
1219 struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
1220 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
1221 struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
1222 __u16 netfid = cfile->netfid;
790 1223
791 if (posix_locking) { 1224 if (posix_lck) {
792 int posix_lock_type; 1225 int posix_lock_type;
793 if (lockType & LOCKING_ANDX_SHARED_LOCK) 1226
1227 rc = cifs_posix_lock_set(file, flock);
1228 if (!rc || rc < 0)
1229 return rc;
1230
1231 if (type & LOCKING_ANDX_SHARED_LOCK)
794 posix_lock_type = CIFS_RDLCK; 1232 posix_lock_type = CIFS_RDLCK;
795 else 1233 else
796 posix_lock_type = CIFS_WRLCK; 1234 posix_lock_type = CIFS_WRLCK;
797 1235
798 if (numUnlock == 1) 1236 if (unlock == 1)
799 posix_lock_type = CIFS_UNLCK; 1237 posix_lock_type = CIFS_UNLCK;
800 1238
801 rc = CIFSSMBPosixLock(xid, tcon, netfid, 0 /* set */, 1239 rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid,
802 length, pfLock, posix_lock_type, 1240 0 /* set */, length, flock,
803 wait_flag); 1241 posix_lock_type, wait_flag);
804 } else { 1242 goto out;
805 struct cifsFileInfo *fid = file->private_data; 1243 }
806 1244
807 if (numLock) { 1245 if (lock) {
808 rc = CIFSSMBLock(xid, tcon, netfid, length, 1246 struct cifsLockInfo *lock;
809 pfLock->fl_start, 0, numLock, lockType,
810 wait_flag, 0);
811 1247
812 if (rc == 0) { 1248 lock = cifs_lock_init(flock->fl_start, length, type, netfid);
813 /* For Windows locks we must store them. */ 1249 if (!lock)
814 rc = store_file_lock(fid, length, 1250 return -ENOMEM;
815 pfLock->fl_start, lockType);
816 }
817 } else if (numUnlock) {
818 /* For each stored lock that this unlock overlaps
819 completely, unlock it. */
820 int stored_rc = 0;
821 struct cifsLockInfo *li, *tmp;
822 1251
823 rc = 0; 1252 rc = cifs_lock_add_if(cinode, lock, wait_flag);
824 mutex_lock(&fid->lock_mutex); 1253 if (rc < 0)
825 list_for_each_entry_safe(li, tmp, &fid->llist, llist) { 1254 kfree(lock);
826 if (pfLock->fl_start <= li->offset && 1255 if (rc <= 0)
827 (pfLock->fl_start + length) >= 1256 goto out;
828 (li->offset + li->length)) { 1257
829 stored_rc = CIFSSMBLock(xid, tcon, 1258 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
830 netfid, li->length, 1259 flock->fl_start, 0, 1, type, wait_flag, 0);
831 li->offset, 1, 0, 1260 if (rc) {
832 li->type, false, 0); 1261 kfree(lock);
833 if (stored_rc) 1262 goto out;
834 rc = stored_rc;
835 else {
836 list_del(&li->llist);
837 kfree(li);
838 }
839 }
840 }
841 mutex_unlock(&fid->lock_mutex);
842 } 1263 }
1264
1265 cifs_lock_add(cinode, lock);
1266 } else if (unlock)
1267 rc = cifs_unlock_range(cfile, flock, xid);
1268
1269out:
1270 if (flock->fl_flags & FL_POSIX)
1271 posix_lock_file_wait(file, flock);
1272 return rc;
1273}
1274
1275int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
1276{
1277 int rc, xid;
1278 int lock = 0, unlock = 0;
1279 bool wait_flag = false;
1280 bool posix_lck = false;
1281 struct cifs_sb_info *cifs_sb;
1282 struct cifs_tcon *tcon;
1283 struct cifsInodeInfo *cinode;
1284 struct cifsFileInfo *cfile;
1285 __u16 netfid;
1286 __u8 type;
1287
1288 rc = -EACCES;
1289 xid = GetXid();
1290
1291 cFYI(1, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld "
1292 "end: %lld", cmd, flock->fl_flags, flock->fl_type,
1293 flock->fl_start, flock->fl_end);
1294
1295 cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag);
1296
1297 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1298 cfile = (struct cifsFileInfo *)file->private_data;
1299 tcon = tlink_tcon(cfile->tlink);
1300 netfid = cfile->netfid;
1301 cinode = CIFS_I(file->f_path.dentry->d_inode);
1302
1303 if ((tcon->ses->capabilities & CAP_UNIX) &&
1304 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
1305 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
1306 posix_lck = true;
1307 /*
1308 * BB add code here to normalize offset and length to account for
1309 * negative length which we can not accept over the wire.
1310 */
1311 if (IS_GETLK(cmd)) {
1312 rc = cifs_getlk(file, flock, type, wait_flag, posix_lck, xid);
1313 FreeXid(xid);
1314 return rc;
843 } 1315 }
844 1316
845 if (pfLock->fl_flags & FL_POSIX) 1317 if (!lock && !unlock) {
846 posix_lock_file_wait(file, pfLock); 1318 /*
1319 * if no lock or unlock then nothing to do since we do not
1320 * know what it is
1321 */
1322 FreeXid(xid);
1323 return -EOPNOTSUPP;
1324 }
1325
1326 rc = cifs_setlk(file, flock, type, wait_flag, posix_lck, lock, unlock,
1327 xid);
847 FreeXid(xid); 1328 FreeXid(xid);
848 return rc; 1329 return rc;
849} 1330}
@@ -1714,6 +2195,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
1714 struct smb_com_read_rsp *pSMBr; 2195 struct smb_com_read_rsp *pSMBr;
1715 struct cifs_io_parms io_parms; 2196 struct cifs_io_parms io_parms;
1716 char *read_data; 2197 char *read_data;
2198 unsigned int rsize;
1717 __u32 pid; 2199 __u32 pid;
1718 2200
1719 if (!nr_segs) 2201 if (!nr_segs)
@@ -1726,6 +2208,9 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
1726 xid = GetXid(); 2208 xid = GetXid();
1727 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 2209 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1728 2210
2211 /* FIXME: set up handlers for larger reads and/or convert to async */
2212 rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize);
2213
1729 open_file = file->private_data; 2214 open_file = file->private_data;
1730 pTcon = tlink_tcon(open_file->tlink); 2215 pTcon = tlink_tcon(open_file->tlink);
1731 2216
@@ -1738,7 +2223,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
1738 cFYI(1, "attempting read on write only file instance"); 2223 cFYI(1, "attempting read on write only file instance");
1739 2224
1740 for (total_read = 0; total_read < len; total_read += bytes_read) { 2225 for (total_read = 0; total_read < len; total_read += bytes_read) {
1741 cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); 2226 cur_len = min_t(const size_t, len - total_read, rsize);
1742 rc = -EAGAIN; 2227 rc = -EAGAIN;
1743 read_data = NULL; 2228 read_data = NULL;
1744 2229
@@ -1830,6 +2315,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1830 unsigned int bytes_read = 0; 2315 unsigned int bytes_read = 0;
1831 unsigned int total_read; 2316 unsigned int total_read;
1832 unsigned int current_read_size; 2317 unsigned int current_read_size;
2318 unsigned int rsize;
1833 struct cifs_sb_info *cifs_sb; 2319 struct cifs_sb_info *cifs_sb;
1834 struct cifs_tcon *pTcon; 2320 struct cifs_tcon *pTcon;
1835 int xid; 2321 int xid;
@@ -1842,6 +2328,9 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1842 xid = GetXid(); 2328 xid = GetXid();
1843 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 2329 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1844 2330
2331 /* FIXME: set up handlers for larger reads and/or convert to async */
2332 rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize);
2333
1845 if (file->private_data == NULL) { 2334 if (file->private_data == NULL) {
1846 rc = -EBADF; 2335 rc = -EBADF;
1847 FreeXid(xid); 2336 FreeXid(xid);
@@ -1861,14 +2350,14 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1861 for (total_read = 0, current_offset = read_data; 2350 for (total_read = 0, current_offset = read_data;
1862 read_size > total_read; 2351 read_size > total_read;
1863 total_read += bytes_read, current_offset += bytes_read) { 2352 total_read += bytes_read, current_offset += bytes_read) {
1864 current_read_size = min_t(const int, read_size - total_read, 2353 current_read_size = min_t(uint, read_size - total_read, rsize);
1865 cifs_sb->rsize); 2354
1866 /* For windows me and 9x we do not want to request more 2355 /* For windows me and 9x we do not want to request more
1867 than it negotiated since it will refuse the read then */ 2356 than it negotiated since it will refuse the read then */
1868 if ((pTcon->ses) && 2357 if ((pTcon->ses) &&
1869 !(pTcon->ses->capabilities & CAP_LARGE_FILES)) { 2358 !(pTcon->ses->capabilities & CAP_LARGE_FILES)) {
1870 current_read_size = min_t(const int, current_read_size, 2359 current_read_size = min_t(uint, current_read_size,
1871 pTcon->ses->server->maxBuf - 128); 2360 CIFSMaxBufSize);
1872 } 2361 }
1873 rc = -EAGAIN; 2362 rc = -EAGAIN;
1874 while (rc == -EAGAIN) { 2363 while (rc == -EAGAIN) {
@@ -1957,82 +2446,24 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1957 return rc; 2446 return rc;
1958} 2447}
1959 2448
1960
1961static void cifs_copy_cache_pages(struct address_space *mapping,
1962 struct list_head *pages, int bytes_read, char *data)
1963{
1964 struct page *page;
1965 char *target;
1966
1967 while (bytes_read > 0) {
1968 if (list_empty(pages))
1969 break;
1970
1971 page = list_entry(pages->prev, struct page, lru);
1972 list_del(&page->lru);
1973
1974 if (add_to_page_cache_lru(page, mapping, page->index,
1975 GFP_KERNEL)) {
1976 page_cache_release(page);
1977 cFYI(1, "Add page cache failed");
1978 data += PAGE_CACHE_SIZE;
1979 bytes_read -= PAGE_CACHE_SIZE;
1980 continue;
1981 }
1982 page_cache_release(page);
1983
1984 target = kmap_atomic(page, KM_USER0);
1985
1986 if (PAGE_CACHE_SIZE > bytes_read) {
1987 memcpy(target, data, bytes_read);
1988 /* zero the tail end of this partial page */
1989 memset(target + bytes_read, 0,
1990 PAGE_CACHE_SIZE - bytes_read);
1991 bytes_read = 0;
1992 } else {
1993 memcpy(target, data, PAGE_CACHE_SIZE);
1994 bytes_read -= PAGE_CACHE_SIZE;
1995 }
1996 kunmap_atomic(target, KM_USER0);
1997
1998 flush_dcache_page(page);
1999 SetPageUptodate(page);
2000 unlock_page(page);
2001 data += PAGE_CACHE_SIZE;
2002
2003 /* add page to FS-Cache */
2004 cifs_readpage_to_fscache(mapping->host, page);
2005 }
2006 return;
2007}
2008
2009static int cifs_readpages(struct file *file, struct address_space *mapping, 2449static int cifs_readpages(struct file *file, struct address_space *mapping,
2010 struct list_head *page_list, unsigned num_pages) 2450 struct list_head *page_list, unsigned num_pages)
2011{ 2451{
2012 int rc = -EACCES; 2452 int rc;
2013 int xid; 2453 struct list_head tmplist;
2014 loff_t offset; 2454 struct cifsFileInfo *open_file = file->private_data;
2015 struct page *page; 2455 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
2016 struct cifs_sb_info *cifs_sb; 2456 unsigned int rsize = cifs_sb->rsize;
2017 struct cifs_tcon *pTcon; 2457 pid_t pid;
2018 unsigned int bytes_read = 0;
2019 unsigned int read_size, i;
2020 char *smb_read_data = NULL;
2021 struct smb_com_read_rsp *pSMBr;
2022 struct cifsFileInfo *open_file;
2023 struct cifs_io_parms io_parms;
2024 int buf_type = CIFS_NO_BUFFER;
2025 __u32 pid;
2026 2458
2027 xid = GetXid(); 2459 /*
2028 if (file->private_data == NULL) { 2460 * Give up immediately if rsize is too small to read an entire page.
2029 rc = -EBADF; 2461 * The VFS will fall back to readpage. We should never reach this
2030 FreeXid(xid); 2462 * point however since we set ra_pages to 0 when the rsize is smaller
2031 return rc; 2463 * than a cache page.
2032 } 2464 */
2033 open_file = file->private_data; 2465 if (unlikely(rsize < PAGE_CACHE_SIZE))
2034 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 2466 return 0;
2035 pTcon = tlink_tcon(open_file->tlink);
2036 2467
2037 /* 2468 /*
2038 * Reads as many pages as possible from fscache. Returns -ENOBUFS 2469 * Reads as many pages as possible from fscache. Returns -ENOBUFS
@@ -2041,125 +2472,127 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2041 rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list, 2472 rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list,
2042 &num_pages); 2473 &num_pages);
2043 if (rc == 0) 2474 if (rc == 0)
2044 goto read_complete; 2475 return rc;
2045 2476
2046 cFYI(DBG2, "rpages: num pages %d", num_pages);
2047 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) 2477 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
2048 pid = open_file->pid; 2478 pid = open_file->pid;
2049 else 2479 else
2050 pid = current->tgid; 2480 pid = current->tgid;
2051 2481
2052 for (i = 0; i < num_pages; ) { 2482 rc = 0;
2053 unsigned contig_pages; 2483 INIT_LIST_HEAD(&tmplist);
2054 struct page *tmp_page;
2055 unsigned long expected_index;
2056 2484
2057 if (list_empty(page_list)) 2485 cFYI(1, "%s: file=%p mapping=%p num_pages=%u", __func__, file,
2058 break; 2486 mapping, num_pages);
2487
2488 /*
2489 * Start with the page at end of list and move it to private
2490 * list. Do the same with any following pages until we hit
2491 * the rsize limit, hit an index discontinuity, or run out of
2492 * pages. Issue the async read and then start the loop again
2493 * until the list is empty.
2494 *
2495 * Note that list order is important. The page_list is in
2496 * the order of declining indexes. When we put the pages in
2497 * the rdata->pages, then we want them in increasing order.
2498 */
2499 while (!list_empty(page_list)) {
2500 unsigned int bytes = PAGE_CACHE_SIZE;
2501 unsigned int expected_index;
2502 unsigned int nr_pages = 1;
2503 loff_t offset;
2504 struct page *page, *tpage;
2505 struct cifs_readdata *rdata;
2059 2506
2060 page = list_entry(page_list->prev, struct page, lru); 2507 page = list_entry(page_list->prev, struct page, lru);
2508
2509 /*
2510 * Lock the page and put it in the cache. Since no one else
2511 * should have access to this page, we're safe to simply set
2512 * PG_locked without checking it first.
2513 */
2514 __set_page_locked(page);
2515 rc = add_to_page_cache_locked(page, mapping,
2516 page->index, GFP_KERNEL);
2517
2518 /* give up if we can't stick it in the cache */
2519 if (rc) {
2520 __clear_page_locked(page);
2521 break;
2522 }
2523
2524 /* move first page to the tmplist */
2061 offset = (loff_t)page->index << PAGE_CACHE_SHIFT; 2525 offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
2526 list_move_tail(&page->lru, &tmplist);
2062 2527
2063 /* count adjacent pages that we will read into */ 2528 /* now try and add more pages onto the request */
2064 contig_pages = 0; 2529 expected_index = page->index + 1;
2065 expected_index = 2530 list_for_each_entry_safe_reverse(page, tpage, page_list, lru) {
2066 list_entry(page_list->prev, struct page, lru)->index; 2531 /* discontinuity ? */
2067 list_for_each_entry_reverse(tmp_page, page_list, lru) { 2532 if (page->index != expected_index)
2068 if (tmp_page->index == expected_index) {
2069 contig_pages++;
2070 expected_index++;
2071 } else
2072 break; 2533 break;
2534
2535 /* would this page push the read over the rsize? */
2536 if (bytes + PAGE_CACHE_SIZE > rsize)
2537 break;
2538
2539 __set_page_locked(page);
2540 if (add_to_page_cache_locked(page, mapping,
2541 page->index, GFP_KERNEL)) {
2542 __clear_page_locked(page);
2543 break;
2544 }
2545 list_move_tail(&page->lru, &tmplist);
2546 bytes += PAGE_CACHE_SIZE;
2547 expected_index++;
2548 nr_pages++;
2073 } 2549 }
2074 if (contig_pages + i > num_pages) 2550
2075 contig_pages = num_pages - i; 2551 rdata = cifs_readdata_alloc(nr_pages);
2076 2552 if (!rdata) {
2077 /* for reads over a certain size could initiate async 2553 /* best to give up if we're out of mem */
2078 read ahead */ 2554 list_for_each_entry_safe(page, tpage, &tmplist, lru) {
2079 2555 list_del(&page->lru);
2080 read_size = contig_pages * PAGE_CACHE_SIZE; 2556 lru_cache_add_file(page);
2081 /* Read size needs to be in multiples of one page */ 2557 unlock_page(page);
2082 read_size = min_t(const unsigned int, read_size, 2558 page_cache_release(page);
2083 cifs_sb->rsize & PAGE_CACHE_MASK); 2559 }
2084 cFYI(DBG2, "rpages: read size 0x%x contiguous pages %d", 2560 rc = -ENOMEM;
2085 read_size, contig_pages); 2561 break;
2086 rc = -EAGAIN; 2562 }
2087 while (rc == -EAGAIN) { 2563
2564 spin_lock(&cifs_file_list_lock);
2565 cifsFileInfo_get(open_file);
2566 spin_unlock(&cifs_file_list_lock);
2567 rdata->cfile = open_file;
2568 rdata->mapping = mapping;
2569 rdata->offset = offset;
2570 rdata->bytes = bytes;
2571 rdata->pid = pid;
2572 list_splice_init(&tmplist, &rdata->pages);
2573
2574 do {
2088 if (open_file->invalidHandle) { 2575 if (open_file->invalidHandle) {
2089 rc = cifs_reopen_file(open_file, true); 2576 rc = cifs_reopen_file(open_file, true);
2090 if (rc != 0) 2577 if (rc != 0)
2091 break; 2578 continue;
2092 } 2579 }
2093 io_parms.netfid = open_file->netfid; 2580 rc = cifs_async_readv(rdata);
2094 io_parms.pid = pid; 2581 } while (rc == -EAGAIN);
2095 io_parms.tcon = pTcon;
2096 io_parms.offset = offset;
2097 io_parms.length = read_size;
2098 rc = CIFSSMBRead(xid, &io_parms, &bytes_read,
2099 &smb_read_data, &buf_type);
2100 /* BB more RC checks ? */
2101 if (rc == -EAGAIN) {
2102 if (smb_read_data) {
2103 if (buf_type == CIFS_SMALL_BUFFER)
2104 cifs_small_buf_release(smb_read_data);
2105 else if (buf_type == CIFS_LARGE_BUFFER)
2106 cifs_buf_release(smb_read_data);
2107 smb_read_data = NULL;
2108 }
2109 }
2110 }
2111 if ((rc < 0) || (smb_read_data == NULL)) {
2112 cFYI(1, "Read error in readpages: %d", rc);
2113 break;
2114 } else if (bytes_read > 0) {
2115 task_io_account_read(bytes_read);
2116 pSMBr = (struct smb_com_read_rsp *)smb_read_data;
2117 cifs_copy_cache_pages(mapping, page_list, bytes_read,
2118 smb_read_data + 4 /* RFC1001 hdr */ +
2119 le16_to_cpu(pSMBr->DataOffset));
2120
2121 i += bytes_read >> PAGE_CACHE_SHIFT;
2122 cifs_stats_bytes_read(pTcon, bytes_read);
2123 if ((bytes_read & PAGE_CACHE_MASK) != bytes_read) {
2124 i++; /* account for partial page */
2125
2126 /* server copy of file can have smaller size
2127 than client */
2128 /* BB do we need to verify this common case ?
2129 this case is ok - if we are at server EOF
2130 we will hit it on next read */
2131 2582
2132 /* break; */ 2583 if (rc != 0) {
2584 list_for_each_entry_safe(page, tpage, &rdata->pages,
2585 lru) {
2586 list_del(&page->lru);
2587 lru_cache_add_file(page);
2588 unlock_page(page);
2589 page_cache_release(page);
2133 } 2590 }
2134 } else { 2591 cifs_readdata_free(rdata);
2135 cFYI(1, "No bytes read (%d) at offset %lld . "
2136 "Cleaning remaining pages from readahead list",
2137 bytes_read, offset);
2138 /* BB turn off caching and do new lookup on
2139 file size at server? */
2140 break; 2592 break;
2141 } 2593 }
2142 if (smb_read_data) {
2143 if (buf_type == CIFS_SMALL_BUFFER)
2144 cifs_small_buf_release(smb_read_data);
2145 else if (buf_type == CIFS_LARGE_BUFFER)
2146 cifs_buf_release(smb_read_data);
2147 smb_read_data = NULL;
2148 }
2149 bytes_read = 0;
2150 } 2594 }
2151 2595
2152/* need to free smb_read_data buf before exit */
2153 if (smb_read_data) {
2154 if (buf_type == CIFS_SMALL_BUFFER)
2155 cifs_small_buf_release(smb_read_data);
2156 else if (buf_type == CIFS_LARGE_BUFFER)
2157 cifs_buf_release(smb_read_data);
2158 smb_read_data = NULL;
2159 }
2160
2161read_complete:
2162 FreeXid(xid);
2163 return rc; 2596 return rc;
2164} 2597}
2165 2598
@@ -2408,6 +2841,10 @@ void cifs_oplock_break(struct work_struct *work)
2408 cFYI(1, "Oplock flush inode %p rc %d", inode, rc); 2841 cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
2409 } 2842 }
2410 2843
2844 rc = cifs_push_locks(cfile);
2845 if (rc)
2846 cERROR(1, "Push locks rc = %d", rc);
2847
2411 /* 2848 /*
2412 * releasing stale oplock after recent reconnect of smb session using 2849 * releasing stale oplock after recent reconnect of smb session using
2413 * a now incorrect file handle is not a data integrity issue but do 2850 * a now incorrect file handle is not a data integrity issue but do
@@ -2415,8 +2852,9 @@ void cifs_oplock_break(struct work_struct *work)
2415 * disconnected since oplock already released by the server 2852 * disconnected since oplock already released by the server
2416 */ 2853 */
2417 if (!cfile->oplock_break_cancelled) { 2854 if (!cfile->oplock_break_cancelled) {
2418 rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0, 2855 rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid,
2419 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false, 2856 current->tgid, 0, 0, 0, 0,
2857 LOCKING_ANDX_OPLOCK_RELEASE, false,
2420 cinode->clientCanCacheRead ? 1 : 0); 2858 cinode->clientCanCacheRead ? 1 : 0);
2421 cFYI(1, "Oplock release rc = %d", rc); 2859 cFYI(1, "Oplock release rc = %d", rc);
2422 } 2860 }
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a7b2dcd4a53..e851d5b8931 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -132,7 +132,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
132 inode->i_mtime = fattr->cf_mtime; 132 inode->i_mtime = fattr->cf_mtime;
133 inode->i_ctime = fattr->cf_ctime; 133 inode->i_ctime = fattr->cf_ctime;
134 inode->i_rdev = fattr->cf_rdev; 134 inode->i_rdev = fattr->cf_rdev;
135 inode->i_nlink = fattr->cf_nlink; 135 set_nlink(inode, fattr->cf_nlink);
136 inode->i_uid = fattr->cf_uid; 136 inode->i_uid = fattr->cf_uid;
137 inode->i_gid = fattr->cf_gid; 137 inode->i_gid = fattr->cf_gid;
138 138
@@ -562,7 +562,16 @@ int cifs_get_file_info(struct file *filp)
562 562
563 xid = GetXid(); 563 xid = GetXid();
564 rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data); 564 rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
565 if (rc == -EOPNOTSUPP || rc == -EINVAL) { 565 switch (rc) {
566 case 0:
567 cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
568 break;
569 case -EREMOTE:
570 cifs_create_dfs_fattr(&fattr, inode->i_sb);
571 rc = 0;
572 break;
573 case -EOPNOTSUPP:
574 case -EINVAL:
566 /* 575 /*
567 * FIXME: legacy server -- fall back to path-based call? 576 * FIXME: legacy server -- fall back to path-based call?
568 * for now, just skip revalidating and mark inode for 577 * for now, just skip revalidating and mark inode for
@@ -570,18 +579,14 @@ int cifs_get_file_info(struct file *filp)
570 */ 579 */
571 rc = 0; 580 rc = 0;
572 CIFS_I(inode)->time = 0; 581 CIFS_I(inode)->time = 0;
582 default:
573 goto cgfi_exit; 583 goto cgfi_exit;
574 } else if (rc == -EREMOTE) { 584 }
575 cifs_create_dfs_fattr(&fattr, inode->i_sb);
576 rc = 0;
577 } else if (rc)
578 goto cgfi_exit;
579 585
580 /* 586 /*
581 * don't bother with SFU junk here -- just mark inode as needing 587 * don't bother with SFU junk here -- just mark inode as needing
582 * revalidation. 588 * revalidation.
583 */ 589 */
584 cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
585 fattr.cf_uniqueid = CIFS_I(inode)->uniqueid; 590 fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
586 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; 591 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
587 cifs_fattr_to_inode(inode, &fattr); 592 cifs_fattr_to_inode(inode, &fattr);
@@ -900,7 +905,7 @@ struct inode *cifs_root_iget(struct super_block *sb)
900 if (rc && tcon->ipc) { 905 if (rc && tcon->ipc) {
901 cFYI(1, "ipc connection - fake read inode"); 906 cFYI(1, "ipc connection - fake read inode");
902 inode->i_mode |= S_IFDIR; 907 inode->i_mode |= S_IFDIR;
903 inode->i_nlink = 2; 908 set_nlink(inode, 2);
904 inode->i_op = &cifs_ipc_inode_ops; 909 inode->i_op = &cifs_ipc_inode_ops;
905 inode->i_fop = &simple_dir_operations; 910 inode->i_fop = &simple_dir_operations;
906 inode->i_uid = cifs_sb->mnt_uid; 911 inode->i_uid = cifs_sb->mnt_uid;
@@ -1362,7 +1367,7 @@ mkdir_get_info:
1362 /* setting nlink not necessary except in cases where we 1367 /* setting nlink not necessary except in cases where we
1363 * failed to get it from the server or was set bogus */ 1368 * failed to get it from the server or was set bogus */
1364 if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) 1369 if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2))
1365 direntry->d_inode->i_nlink = 2; 1370 set_nlink(direntry->d_inode, 2);
1366 1371
1367 mode &= ~current_umask(); 1372 mode &= ~current_umask();
1368 /* must turn on setgid bit if parent dir has it */ 1373 /* must turn on setgid bit if parent dir has it */
@@ -2096,6 +2101,8 @@ static int
2096cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) 2101cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
2097{ 2102{
2098 int xid; 2103 int xid;
2104 uid_t uid = NO_CHANGE_32;
2105 gid_t gid = NO_CHANGE_32;
2099 struct inode *inode = direntry->d_inode; 2106 struct inode *inode = direntry->d_inode;
2100 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 2107 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
2101 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 2108 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
@@ -2146,13 +2153,25 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
2146 goto cifs_setattr_exit; 2153 goto cifs_setattr_exit;
2147 } 2154 }
2148 2155
2149 /* 2156 if (attrs->ia_valid & ATTR_UID)
2150 * Without unix extensions we can't send ownership changes to the 2157 uid = attrs->ia_uid;
2151 * server, so silently ignore them. This is consistent with how 2158
2152 * local DOS/Windows filesystems behave (VFAT, NTFS, etc). With 2159 if (attrs->ia_valid & ATTR_GID)
2153 * CIFSACL support + proper Windows to Unix idmapping, we may be 2160 gid = attrs->ia_gid;
2154 * able to support this in the future. 2161
2155 */ 2162#ifdef CONFIG_CIFS_ACL
2163 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
2164 if (uid != NO_CHANGE_32 || gid != NO_CHANGE_32) {
2165 rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64,
2166 uid, gid);
2167 if (rc) {
2168 cFYI(1, "%s: Setting id failed with error: %d",
2169 __func__, rc);
2170 goto cifs_setattr_exit;
2171 }
2172 }
2173 } else
2174#endif /* CONFIG_CIFS_ACL */
2156 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) 2175 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID))
2157 attrs->ia_valid &= ~(ATTR_UID | ATTR_GID); 2176 attrs->ia_valid &= ~(ATTR_UID | ATTR_GID);
2158 2177
@@ -2161,15 +2180,12 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
2161 attrs->ia_valid &= ~ATTR_MODE; 2180 attrs->ia_valid &= ~ATTR_MODE;
2162 2181
2163 if (attrs->ia_valid & ATTR_MODE) { 2182 if (attrs->ia_valid & ATTR_MODE) {
2164 cFYI(1, "Mode changed to 0%o", attrs->ia_mode);
2165 mode = attrs->ia_mode; 2183 mode = attrs->ia_mode;
2166 }
2167
2168 if (attrs->ia_valid & ATTR_MODE) {
2169 rc = 0; 2184 rc = 0;
2170#ifdef CONFIG_CIFS_ACL 2185#ifdef CONFIG_CIFS_ACL
2171 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { 2186 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
2172 rc = mode_to_cifs_acl(inode, full_path, mode); 2187 rc = id_mode_to_cifs_acl(inode, full_path, mode,
2188 NO_CHANGE_32, NO_CHANGE_32);
2173 if (rc) { 2189 if (rc) {
2174 cFYI(1, "%s: Setting ACL failed with error: %d", 2190 cFYI(1, "%s: Setting ACL failed with error: %d",
2175 __func__, rc); 2191 __func__, rc);
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index db3f18cdf02..6b0e0643439 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -183,14 +183,20 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
183static int 183static int
184CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon, 184CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon,
185 const char *fromName, const char *toName, 185 const char *fromName, const char *toName,
186 const struct nls_table *nls_codepage, int remap) 186 struct cifs_sb_info *cifs_sb)
187{ 187{
188 int rc; 188 int rc;
189 int oplock = 0; 189 int oplock = 0;
190 int remap;
191 int create_options = CREATE_NOT_DIR;
190 __u16 netfid = 0; 192 __u16 netfid = 0;
191 u8 *buf; 193 u8 *buf;
192 unsigned int bytes_written = 0; 194 unsigned int bytes_written = 0;
193 struct cifs_io_parms io_parms; 195 struct cifs_io_parms io_parms;
196 struct nls_table *nls_codepage;
197
198 nls_codepage = cifs_sb->local_nls;
199 remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
194 200
195 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); 201 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
196 if (!buf) 202 if (!buf)
@@ -202,8 +208,11 @@ CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon,
202 return rc; 208 return rc;
203 } 209 }
204 210
211 if (backup_cred(cifs_sb))
212 create_options |= CREATE_OPEN_BACKUP_INTENT;
213
205 rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE, 214 rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE,
206 CREATE_NOT_DIR, &netfid, &oplock, NULL, 215 create_options, &netfid, &oplock, NULL,
207 nls_codepage, remap); 216 nls_codepage, remap);
208 if (rc != 0) { 217 if (rc != 0) {
209 kfree(buf); 218 kfree(buf);
@@ -424,7 +433,7 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
424 if (old_file->d_inode) { 433 if (old_file->d_inode) {
425 cifsInode = CIFS_I(old_file->d_inode); 434 cifsInode = CIFS_I(old_file->d_inode);
426 if (rc == 0) { 435 if (rc == 0) {
427 old_file->d_inode->i_nlink++; 436 inc_nlink(old_file->d_inode);
428/* BB should we make this contingent on superblock flag NOATIME? */ 437/* BB should we make this contingent on superblock flag NOATIME? */
429/* old_file->d_inode->i_ctime = CURRENT_TIME;*/ 438/* old_file->d_inode->i_ctime = CURRENT_TIME;*/
430 /* parent dir timestamps will update from srv 439 /* parent dir timestamps will update from srv
@@ -559,9 +568,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
559 /* BB what if DFS and this volume is on different share? BB */ 568 /* BB what if DFS and this volume is on different share? BB */
560 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) 569 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
561 rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname, 570 rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname,
562 cifs_sb->local_nls, 571 cifs_sb);
563 cifs_sb->mnt_cifs_flags &
564 CIFS_MOUNT_MAP_SPECIAL_CHR);
565 else if (pTcon->unix_ext) 572 else if (pTcon->unix_ext)
566 rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, 573 rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
567 cifs_sb->local_nls); 574 cifs_sb->local_nls);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 7c169339259..703ef5c6fdb 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -420,19 +420,22 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid)
420} 420}
421 421
422int 422int
423checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) 423checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int total_read)
424{ 424{
425 __u32 len = be32_to_cpu(smb->smb_buf_length); 425 __u32 rfclen = be32_to_cpu(smb->smb_buf_length);
426 __u32 clc_len; /* calculated length */ 426 __u32 clc_len; /* calculated length */
427 cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len); 427 cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x",
428 total_read, rfclen);
428 429
429 if (length < 2 + sizeof(struct smb_hdr)) { 430 /* is this frame too small to even get to a BCC? */
430 if ((length >= sizeof(struct smb_hdr) - 1) 431 if (total_read < 2 + sizeof(struct smb_hdr)) {
432 if ((total_read >= sizeof(struct smb_hdr) - 1)
431 && (smb->Status.CifsError != 0)) { 433 && (smb->Status.CifsError != 0)) {
434 /* it's an error return */
432 smb->WordCount = 0; 435 smb->WordCount = 0;
433 /* some error cases do not return wct and bcc */ 436 /* some error cases do not return wct and bcc */
434 return 0; 437 return 0;
435 } else if ((length == sizeof(struct smb_hdr) + 1) && 438 } else if ((total_read == sizeof(struct smb_hdr) + 1) &&
436 (smb->WordCount == 0)) { 439 (smb->WordCount == 0)) {
437 char *tmp = (char *)smb; 440 char *tmp = (char *)smb;
438 /* Need to work around a bug in two servers here */ 441 /* Need to work around a bug in two servers here */
@@ -452,39 +455,35 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
452 } else { 455 } else {
453 cERROR(1, "Length less than smb header size"); 456 cERROR(1, "Length less than smb header size");
454 } 457 }
455 return 1; 458 return -EIO;
456 }
457 if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
458 cERROR(1, "smb length greater than MaxBufSize, mid=%d",
459 smb->Mid);
460 return 1;
461 } 459 }
462 460
461 /* otherwise, there is enough to get to the BCC */
463 if (check_smb_hdr(smb, mid)) 462 if (check_smb_hdr(smb, mid))
464 return 1; 463 return -EIO;
465 clc_len = smbCalcSize(smb); 464 clc_len = smbCalcSize(smb);
466 465
467 if (4 + len != length) { 466 if (4 + rfclen != total_read) {
468 cERROR(1, "Length read does not match RFC1001 length %d", 467 cERROR(1, "Length read does not match RFC1001 length %d",
469 len); 468 rfclen);
470 return 1; 469 return -EIO;
471 } 470 }
472 471
473 if (4 + len != clc_len) { 472 if (4 + rfclen != clc_len) {
474 /* check if bcc wrapped around for large read responses */ 473 /* check if bcc wrapped around for large read responses */
475 if ((len > 64 * 1024) && (len > clc_len)) { 474 if ((rfclen > 64 * 1024) && (rfclen > clc_len)) {
476 /* check if lengths match mod 64K */ 475 /* check if lengths match mod 64K */
477 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) 476 if (((4 + rfclen) & 0xFFFF) == (clc_len & 0xFFFF))
478 return 0; /* bcc wrapped */ 477 return 0; /* bcc wrapped */
479 } 478 }
480 cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u", 479 cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u",
481 clc_len, 4 + len, smb->Mid); 480 clc_len, 4 + rfclen, smb->Mid);
482 481
483 if (4 + len < clc_len) { 482 if (4 + rfclen < clc_len) {
484 cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u", 483 cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u",
485 len, smb->Mid); 484 rfclen, smb->Mid);
486 return 1; 485 return -EIO;
487 } else if (len > clc_len + 512) { 486 } else if (rfclen > clc_len + 512) {
488 /* 487 /*
489 * Some servers (Windows XP in particular) send more 488 * Some servers (Windows XP in particular) send more
490 * data than the lengths in the SMB packet would 489 * data than the lengths in the SMB packet would
@@ -495,8 +494,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
495 * data to 512 bytes. 494 * data to 512 bytes.
496 */ 495 */
497 cERROR(1, "RFC1001 size %u more than 512 bytes larger " 496 cERROR(1, "RFC1001 size %u more than 512 bytes larger "
498 "than SMB for mid=%u", len, smb->Mid); 497 "than SMB for mid=%u", rfclen, smb->Mid);
499 return 1; 498 return -EIO;
500 } 499 }
501 } 500 }
502 return 0; 501 return 0;
@@ -676,3 +675,18 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
676 cinode->clientCanCacheRead = false; 675 cinode->clientCanCacheRead = false;
677 } 676 }
678} 677}
678
679bool
680backup_cred(struct cifs_sb_info *cifs_sb)
681{
682 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) {
683 if (cifs_sb->mnt_backupuid == current_fsuid())
684 return true;
685 }
686 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) {
687 if (in_group_p(cifs_sb->mnt_backupgid))
688 return true;
689 }
690
691 return false;
692}
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index d3e619692ee..4ec3ee9d72c 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -124,7 +124,9 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
124 /* that we use in next few lines */ 124 /* that we use in next few lines */
125 /* Note that header is initialized to zero in header_assemble */ 125 /* Note that header is initialized to zero in header_assemble */
126 pSMB->req.AndXCommand = 0xFF; 126 pSMB->req.AndXCommand = 0xFF;
127 pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); 127 pSMB->req.MaxBufferSize = cpu_to_le16(min_t(u32,
128 CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4,
129 USHRT_MAX));
128 pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); 130 pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
129 pSMB->req.VcNumber = get_next_vcnum(ses); 131 pSMB->req.VcNumber = get_next_vcnum(ses);
130 132
@@ -681,7 +683,7 @@ ssetup_ntlmssp_authenticate:
681 cpu_to_le16(CIFS_AUTH_RESP_SIZE); 683 cpu_to_le16(CIFS_AUTH_RESP_SIZE);
682 684
683 /* calculate ntlm response and session key */ 685 /* calculate ntlm response and session key */
684 rc = setup_ntlm_response(ses); 686 rc = setup_ntlm_response(ses, nls_cp);
685 if (rc) { 687 if (rc) {
686 cERROR(1, "Error %d during NTLM authentication", rc); 688 cERROR(1, "Error %d during NTLM authentication", rc);
687 goto ssetup_exit; 689 goto ssetup_exit;
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 42b9fff4875..7cacba12b8f 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -199,160 +199,36 @@ SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24)
199 return rc; 199 return rc;
200} 200}
201 201
202/* Routines for Windows NT MD4 Hash functions. */
203static int
204_my_wcslen(__u16 *str)
205{
206 int len = 0;
207 while (*str++ != 0)
208 len++;
209 return len;
210}
211
212/*
213 * Convert a string into an NT UNICODE string.
214 * Note that regardless of processor type
215 * this must be in intel (little-endian)
216 * format.
217 */
218
219static int
220_my_mbstowcs(__u16 *dst, const unsigned char *src, int len)
221{ /* BB not a very good conversion routine - change/fix */
222 int i;
223 __u16 val;
224
225 for (i = 0; i < len; i++) {
226 val = *src;
227 SSVAL(dst, 0, val);
228 dst++;
229 src++;
230 if (val == 0)
231 break;
232 }
233 return i;
234}
235
236/* 202/*
237 * Creates the MD4 Hash of the users password in NT UNICODE. 203 * Creates the MD4 Hash of the users password in NT UNICODE.
238 */ 204 */
239 205
240int 206int
241E_md4hash(const unsigned char *passwd, unsigned char *p16) 207E_md4hash(const unsigned char *passwd, unsigned char *p16,
208 const struct nls_table *codepage)
242{ 209{
243 int rc; 210 int rc;
244 int len; 211 int len;
245 __u16 wpwd[129]; 212 __u16 wpwd[129];
246 213
247 /* Password cannot be longer than 128 characters */ 214 /* Password cannot be longer than 128 characters */
248 if (passwd) { 215 if (passwd) /* Password must be converted to NT unicode */
249 len = strlen((char *) passwd); 216 len = cifs_strtoUCS(wpwd, passwd, 128, codepage);
250 if (len > 128) 217 else {
251 len = 128;
252
253 /* Password must be converted to NT unicode */
254 _my_mbstowcs(wpwd, passwd, len);
255 } else
256 len = 0; 218 len = 0;
219 *wpwd = 0; /* Ensure string is null terminated */
220 }
257 221
258 wpwd[len] = 0; /* Ensure string is null terminated */ 222 rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__u16));
259 /* Calculate length in bytes */ 223 memset(wpwd, 0, 129 * sizeof(__u16));
260 len = _my_wcslen(wpwd) * sizeof(__u16);
261
262 rc = mdfour(p16, (unsigned char *) wpwd, len);
263 memset(wpwd, 0, 129 * 2);
264 224
265 return rc; 225 return rc;
266} 226}
267 227
268#if 0 /* currently unused */
269/* Does both the NT and LM owfs of a user's password */
270static void
271nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16])
272{
273 char passwd[514];
274
275 memset(passwd, '\0', 514);
276 if (strlen(pwd) < 513)
277 strcpy(passwd, pwd);
278 else
279 memcpy(passwd, pwd, 512);
280 /* Calculate the MD4 hash (NT compatible) of the password */
281 memset(nt_p16, '\0', 16);
282 E_md4hash(passwd, nt_p16);
283
284 /* Mangle the passwords into Lanman format */
285 passwd[14] = '\0';
286/* strupper(passwd); */
287
288 /* Calculate the SMB (lanman) hash functions of the password */
289
290 memset(p16, '\0', 16);
291 E_P16((unsigned char *) passwd, (unsigned char *) p16);
292
293 /* clear out local copy of user's password (just being paranoid). */
294 memset(passwd, '\0', sizeof(passwd));
295}
296#endif
297
298/* Does the NTLMv2 owfs of a user's password */
299#if 0 /* function not needed yet - but will be soon */
300static void
301ntv2_owf_gen(const unsigned char owf[16], const char *user_n,
302 const char *domain_n, unsigned char kr_buf[16],
303 const struct nls_table *nls_codepage)
304{
305 wchar_t *user_u;
306 wchar_t *dom_u;
307 int user_l, domain_l;
308 struct HMACMD5Context ctx;
309
310 /* might as well do one alloc to hold both (user_u and dom_u) */
311 user_u = kmalloc(2048 * sizeof(wchar_t), GFP_KERNEL);
312 if (user_u == NULL)
313 return;
314 dom_u = user_u + 1024;
315
316 /* push_ucs2(NULL, user_u, user_n, (user_l+1)*2,
317 STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER);
318 push_ucs2(NULL, dom_u, domain_n, (domain_l+1)*2,
319 STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); */
320
321 /* BB user and domain may need to be uppercased */
322 user_l = cifs_strtoUCS(user_u, user_n, 511, nls_codepage);
323 domain_l = cifs_strtoUCS(dom_u, domain_n, 511, nls_codepage);
324
325 user_l++; /* trailing null */
326 domain_l++;
327
328 hmac_md5_init_limK_to_64(owf, 16, &ctx);
329 hmac_md5_update((const unsigned char *) user_u, user_l * 2, &ctx);
330 hmac_md5_update((const unsigned char *) dom_u, domain_l * 2, &ctx);
331 hmac_md5_final(kr_buf, &ctx);
332
333 kfree(user_u);
334}
335#endif
336
337/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */
338#if 0 /* currently unused */
339static void
340NTLMSSPOWFencrypt(unsigned char passwd[8],
341 unsigned char *ntlmchalresp, unsigned char p24[24])
342{
343 unsigned char p21[21];
344
345 memset(p21, '\0', 21);
346 memcpy(p21, passwd, 8);
347 memset(p21 + 8, 0xbd, 8);
348
349 E_P24(p21, ntlmchalresp, p24);
350}
351#endif
352
353/* Does the NT MD4 hash then des encryption. */ 228/* Does the NT MD4 hash then des encryption. */
354int 229int
355SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) 230SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24,
231 const struct nls_table *codepage)
356{ 232{
357 int rc; 233 int rc;
358 unsigned char p16[16], p21[21]; 234 unsigned char p16[16], p21[21];
@@ -360,7 +236,7 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
360 memset(p16, '\0', 16); 236 memset(p16, '\0', 16);
361 memset(p21, '\0', 21); 237 memset(p21, '\0', 21);
362 238
363 rc = E_md4hash(passwd, p16); 239 rc = E_md4hash(passwd, p16, codepage);
364 if (rc) { 240 if (rc) {
365 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); 241 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
366 return rc; 242 return rc;
@@ -369,39 +245,3 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
369 rc = E_P24(p21, c8, p24); 245 rc = E_P24(p21, c8, p24);
370 return rc; 246 return rc;
371} 247}
372
373
374/* Does the md5 encryption from the NT hash for NTLMv2. */
375/* These routines will be needed later */
376#if 0
377static void
378SMBOWFencrypt_ntv2(const unsigned char kr[16],
379 const struct data_blob *srv_chal,
380 const struct data_blob *cli_chal, unsigned char resp_buf[16])
381{
382 struct HMACMD5Context ctx;
383
384 hmac_md5_init_limK_to_64(kr, 16, &ctx);
385 hmac_md5_update(srv_chal->data, srv_chal->length, &ctx);
386 hmac_md5_update(cli_chal->data, cli_chal->length, &ctx);
387 hmac_md5_final(resp_buf, &ctx);
388}
389
390static void
391SMBsesskeygen_ntv2(const unsigned char kr[16],
392 const unsigned char *nt_resp, __u8 sess_key[16])
393{
394 struct HMACMD5Context ctx;
395
396 hmac_md5_init_limK_to_64(kr, 16, &ctx);
397 hmac_md5_update(nt_resp, 16, &ctx);
398 hmac_md5_final((unsigned char *) sess_key, &ctx);
399}
400
401static void
402SMBsesskeygen_ntv1(const unsigned char kr[16],
403 const unsigned char *nt_resp, __u8 sess_key[16])
404{
405 mdfour((unsigned char *) sess_key, (unsigned char *) kr, 16);
406}
407#endif
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 10ca6b2c26b..0cc9584f588 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -26,6 +26,7 @@
26#include <linux/wait.h> 26#include <linux/wait.h>
27#include <linux/net.h> 27#include <linux/net.h>
28#include <linux/delay.h> 28#include <linux/delay.h>
29#include <linux/freezer.h>
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30#include <asm/processor.h> 31#include <asm/processor.h>
31#include <linux/mempool.h> 32#include <linux/mempool.h>
@@ -324,7 +325,7 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
324{ 325{
325 int error; 326 int error;
326 327
327 error = wait_event_killable(server->response_q, 328 error = wait_event_freezekillable(server->response_q,
328 midQ->midState != MID_REQUEST_SUBMITTED); 329 midQ->midState != MID_REQUEST_SUBMITTED);
329 if (error < 0) 330 if (error < 0)
330 return -ERESTARTSYS; 331 return -ERESTARTSYS;
@@ -339,8 +340,8 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
339 */ 340 */
340int 341int
341cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, 342cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
342 unsigned int nvec, mid_callback_t *callback, void *cbdata, 343 unsigned int nvec, mid_receive_t *receive,
343 bool ignore_pend) 344 mid_callback_t *callback, void *cbdata, bool ignore_pend)
344{ 345{
345 int rc; 346 int rc;
346 struct mid_q_entry *mid; 347 struct mid_q_entry *mid;
@@ -374,6 +375,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
374 goto out_err; 375 goto out_err;
375 } 376 }
376 377
378 mid->receive = receive;
377 mid->callback = callback; 379 mid->callback = callback;
378 mid->callback_data = cbdata; 380 mid->callback_data = cbdata;
379 mid->midState = MID_REQUEST_SUBMITTED; 381 mid->midState = MID_REQUEST_SUBMITTED;
@@ -496,13 +498,18 @@ int
496cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, 498cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
497 bool log_error) 499 bool log_error)
498{ 500{
499 dump_smb(mid->resp_buf, 501 unsigned int len = be32_to_cpu(mid->resp_buf->smb_buf_length) + 4;
500 min_t(u32, 92, be32_to_cpu(mid->resp_buf->smb_buf_length))); 502
503 dump_smb(mid->resp_buf, min_t(u32, 92, len));
501 504
502 /* convert the length into a more usable form */ 505 /* convert the length into a more usable form */
503 if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { 506 if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
507 struct kvec iov;
508
509 iov.iov_base = mid->resp_buf;
510 iov.iov_len = len;
504 /* FIXME: add code to kill session */ 511 /* FIXME: add code to kill session */
505 if (cifs_verify_signature(mid->resp_buf, server, 512 if (cifs_verify_signature(&iov, 1, server,
506 mid->sequence_number + 1) != 0) 513 mid->sequence_number + 1) != 0)
507 cERROR(1, "Unexpected SMB signature"); 514 cERROR(1, "Unexpected SMB signature");
508 } 515 }
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index c3230888214..45f07c46f3e 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -173,7 +173,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
173#ifdef CONFIG_CIFS_ACL 173#ifdef CONFIG_CIFS_ACL
174 memcpy(pacl, ea_value, value_size); 174 memcpy(pacl, ea_value, value_size);
175 rc = set_cifs_acl(pacl, value_size, 175 rc = set_cifs_acl(pacl, value_size,
176 direntry->d_inode, full_path); 176 direntry->d_inode, full_path, CIFS_ACL_DACL);
177 if (rc == 0) /* force revalidate of the inode */ 177 if (rc == 0) /* force revalidate of the inode */
178 CIFS_I(direntry->d_inode)->time = 0; 178 CIFS_I(direntry->d_inode)->time = 0;
179 kfree(pacl); 179 kfree(pacl);
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index 2bdbcc11b37..854ace71268 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -104,7 +104,7 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
104 if (attr->va_gid != -1) 104 if (attr->va_gid != -1)
105 inode->i_gid = (gid_t) attr->va_gid; 105 inode->i_gid = (gid_t) attr->va_gid;
106 if (attr->va_nlink != -1) 106 if (attr->va_nlink != -1)
107 inode->i_nlink = attr->va_nlink; 107 set_nlink(inode, attr->va_nlink);
108 if (attr->va_size != -1) 108 if (attr->va_size != -1)
109 inode->i_size = attr->va_size; 109 inode->i_size = attr->va_size;
110 if (attr->va_size != -1) 110 if (attr->va_size != -1)
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 0239433f50c..28e7e135cfa 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -340,7 +340,7 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
340 if (!error) { 340 if (!error) {
341 /* VFS may delete the child */ 341 /* VFS may delete the child */
342 if (de->d_inode) 342 if (de->d_inode)
343 de->d_inode->i_nlink = 0; 343 clear_nlink(de->d_inode);
344 344
345 /* fix the link count of the parent */ 345 /* fix the link count of the parent */
346 coda_dir_drop_nlink(dir); 346 coda_dir_drop_nlink(dir);
diff --git a/fs/compat.c b/fs/compat.c
index 58b1da45989..c98787536bb 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -37,7 +37,6 @@
37#include <linux/dirent.h> 37#include <linux/dirent.h>
38#include <linux/fsnotify.h> 38#include <linux/fsnotify.h>
39#include <linux/highuid.h> 39#include <linux/highuid.h>
40#include <linux/nfsd/syscall.h>
41#include <linux/personality.h> 40#include <linux/personality.h>
42#include <linux/rwsem.h> 41#include <linux/rwsem.h>
43#include <linux/tsacct_kern.h> 42#include <linux/tsacct_kern.h>
@@ -247,11 +246,8 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
247 __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || 246 __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
248 __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || 247 __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
249 __put_user(kbuf->f_frsize, &ubuf->f_frsize) || 248 __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
250 __put_user(0, &ubuf->f_spare[0]) || 249 __put_user(kbuf->f_flags, &ubuf->f_flags) ||
251 __put_user(0, &ubuf->f_spare[1]) || 250 __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
252 __put_user(0, &ubuf->f_spare[2]) ||
253 __put_user(0, &ubuf->f_spare[3]) ||
254 __put_user(0, &ubuf->f_spare[4]))
255 return -EFAULT; 251 return -EFAULT;
256 return 0; 252 return 0;
257} 253}
@@ -550,7 +546,7 @@ out:
550ssize_t compat_rw_copy_check_uvector(int type, 546ssize_t compat_rw_copy_check_uvector(int type,
551 const struct compat_iovec __user *uvector, unsigned long nr_segs, 547 const struct compat_iovec __user *uvector, unsigned long nr_segs,
552 unsigned long fast_segs, struct iovec *fast_pointer, 548 unsigned long fast_segs, struct iovec *fast_pointer,
553 struct iovec **ret_pointer) 549 struct iovec **ret_pointer, int check_access)
554{ 550{
555 compat_ssize_t tot_len; 551 compat_ssize_t tot_len;
556 struct iovec *iov = *ret_pointer = fast_pointer; 552 struct iovec *iov = *ret_pointer = fast_pointer;
@@ -597,7 +593,8 @@ ssize_t compat_rw_copy_check_uvector(int type,
597 } 593 }
598 if (len < 0) /* size_t not fitting in compat_ssize_t .. */ 594 if (len < 0) /* size_t not fitting in compat_ssize_t .. */
599 goto out; 595 goto out;
600 if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) { 596 if (check_access &&
597 !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
601 ret = -EFAULT; 598 ret = -EFAULT;
602 goto out; 599 goto out;
603 } 600 }
@@ -1111,7 +1108,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
1111 goto out; 1108 goto out;
1112 1109
1113 tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs, 1110 tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
1114 UIO_FASTIOV, iovstack, &iov); 1111 UIO_FASTIOV, iovstack, &iov, 1);
1115 if (tot_len == 0) { 1112 if (tot_len == 0) {
1116 ret = 0; 1113 ret = 0;
1117 goto out; 1114 goto out;
diff --git a/fs/dcache.c b/fs/dcache.c
index a88948b8bd1..a901c6901bc 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -225,7 +225,7 @@ static void dentry_unlink_inode(struct dentry * dentry)
225} 225}
226 226
227/* 227/*
228 * dentry_lru_(add|del|move_tail) must be called with d_lock held. 228 * dentry_lru_(add|del|prune|move_tail) must be called with d_lock held.
229 */ 229 */
230static void dentry_lru_add(struct dentry *dentry) 230static void dentry_lru_add(struct dentry *dentry)
231{ 231{
@@ -245,6 +245,9 @@ static void __dentry_lru_del(struct dentry *dentry)
245 dentry_stat.nr_unused--; 245 dentry_stat.nr_unused--;
246} 246}
247 247
248/*
249 * Remove a dentry with references from the LRU.
250 */
248static void dentry_lru_del(struct dentry *dentry) 251static void dentry_lru_del(struct dentry *dentry)
249{ 252{
250 if (!list_empty(&dentry->d_lru)) { 253 if (!list_empty(&dentry->d_lru)) {
@@ -254,6 +257,23 @@ static void dentry_lru_del(struct dentry *dentry)
254 } 257 }
255} 258}
256 259
260/*
261 * Remove a dentry that is unreferenced and about to be pruned
262 * (unhashed and destroyed) from the LRU, and inform the file system.
263 * This wrapper should be called _prior_ to unhashing a victim dentry.
264 */
265static void dentry_lru_prune(struct dentry *dentry)
266{
267 if (!list_empty(&dentry->d_lru)) {
268 if (dentry->d_flags & DCACHE_OP_PRUNE)
269 dentry->d_op->d_prune(dentry);
270
271 spin_lock(&dcache_lru_lock);
272 __dentry_lru_del(dentry);
273 spin_unlock(&dcache_lru_lock);
274 }
275}
276
257static void dentry_lru_move_tail(struct dentry *dentry) 277static void dentry_lru_move_tail(struct dentry *dentry)
258{ 278{
259 spin_lock(&dcache_lru_lock); 279 spin_lock(&dcache_lru_lock);
@@ -403,8 +423,12 @@ relock:
403 423
404 if (ref) 424 if (ref)
405 dentry->d_count--; 425 dentry->d_count--;
406 /* if dentry was on the d_lru list delete it from there */ 426 /*
407 dentry_lru_del(dentry); 427 * if dentry was on the d_lru list delete it from there.
428 * inform the fs via d_prune that this dentry is about to be
429 * unhashed and destroyed.
430 */
431 dentry_lru_prune(dentry);
408 /* if it was on the hash then remove it */ 432 /* if it was on the hash then remove it */
409 __d_drop(dentry); 433 __d_drop(dentry);
410 return d_kill(dentry, parent); 434 return d_kill(dentry, parent);
@@ -522,9 +546,11 @@ int d_invalidate(struct dentry * dentry)
522 * would make it unreachable from the root, 546 * would make it unreachable from the root,
523 * we might still populate it if it was a 547 * we might still populate it if it was a
524 * working directory or similar). 548 * working directory or similar).
549 * We also need to leave mountpoints alone,
550 * directory or not.
525 */ 551 */
526 if (dentry->d_count > 1) { 552 if (dentry->d_count > 1 && dentry->d_inode) {
527 if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) { 553 if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) {
528 spin_unlock(&dentry->d_lock); 554 spin_unlock(&dentry->d_lock);
529 return -EBUSY; 555 return -EBUSY;
530 } 556 }
@@ -854,8 +880,12 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
854 do { 880 do {
855 struct inode *inode; 881 struct inode *inode;
856 882
857 /* detach from the system */ 883 /*
858 dentry_lru_del(dentry); 884 * remove the dentry from the lru, and inform
885 * the fs that this dentry is about to be
886 * unhashed and destroyed.
887 */
888 dentry_lru_prune(dentry);
859 __d_shrink(dentry); 889 __d_shrink(dentry);
860 890
861 if (dentry->d_count != 0) { 891 if (dentry->d_count != 0) {
@@ -1283,6 +1313,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
1283 dentry->d_flags |= DCACHE_OP_REVALIDATE; 1313 dentry->d_flags |= DCACHE_OP_REVALIDATE;
1284 if (op->d_delete) 1314 if (op->d_delete)
1285 dentry->d_flags |= DCACHE_OP_DELETE; 1315 dentry->d_flags |= DCACHE_OP_DELETE;
1316 if (op->d_prune)
1317 dentry->d_flags |= DCACHE_OP_PRUNE;
1286 1318
1287} 1319}
1288EXPORT_SYMBOL(d_set_d_op); 1320EXPORT_SYMBOL(d_set_d_op);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index e7a7a2f0732..f3a257d7a98 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * file.c - part of debugfs, a tiny little debug file system 2 * inode.c - part of debugfs, a tiny little debug file system
3 * 3 *
4 * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> 4 * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com>
5 * Copyright (C) 2004 IBM Inc. 5 * Copyright (C) 2004 IBM Inc.
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 2f27e578d46..d5d5297efe9 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -307,7 +307,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
307 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; 307 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
308 inode->i_op = &simple_dir_inode_operations; 308 inode->i_op = &simple_dir_inode_operations;
309 inode->i_fop = &simple_dir_operations; 309 inode->i_fop = &simple_dir_operations;
310 inode->i_nlink = 2; 310 set_nlink(inode, 2);
311 311
312 s->s_root = d_alloc_root(inode); 312 s->s_root = d_alloc_root(inode);
313 if (s->s_root) 313 if (s->s_root)
@@ -549,7 +549,7 @@ void devpts_pty_kill(struct tty_struct *tty)
549 549
550 dentry = d_find_alias(inode); 550 dentry = d_find_alias(inode);
551 551
552 inode->i_nlink--; 552 drop_nlink(inode);
553 d_delete(dentry); 553 d_delete(dentry);
554 dput(dentry); /* d_alloc_name() in devpts_pty_new() */ 554 dput(dentry); /* d_alloc_name() in devpts_pty_new() */
555 dput(dentry); /* d_find_alias above */ 555 dput(dentry); /* d_find_alias above */
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 44a360ca804..d740ab67ff6 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -39,7 +39,7 @@
39 39
40/* 40/*
41 * How many user pages to map in one call to get_user_pages(). This determines 41 * How many user pages to map in one call to get_user_pages(). This determines
42 * the size of a structure on the stack. 42 * the size of a structure in the slab cache
43 */ 43 */
44#define DIO_PAGES 64 44#define DIO_PAGES 64
45 45
@@ -55,13 +55,10 @@
55 * blocksize. 55 * blocksize.
56 */ 56 */
57 57
58struct dio { 58/* dio_state only used in the submission path */
59 /* BIO submission state */ 59
60struct dio_submit {
60 struct bio *bio; /* bio under assembly */ 61 struct bio *bio; /* bio under assembly */
61 struct inode *inode;
62 int rw;
63 loff_t i_size; /* i_size when submitted */
64 int flags; /* doesn't change */
65 unsigned blkbits; /* doesn't change */ 62 unsigned blkbits; /* doesn't change */
66 unsigned blkfactor; /* When we're using an alignment which 63 unsigned blkfactor; /* When we're using an alignment which
67 is finer than the filesystem's soft 64 is finer than the filesystem's soft
@@ -76,18 +73,17 @@ struct dio {
76 sector_t block_in_file; /* Current offset into the underlying 73 sector_t block_in_file; /* Current offset into the underlying
77 file in dio_block units. */ 74 file in dio_block units. */
78 unsigned blocks_available; /* At block_in_file. changes */ 75 unsigned blocks_available; /* At block_in_file. changes */
76 int reap_counter; /* rate limit reaping */
79 sector_t final_block_in_request;/* doesn't change */ 77 sector_t final_block_in_request;/* doesn't change */
80 unsigned first_block_in_page; /* doesn't change, Used only once */ 78 unsigned first_block_in_page; /* doesn't change, Used only once */
81 int boundary; /* prev block is at a boundary */ 79 int boundary; /* prev block is at a boundary */
82 int reap_counter; /* rate limit reaping */
83 get_block_t *get_block; /* block mapping function */ 80 get_block_t *get_block; /* block mapping function */
84 dio_iodone_t *end_io; /* IO completion function */
85 dio_submit_t *submit_io; /* IO submition function */ 81 dio_submit_t *submit_io; /* IO submition function */
82
86 loff_t logical_offset_in_bio; /* current first logical block in bio */ 83 loff_t logical_offset_in_bio; /* current first logical block in bio */
87 sector_t final_block_in_bio; /* current final block in bio + 1 */ 84 sector_t final_block_in_bio; /* current final block in bio + 1 */
88 sector_t next_block_for_io; /* next block to be put under IO, 85 sector_t next_block_for_io; /* next block to be put under IO,
89 in dio_blocks units */ 86 in dio_blocks units */
90 struct buffer_head map_bh; /* last get_block() result */
91 87
92 /* 88 /*
93 * Deferred addition of a page to the dio. These variables are 89 * Deferred addition of a page to the dio. These variables are
@@ -100,18 +96,6 @@ struct dio {
100 sector_t cur_page_block; /* Where it starts */ 96 sector_t cur_page_block; /* Where it starts */
101 loff_t cur_page_fs_offset; /* Offset in file */ 97 loff_t cur_page_fs_offset; /* Offset in file */
102 98
103 /* BIO completion state */
104 spinlock_t bio_lock; /* protects BIO fields below */
105 unsigned long refcount; /* direct_io_worker() and bios */
106 struct bio *bio_list; /* singly linked via bi_private */
107 struct task_struct *waiter; /* waiting task (NULL if none) */
108
109 /* AIO related stuff */
110 struct kiocb *iocb; /* kiocb */
111 int is_async; /* is IO async ? */
112 int io_error; /* IO error in completion path */
113 ssize_t result; /* IO result */
114
115 /* 99 /*
116 * Page fetching state. These variables belong to dio_refill_pages(). 100 * Page fetching state. These variables belong to dio_refill_pages().
117 */ 101 */
@@ -125,7 +109,30 @@ struct dio {
125 */ 109 */
126 unsigned head; /* next page to process */ 110 unsigned head; /* next page to process */
127 unsigned tail; /* last valid page + 1 */ 111 unsigned tail; /* last valid page + 1 */
112};
113
114/* dio_state communicated between submission path and end_io */
115struct dio {
116 int flags; /* doesn't change */
117 int rw;
118 struct inode *inode;
119 loff_t i_size; /* i_size when submitted */
120 dio_iodone_t *end_io; /* IO completion function */
121
122 void *private; /* copy from map_bh.b_private */
123
124 /* BIO completion state */
125 spinlock_t bio_lock; /* protects BIO fields below */
128 int page_errors; /* errno from get_user_pages() */ 126 int page_errors; /* errno from get_user_pages() */
127 int is_async; /* is IO async ? */
128 int io_error; /* IO error in completion path */
129 unsigned long refcount; /* direct_io_worker() and bios */
130 struct bio *bio_list; /* singly linked via bi_private */
131 struct task_struct *waiter; /* waiting task (NULL if none) */
132
133 /* AIO related stuff */
134 struct kiocb *iocb; /* kiocb */
135 ssize_t result; /* IO result */
129 136
130 /* 137 /*
131 * pages[] (and any fields placed after it) are not zeroed out at 138 * pages[] (and any fields placed after it) are not zeroed out at
@@ -133,7 +140,9 @@ struct dio {
133 * wish that they not be zeroed. 140 * wish that they not be zeroed.
134 */ 141 */
135 struct page *pages[DIO_PAGES]; /* page buffer */ 142 struct page *pages[DIO_PAGES]; /* page buffer */
136}; 143} ____cacheline_aligned_in_smp;
144
145static struct kmem_cache *dio_cache __read_mostly;
137 146
138static void __inode_dio_wait(struct inode *inode) 147static void __inode_dio_wait(struct inode *inode)
139{ 148{
@@ -182,27 +191,27 @@ EXPORT_SYMBOL_GPL(inode_dio_done);
182/* 191/*
183 * How many pages are in the queue? 192 * How many pages are in the queue?
184 */ 193 */
185static inline unsigned dio_pages_present(struct dio *dio) 194static inline unsigned dio_pages_present(struct dio_submit *sdio)
186{ 195{
187 return dio->tail - dio->head; 196 return sdio->tail - sdio->head;
188} 197}
189 198
190/* 199/*
191 * Go grab and pin some userspace pages. Typically we'll get 64 at a time. 200 * Go grab and pin some userspace pages. Typically we'll get 64 at a time.
192 */ 201 */
193static int dio_refill_pages(struct dio *dio) 202static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
194{ 203{
195 int ret; 204 int ret;
196 int nr_pages; 205 int nr_pages;
197 206
198 nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES); 207 nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES);
199 ret = get_user_pages_fast( 208 ret = get_user_pages_fast(
200 dio->curr_user_address, /* Where from? */ 209 sdio->curr_user_address, /* Where from? */
201 nr_pages, /* How many pages? */ 210 nr_pages, /* How many pages? */
202 dio->rw == READ, /* Write to memory? */ 211 dio->rw == READ, /* Write to memory? */
203 &dio->pages[0]); /* Put results here */ 212 &dio->pages[0]); /* Put results here */
204 213
205 if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) { 214 if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
206 struct page *page = ZERO_PAGE(0); 215 struct page *page = ZERO_PAGE(0);
207 /* 216 /*
208 * A memory fault, but the filesystem has some outstanding 217 * A memory fault, but the filesystem has some outstanding
@@ -213,17 +222,17 @@ static int dio_refill_pages(struct dio *dio)
213 dio->page_errors = ret; 222 dio->page_errors = ret;
214 page_cache_get(page); 223 page_cache_get(page);
215 dio->pages[0] = page; 224 dio->pages[0] = page;
216 dio->head = 0; 225 sdio->head = 0;
217 dio->tail = 1; 226 sdio->tail = 1;
218 ret = 0; 227 ret = 0;
219 goto out; 228 goto out;
220 } 229 }
221 230
222 if (ret >= 0) { 231 if (ret >= 0) {
223 dio->curr_user_address += ret * PAGE_SIZE; 232 sdio->curr_user_address += ret * PAGE_SIZE;
224 dio->curr_page += ret; 233 sdio->curr_page += ret;
225 dio->head = 0; 234 sdio->head = 0;
226 dio->tail = ret; 235 sdio->tail = ret;
227 ret = 0; 236 ret = 0;
228 } 237 }
229out: 238out:
@@ -236,17 +245,18 @@ out:
236 * decent number of pages, less frequently. To provide nicer use of the 245 * decent number of pages, less frequently. To provide nicer use of the
237 * L1 cache. 246 * L1 cache.
238 */ 247 */
239static struct page *dio_get_page(struct dio *dio) 248static inline struct page *dio_get_page(struct dio *dio,
249 struct dio_submit *sdio)
240{ 250{
241 if (dio_pages_present(dio) == 0) { 251 if (dio_pages_present(sdio) == 0) {
242 int ret; 252 int ret;
243 253
244 ret = dio_refill_pages(dio); 254 ret = dio_refill_pages(dio, sdio);
245 if (ret) 255 if (ret)
246 return ERR_PTR(ret); 256 return ERR_PTR(ret);
247 BUG_ON(dio_pages_present(dio) == 0); 257 BUG_ON(dio_pages_present(sdio) == 0);
248 } 258 }
249 return dio->pages[dio->head++]; 259 return dio->pages[sdio->head++];
250} 260}
251 261
252/** 262/**
@@ -292,7 +302,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
292 302
293 if (dio->end_io && dio->result) { 303 if (dio->end_io && dio->result) {
294 dio->end_io(dio->iocb, offset, transferred, 304 dio->end_io(dio->iocb, offset, transferred,
295 dio->map_bh.b_private, ret, is_async); 305 dio->private, ret, is_async);
296 } else { 306 } else {
297 if (is_async) 307 if (is_async)
298 aio_complete(dio->iocb, ret, 0); 308 aio_complete(dio->iocb, ret, 0);
@@ -323,7 +333,7 @@ static void dio_bio_end_aio(struct bio *bio, int error)
323 333
324 if (remaining == 0) { 334 if (remaining == 0) {
325 dio_complete(dio, dio->iocb->ki_pos, 0, true); 335 dio_complete(dio, dio->iocb->ki_pos, 0, true);
326 kfree(dio); 336 kmem_cache_free(dio_cache, dio);
327 } 337 }
328} 338}
329 339
@@ -367,9 +377,10 @@ void dio_end_io(struct bio *bio, int error)
367} 377}
368EXPORT_SYMBOL_GPL(dio_end_io); 378EXPORT_SYMBOL_GPL(dio_end_io);
369 379
370static void 380static inline void
371dio_bio_alloc(struct dio *dio, struct block_device *bdev, 381dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
372 sector_t first_sector, int nr_vecs) 382 struct block_device *bdev,
383 sector_t first_sector, int nr_vecs)
373{ 384{
374 struct bio *bio; 385 struct bio *bio;
375 386
@@ -386,8 +397,8 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
386 else 397 else
387 bio->bi_end_io = dio_bio_end_io; 398 bio->bi_end_io = dio_bio_end_io;
388 399
389 dio->bio = bio; 400 sdio->bio = bio;
390 dio->logical_offset_in_bio = dio->cur_page_fs_offset; 401 sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
391} 402}
392 403
393/* 404/*
@@ -397,9 +408,9 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
397 * 408 *
398 * bios hold a dio reference between submit_bio and ->end_io. 409 * bios hold a dio reference between submit_bio and ->end_io.
399 */ 410 */
400static void dio_bio_submit(struct dio *dio) 411static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
401{ 412{
402 struct bio *bio = dio->bio; 413 struct bio *bio = sdio->bio;
403 unsigned long flags; 414 unsigned long flags;
404 415
405 bio->bi_private = dio; 416 bio->bi_private = dio;
@@ -411,24 +422,24 @@ static void dio_bio_submit(struct dio *dio)
411 if (dio->is_async && dio->rw == READ) 422 if (dio->is_async && dio->rw == READ)
412 bio_set_pages_dirty(bio); 423 bio_set_pages_dirty(bio);
413 424
414 if (dio->submit_io) 425 if (sdio->submit_io)
415 dio->submit_io(dio->rw, bio, dio->inode, 426 sdio->submit_io(dio->rw, bio, dio->inode,
416 dio->logical_offset_in_bio); 427 sdio->logical_offset_in_bio);
417 else 428 else
418 submit_bio(dio->rw, bio); 429 submit_bio(dio->rw, bio);
419 430
420 dio->bio = NULL; 431 sdio->bio = NULL;
421 dio->boundary = 0; 432 sdio->boundary = 0;
422 dio->logical_offset_in_bio = 0; 433 sdio->logical_offset_in_bio = 0;
423} 434}
424 435
425/* 436/*
426 * Release any resources in case of a failure 437 * Release any resources in case of a failure
427 */ 438 */
428static void dio_cleanup(struct dio *dio) 439static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
429{ 440{
430 while (dio_pages_present(dio)) 441 while (dio_pages_present(sdio))
431 page_cache_release(dio_get_page(dio)); 442 page_cache_release(dio_get_page(dio, sdio));
432} 443}
433 444
434/* 445/*
@@ -518,11 +529,11 @@ static void dio_await_completion(struct dio *dio)
518 * 529 *
519 * This also helps to limit the peak amount of pinned userspace memory. 530 * This also helps to limit the peak amount of pinned userspace memory.
520 */ 531 */
521static int dio_bio_reap(struct dio *dio) 532static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
522{ 533{
523 int ret = 0; 534 int ret = 0;
524 535
525 if (dio->reap_counter++ >= 64) { 536 if (sdio->reap_counter++ >= 64) {
526 while (dio->bio_list) { 537 while (dio->bio_list) {
527 unsigned long flags; 538 unsigned long flags;
528 struct bio *bio; 539 struct bio *bio;
@@ -536,14 +547,14 @@ static int dio_bio_reap(struct dio *dio)
536 if (ret == 0) 547 if (ret == 0)
537 ret = ret2; 548 ret = ret2;
538 } 549 }
539 dio->reap_counter = 0; 550 sdio->reap_counter = 0;
540 } 551 }
541 return ret; 552 return ret;
542} 553}
543 554
544/* 555/*
545 * Call into the fs to map some more disk blocks. We record the current number 556 * Call into the fs to map some more disk blocks. We record the current number
546 * of available blocks at dio->blocks_available. These are in units of the 557 * of available blocks at sdio->blocks_available. These are in units of the
547 * fs blocksize, (1 << inode->i_blkbits). 558 * fs blocksize, (1 << inode->i_blkbits).
548 * 559 *
549 * The fs is allowed to map lots of blocks at once. If it wants to do that, 560 * The fs is allowed to map lots of blocks at once. If it wants to do that,
@@ -564,10 +575,10 @@ static int dio_bio_reap(struct dio *dio)
564 * buffer_mapped(). However the direct-io code will only process holes one 575 * buffer_mapped(). However the direct-io code will only process holes one
565 * block at a time - it will repeatedly call get_block() as it walks the hole. 576 * block at a time - it will repeatedly call get_block() as it walks the hole.
566 */ 577 */
567static int get_more_blocks(struct dio *dio) 578static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
579 struct buffer_head *map_bh)
568{ 580{
569 int ret; 581 int ret;
570 struct buffer_head *map_bh = &dio->map_bh;
571 sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ 582 sector_t fs_startblk; /* Into file, in filesystem-sized blocks */
572 unsigned long fs_count; /* Number of filesystem-sized blocks */ 583 unsigned long fs_count; /* Number of filesystem-sized blocks */
573 unsigned long dio_count;/* Number of dio_block-sized blocks */ 584 unsigned long dio_count;/* Number of dio_block-sized blocks */
@@ -580,11 +591,11 @@ static int get_more_blocks(struct dio *dio)
580 */ 591 */
581 ret = dio->page_errors; 592 ret = dio->page_errors;
582 if (ret == 0) { 593 if (ret == 0) {
583 BUG_ON(dio->block_in_file >= dio->final_block_in_request); 594 BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
584 fs_startblk = dio->block_in_file >> dio->blkfactor; 595 fs_startblk = sdio->block_in_file >> sdio->blkfactor;
585 dio_count = dio->final_block_in_request - dio->block_in_file; 596 dio_count = sdio->final_block_in_request - sdio->block_in_file;
586 fs_count = dio_count >> dio->blkfactor; 597 fs_count = dio_count >> sdio->blkfactor;
587 blkmask = (1 << dio->blkfactor) - 1; 598 blkmask = (1 << sdio->blkfactor) - 1;
588 if (dio_count & blkmask) 599 if (dio_count & blkmask)
589 fs_count++; 600 fs_count++;
590 601
@@ -604,13 +615,16 @@ static int get_more_blocks(struct dio *dio)
604 */ 615 */
605 create = dio->rw & WRITE; 616 create = dio->rw & WRITE;
606 if (dio->flags & DIO_SKIP_HOLES) { 617 if (dio->flags & DIO_SKIP_HOLES) {
607 if (dio->block_in_file < (i_size_read(dio->inode) >> 618 if (sdio->block_in_file < (i_size_read(dio->inode) >>
608 dio->blkbits)) 619 sdio->blkbits))
609 create = 0; 620 create = 0;
610 } 621 }
611 622
612 ret = (*dio->get_block)(dio->inode, fs_startblk, 623 ret = (*sdio->get_block)(dio->inode, fs_startblk,
613 map_bh, create); 624 map_bh, create);
625
626 /* Store for completion */
627 dio->private = map_bh->b_private;
614 } 628 }
615 return ret; 629 return ret;
616} 630}
@@ -618,20 +632,21 @@ static int get_more_blocks(struct dio *dio)
618/* 632/*
619 * There is no bio. Make one now. 633 * There is no bio. Make one now.
620 */ 634 */
621static int dio_new_bio(struct dio *dio, sector_t start_sector) 635static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
636 sector_t start_sector, struct buffer_head *map_bh)
622{ 637{
623 sector_t sector; 638 sector_t sector;
624 int ret, nr_pages; 639 int ret, nr_pages;
625 640
626 ret = dio_bio_reap(dio); 641 ret = dio_bio_reap(dio, sdio);
627 if (ret) 642 if (ret)
628 goto out; 643 goto out;
629 sector = start_sector << (dio->blkbits - 9); 644 sector = start_sector << (sdio->blkbits - 9);
630 nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev)); 645 nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev));
631 nr_pages = min(nr_pages, BIO_MAX_PAGES); 646 nr_pages = min(nr_pages, BIO_MAX_PAGES);
632 BUG_ON(nr_pages <= 0); 647 BUG_ON(nr_pages <= 0);
633 dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages); 648 dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
634 dio->boundary = 0; 649 sdio->boundary = 0;
635out: 650out:
636 return ret; 651 return ret;
637} 652}
@@ -643,21 +658,21 @@ out:
643 * 658 *
644 * Return zero on success. Non-zero means the caller needs to start a new BIO. 659 * Return zero on success. Non-zero means the caller needs to start a new BIO.
645 */ 660 */
646static int dio_bio_add_page(struct dio *dio) 661static inline int dio_bio_add_page(struct dio_submit *sdio)
647{ 662{
648 int ret; 663 int ret;
649 664
650 ret = bio_add_page(dio->bio, dio->cur_page, 665 ret = bio_add_page(sdio->bio, sdio->cur_page,
651 dio->cur_page_len, dio->cur_page_offset); 666 sdio->cur_page_len, sdio->cur_page_offset);
652 if (ret == dio->cur_page_len) { 667 if (ret == sdio->cur_page_len) {
653 /* 668 /*
654 * Decrement count only, if we are done with this page 669 * Decrement count only, if we are done with this page
655 */ 670 */
656 if ((dio->cur_page_len + dio->cur_page_offset) == PAGE_SIZE) 671 if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE)
657 dio->pages_in_io--; 672 sdio->pages_in_io--;
658 page_cache_get(dio->cur_page); 673 page_cache_get(sdio->cur_page);
659 dio->final_block_in_bio = dio->cur_page_block + 674 sdio->final_block_in_bio = sdio->cur_page_block +
660 (dio->cur_page_len >> dio->blkbits); 675 (sdio->cur_page_len >> sdio->blkbits);
661 ret = 0; 676 ret = 0;
662 } else { 677 } else {
663 ret = 1; 678 ret = 1;
@@ -675,14 +690,15 @@ static int dio_bio_add_page(struct dio *dio)
675 * The caller of this function is responsible for removing cur_page from the 690 * The caller of this function is responsible for removing cur_page from the
676 * dio, and for dropping the refcount which came from that presence. 691 * dio, and for dropping the refcount which came from that presence.
677 */ 692 */
678static int dio_send_cur_page(struct dio *dio) 693static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
694 struct buffer_head *map_bh)
679{ 695{
680 int ret = 0; 696 int ret = 0;
681 697
682 if (dio->bio) { 698 if (sdio->bio) {
683 loff_t cur_offset = dio->cur_page_fs_offset; 699 loff_t cur_offset = sdio->cur_page_fs_offset;
684 loff_t bio_next_offset = dio->logical_offset_in_bio + 700 loff_t bio_next_offset = sdio->logical_offset_in_bio +
685 dio->bio->bi_size; 701 sdio->bio->bi_size;
686 702
687 /* 703 /*
688 * See whether this new request is contiguous with the old. 704 * See whether this new request is contiguous with the old.
@@ -698,28 +714,28 @@ static int dio_send_cur_page(struct dio *dio)
698 * be the next logical offset in the bio, submit the bio we 714 * be the next logical offset in the bio, submit the bio we
699 * have. 715 * have.
700 */ 716 */
701 if (dio->final_block_in_bio != dio->cur_page_block || 717 if (sdio->final_block_in_bio != sdio->cur_page_block ||
702 cur_offset != bio_next_offset) 718 cur_offset != bio_next_offset)
703 dio_bio_submit(dio); 719 dio_bio_submit(dio, sdio);
704 /* 720 /*
705 * Submit now if the underlying fs is about to perform a 721 * Submit now if the underlying fs is about to perform a
706 * metadata read 722 * metadata read
707 */ 723 */
708 else if (dio->boundary) 724 else if (sdio->boundary)
709 dio_bio_submit(dio); 725 dio_bio_submit(dio, sdio);
710 } 726 }
711 727
712 if (dio->bio == NULL) { 728 if (sdio->bio == NULL) {
713 ret = dio_new_bio(dio, dio->cur_page_block); 729 ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
714 if (ret) 730 if (ret)
715 goto out; 731 goto out;
716 } 732 }
717 733
718 if (dio_bio_add_page(dio) != 0) { 734 if (dio_bio_add_page(sdio) != 0) {
719 dio_bio_submit(dio); 735 dio_bio_submit(dio, sdio);
720 ret = dio_new_bio(dio, dio->cur_page_block); 736 ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
721 if (ret == 0) { 737 if (ret == 0) {
722 ret = dio_bio_add_page(dio); 738 ret = dio_bio_add_page(sdio);
723 BUG_ON(ret != 0); 739 BUG_ON(ret != 0);
724 } 740 }
725 } 741 }
@@ -744,9 +760,10 @@ out:
744 * If that doesn't work out then we put the old page into the bio and add this 760 * If that doesn't work out then we put the old page into the bio and add this
745 * page to the dio instead. 761 * page to the dio instead.
746 */ 762 */
747static int 763static inline int
748submit_page_section(struct dio *dio, struct page *page, 764submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
749 unsigned offset, unsigned len, sector_t blocknr) 765 unsigned offset, unsigned len, sector_t blocknr,
766 struct buffer_head *map_bh)
750{ 767{
751 int ret = 0; 768 int ret = 0;
752 769
@@ -760,20 +777,20 @@ submit_page_section(struct dio *dio, struct page *page,
760 /* 777 /*
761 * Can we just grow the current page's presence in the dio? 778 * Can we just grow the current page's presence in the dio?
762 */ 779 */
763 if ( (dio->cur_page == page) && 780 if (sdio->cur_page == page &&
764 (dio->cur_page_offset + dio->cur_page_len == offset) && 781 sdio->cur_page_offset + sdio->cur_page_len == offset &&
765 (dio->cur_page_block + 782 sdio->cur_page_block +
766 (dio->cur_page_len >> dio->blkbits) == blocknr)) { 783 (sdio->cur_page_len >> sdio->blkbits) == blocknr) {
767 dio->cur_page_len += len; 784 sdio->cur_page_len += len;
768 785
769 /* 786 /*
770 * If dio->boundary then we want to schedule the IO now to 787 * If sdio->boundary then we want to schedule the IO now to
771 * avoid metadata seeks. 788 * avoid metadata seeks.
772 */ 789 */
773 if (dio->boundary) { 790 if (sdio->boundary) {
774 ret = dio_send_cur_page(dio); 791 ret = dio_send_cur_page(dio, sdio, map_bh);
775 page_cache_release(dio->cur_page); 792 page_cache_release(sdio->cur_page);
776 dio->cur_page = NULL; 793 sdio->cur_page = NULL;
777 } 794 }
778 goto out; 795 goto out;
779 } 796 }
@@ -781,20 +798,20 @@ submit_page_section(struct dio *dio, struct page *page,
781 /* 798 /*
782 * If there's a deferred page already there then send it. 799 * If there's a deferred page already there then send it.
783 */ 800 */
784 if (dio->cur_page) { 801 if (sdio->cur_page) {
785 ret = dio_send_cur_page(dio); 802 ret = dio_send_cur_page(dio, sdio, map_bh);
786 page_cache_release(dio->cur_page); 803 page_cache_release(sdio->cur_page);
787 dio->cur_page = NULL; 804 sdio->cur_page = NULL;
788 if (ret) 805 if (ret)
789 goto out; 806 goto out;
790 } 807 }
791 808
792 page_cache_get(page); /* It is in dio */ 809 page_cache_get(page); /* It is in dio */
793 dio->cur_page = page; 810 sdio->cur_page = page;
794 dio->cur_page_offset = offset; 811 sdio->cur_page_offset = offset;
795 dio->cur_page_len = len; 812 sdio->cur_page_len = len;
796 dio->cur_page_block = blocknr; 813 sdio->cur_page_block = blocknr;
797 dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits; 814 sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
798out: 815out:
799 return ret; 816 return ret;
800} 817}
@@ -804,16 +821,16 @@ out:
804 * file blocks. Only called for S_ISREG files - blockdevs do not set 821 * file blocks. Only called for S_ISREG files - blockdevs do not set
805 * buffer_new 822 * buffer_new
806 */ 823 */
807static void clean_blockdev_aliases(struct dio *dio) 824static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh)
808{ 825{
809 unsigned i; 826 unsigned i;
810 unsigned nblocks; 827 unsigned nblocks;
811 828
812 nblocks = dio->map_bh.b_size >> dio->inode->i_blkbits; 829 nblocks = map_bh->b_size >> dio->inode->i_blkbits;
813 830
814 for (i = 0; i < nblocks; i++) { 831 for (i = 0; i < nblocks; i++) {
815 unmap_underlying_metadata(dio->map_bh.b_bdev, 832 unmap_underlying_metadata(map_bh->b_bdev,
816 dio->map_bh.b_blocknr + i); 833 map_bh->b_blocknr + i);
817 } 834 }
818} 835}
819 836
@@ -826,19 +843,20 @@ static void clean_blockdev_aliases(struct dio *dio)
826 * `end' is zero if we're doing the start of the IO, 1 at the end of the 843 * `end' is zero if we're doing the start of the IO, 1 at the end of the
827 * IO. 844 * IO.
828 */ 845 */
829static void dio_zero_block(struct dio *dio, int end) 846static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio,
847 int end, struct buffer_head *map_bh)
830{ 848{
831 unsigned dio_blocks_per_fs_block; 849 unsigned dio_blocks_per_fs_block;
832 unsigned this_chunk_blocks; /* In dio_blocks */ 850 unsigned this_chunk_blocks; /* In dio_blocks */
833 unsigned this_chunk_bytes; 851 unsigned this_chunk_bytes;
834 struct page *page; 852 struct page *page;
835 853
836 dio->start_zero_done = 1; 854 sdio->start_zero_done = 1;
837 if (!dio->blkfactor || !buffer_new(&dio->map_bh)) 855 if (!sdio->blkfactor || !buffer_new(map_bh))
838 return; 856 return;
839 857
840 dio_blocks_per_fs_block = 1 << dio->blkfactor; 858 dio_blocks_per_fs_block = 1 << sdio->blkfactor;
841 this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1); 859 this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1);
842 860
843 if (!this_chunk_blocks) 861 if (!this_chunk_blocks)
844 return; 862 return;
@@ -850,14 +868,14 @@ static void dio_zero_block(struct dio *dio, int end)
850 if (end) 868 if (end)
851 this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks; 869 this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
852 870
853 this_chunk_bytes = this_chunk_blocks << dio->blkbits; 871 this_chunk_bytes = this_chunk_blocks << sdio->blkbits;
854 872
855 page = ZERO_PAGE(0); 873 page = ZERO_PAGE(0);
856 if (submit_page_section(dio, page, 0, this_chunk_bytes, 874 if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes,
857 dio->next_block_for_io)) 875 sdio->next_block_for_io, map_bh))
858 return; 876 return;
859 877
860 dio->next_block_for_io += this_chunk_blocks; 878 sdio->next_block_for_io += this_chunk_blocks;
861} 879}
862 880
863/* 881/*
@@ -876,20 +894,20 @@ static void dio_zero_block(struct dio *dio, int end)
876 * it should set b_size to PAGE_SIZE or more inside get_block(). This gives 894 * it should set b_size to PAGE_SIZE or more inside get_block(). This gives
877 * fine alignment but still allows this function to work in PAGE_SIZE units. 895 * fine alignment but still allows this function to work in PAGE_SIZE units.
878 */ 896 */
879static int do_direct_IO(struct dio *dio) 897static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
898 struct buffer_head *map_bh)
880{ 899{
881 const unsigned blkbits = dio->blkbits; 900 const unsigned blkbits = sdio->blkbits;
882 const unsigned blocks_per_page = PAGE_SIZE >> blkbits; 901 const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
883 struct page *page; 902 struct page *page;
884 unsigned block_in_page; 903 unsigned block_in_page;
885 struct buffer_head *map_bh = &dio->map_bh;
886 int ret = 0; 904 int ret = 0;
887 905
888 /* The I/O can start at any block offset within the first page */ 906 /* The I/O can start at any block offset within the first page */
889 block_in_page = dio->first_block_in_page; 907 block_in_page = sdio->first_block_in_page;
890 908
891 while (dio->block_in_file < dio->final_block_in_request) { 909 while (sdio->block_in_file < sdio->final_block_in_request) {
892 page = dio_get_page(dio); 910 page = dio_get_page(dio, sdio);
893 if (IS_ERR(page)) { 911 if (IS_ERR(page)) {
894 ret = PTR_ERR(page); 912 ret = PTR_ERR(page);
895 goto out; 913 goto out;
@@ -901,14 +919,14 @@ static int do_direct_IO(struct dio *dio)
901 unsigned this_chunk_blocks; /* # of blocks */ 919 unsigned this_chunk_blocks; /* # of blocks */
902 unsigned u; 920 unsigned u;
903 921
904 if (dio->blocks_available == 0) { 922 if (sdio->blocks_available == 0) {
905 /* 923 /*
906 * Need to go and map some more disk 924 * Need to go and map some more disk
907 */ 925 */
908 unsigned long blkmask; 926 unsigned long blkmask;
909 unsigned long dio_remainder; 927 unsigned long dio_remainder;
910 928
911 ret = get_more_blocks(dio); 929 ret = get_more_blocks(dio, sdio, map_bh);
912 if (ret) { 930 if (ret) {
913 page_cache_release(page); 931 page_cache_release(page);
914 goto out; 932 goto out;
@@ -916,18 +934,18 @@ static int do_direct_IO(struct dio *dio)
916 if (!buffer_mapped(map_bh)) 934 if (!buffer_mapped(map_bh))
917 goto do_holes; 935 goto do_holes;
918 936
919 dio->blocks_available = 937 sdio->blocks_available =
920 map_bh->b_size >> dio->blkbits; 938 map_bh->b_size >> sdio->blkbits;
921 dio->next_block_for_io = 939 sdio->next_block_for_io =
922 map_bh->b_blocknr << dio->blkfactor; 940 map_bh->b_blocknr << sdio->blkfactor;
923 if (buffer_new(map_bh)) 941 if (buffer_new(map_bh))
924 clean_blockdev_aliases(dio); 942 clean_blockdev_aliases(dio, map_bh);
925 943
926 if (!dio->blkfactor) 944 if (!sdio->blkfactor)
927 goto do_holes; 945 goto do_holes;
928 946
929 blkmask = (1 << dio->blkfactor) - 1; 947 blkmask = (1 << sdio->blkfactor) - 1;
930 dio_remainder = (dio->block_in_file & blkmask); 948 dio_remainder = (sdio->block_in_file & blkmask);
931 949
932 /* 950 /*
933 * If we are at the start of IO and that IO 951 * If we are at the start of IO and that IO
@@ -941,8 +959,8 @@ static int do_direct_IO(struct dio *dio)
941 * on-disk 959 * on-disk
942 */ 960 */
943 if (!buffer_new(map_bh)) 961 if (!buffer_new(map_bh))
944 dio->next_block_for_io += dio_remainder; 962 sdio->next_block_for_io += dio_remainder;
945 dio->blocks_available -= dio_remainder; 963 sdio->blocks_available -= dio_remainder;
946 } 964 }
947do_holes: 965do_holes:
948 /* Handle holes */ 966 /* Handle holes */
@@ -961,7 +979,7 @@ do_holes:
961 */ 979 */
962 i_size_aligned = ALIGN(i_size_read(dio->inode), 980 i_size_aligned = ALIGN(i_size_read(dio->inode),
963 1 << blkbits); 981 1 << blkbits);
964 if (dio->block_in_file >= 982 if (sdio->block_in_file >=
965 i_size_aligned >> blkbits) { 983 i_size_aligned >> blkbits) {
966 /* We hit eof */ 984 /* We hit eof */
967 page_cache_release(page); 985 page_cache_release(page);
@@ -969,7 +987,7 @@ do_holes:
969 } 987 }
970 zero_user(page, block_in_page << blkbits, 988 zero_user(page, block_in_page << blkbits,
971 1 << blkbits); 989 1 << blkbits);
972 dio->block_in_file++; 990 sdio->block_in_file++;
973 block_in_page++; 991 block_in_page++;
974 goto next_block; 992 goto next_block;
975 } 993 }
@@ -979,38 +997,41 @@ do_holes:
979 * is finer than the underlying fs, go check to see if 997 * is finer than the underlying fs, go check to see if
980 * we must zero out the start of this block. 998 * we must zero out the start of this block.
981 */ 999 */
982 if (unlikely(dio->blkfactor && !dio->start_zero_done)) 1000 if (unlikely(sdio->blkfactor && !sdio->start_zero_done))
983 dio_zero_block(dio, 0); 1001 dio_zero_block(dio, sdio, 0, map_bh);
984 1002
985 /* 1003 /*
986 * Work out, in this_chunk_blocks, how much disk we 1004 * Work out, in this_chunk_blocks, how much disk we
987 * can add to this page 1005 * can add to this page
988 */ 1006 */
989 this_chunk_blocks = dio->blocks_available; 1007 this_chunk_blocks = sdio->blocks_available;
990 u = (PAGE_SIZE - offset_in_page) >> blkbits; 1008 u = (PAGE_SIZE - offset_in_page) >> blkbits;
991 if (this_chunk_blocks > u) 1009 if (this_chunk_blocks > u)
992 this_chunk_blocks = u; 1010 this_chunk_blocks = u;
993 u = dio->final_block_in_request - dio->block_in_file; 1011 u = sdio->final_block_in_request - sdio->block_in_file;
994 if (this_chunk_blocks > u) 1012 if (this_chunk_blocks > u)
995 this_chunk_blocks = u; 1013 this_chunk_blocks = u;
996 this_chunk_bytes = this_chunk_blocks << blkbits; 1014 this_chunk_bytes = this_chunk_blocks << blkbits;
997 BUG_ON(this_chunk_bytes == 0); 1015 BUG_ON(this_chunk_bytes == 0);
998 1016
999 dio->boundary = buffer_boundary(map_bh); 1017 sdio->boundary = buffer_boundary(map_bh);
1000 ret = submit_page_section(dio, page, offset_in_page, 1018 ret = submit_page_section(dio, sdio, page,
1001 this_chunk_bytes, dio->next_block_for_io); 1019 offset_in_page,
1020 this_chunk_bytes,
1021 sdio->next_block_for_io,
1022 map_bh);
1002 if (ret) { 1023 if (ret) {
1003 page_cache_release(page); 1024 page_cache_release(page);
1004 goto out; 1025 goto out;
1005 } 1026 }
1006 dio->next_block_for_io += this_chunk_blocks; 1027 sdio->next_block_for_io += this_chunk_blocks;
1007 1028
1008 dio->block_in_file += this_chunk_blocks; 1029 sdio->block_in_file += this_chunk_blocks;
1009 block_in_page += this_chunk_blocks; 1030 block_in_page += this_chunk_blocks;
1010 dio->blocks_available -= this_chunk_blocks; 1031 sdio->blocks_available -= this_chunk_blocks;
1011next_block: 1032next_block:
1012 BUG_ON(dio->block_in_file > dio->final_block_in_request); 1033 BUG_ON(sdio->block_in_file > sdio->final_block_in_request);
1013 if (dio->block_in_file == dio->final_block_in_request) 1034 if (sdio->block_in_file == sdio->final_block_in_request)
1014 break; 1035 break;
1015 } 1036 }
1016 1037
@@ -1022,135 +1043,10 @@ out:
1022 return ret; 1043 return ret;
1023} 1044}
1024 1045
1025static ssize_t 1046static inline int drop_refcount(struct dio *dio)
1026direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1027 const struct iovec *iov, loff_t offset, unsigned long nr_segs,
1028 unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
1029 dio_submit_t submit_io, struct dio *dio)
1030{ 1047{
1031 unsigned long user_addr; 1048 int ret2;
1032 unsigned long flags; 1049 unsigned long flags;
1033 int seg;
1034 ssize_t ret = 0;
1035 ssize_t ret2;
1036 size_t bytes;
1037
1038 dio->inode = inode;
1039 dio->rw = rw;
1040 dio->blkbits = blkbits;
1041 dio->blkfactor = inode->i_blkbits - blkbits;
1042 dio->block_in_file = offset >> blkbits;
1043
1044 dio->get_block = get_block;
1045 dio->end_io = end_io;
1046 dio->submit_io = submit_io;
1047 dio->final_block_in_bio = -1;
1048 dio->next_block_for_io = -1;
1049
1050 dio->iocb = iocb;
1051 dio->i_size = i_size_read(inode);
1052
1053 spin_lock_init(&dio->bio_lock);
1054 dio->refcount = 1;
1055
1056 /*
1057 * In case of non-aligned buffers, we may need 2 more
1058 * pages since we need to zero out first and last block.
1059 */
1060 if (unlikely(dio->blkfactor))
1061 dio->pages_in_io = 2;
1062
1063 for (seg = 0; seg < nr_segs; seg++) {
1064 user_addr = (unsigned long)iov[seg].iov_base;
1065 dio->pages_in_io +=
1066 ((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE
1067 - user_addr/PAGE_SIZE);
1068 }
1069
1070 for (seg = 0; seg < nr_segs; seg++) {
1071 user_addr = (unsigned long)iov[seg].iov_base;
1072 dio->size += bytes = iov[seg].iov_len;
1073
1074 /* Index into the first page of the first block */
1075 dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
1076 dio->final_block_in_request = dio->block_in_file +
1077 (bytes >> blkbits);
1078 /* Page fetching state */
1079 dio->head = 0;
1080 dio->tail = 0;
1081 dio->curr_page = 0;
1082
1083 dio->total_pages = 0;
1084 if (user_addr & (PAGE_SIZE-1)) {
1085 dio->total_pages++;
1086 bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
1087 }
1088 dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
1089 dio->curr_user_address = user_addr;
1090
1091 ret = do_direct_IO(dio);
1092
1093 dio->result += iov[seg].iov_len -
1094 ((dio->final_block_in_request - dio->block_in_file) <<
1095 blkbits);
1096
1097 if (ret) {
1098 dio_cleanup(dio);
1099 break;
1100 }
1101 } /* end iovec loop */
1102
1103 if (ret == -ENOTBLK) {
1104 /*
1105 * The remaining part of the request will be
1106 * be handled by buffered I/O when we return
1107 */
1108 ret = 0;
1109 }
1110 /*
1111 * There may be some unwritten disk at the end of a part-written
1112 * fs-block-sized block. Go zero that now.
1113 */
1114 dio_zero_block(dio, 1);
1115
1116 if (dio->cur_page) {
1117 ret2 = dio_send_cur_page(dio);
1118 if (ret == 0)
1119 ret = ret2;
1120 page_cache_release(dio->cur_page);
1121 dio->cur_page = NULL;
1122 }
1123 if (dio->bio)
1124 dio_bio_submit(dio);
1125
1126 /*
1127 * It is possible that, we return short IO due to end of file.
1128 * In that case, we need to release all the pages we got hold on.
1129 */
1130 dio_cleanup(dio);
1131
1132 /*
1133 * All block lookups have been performed. For READ requests
1134 * we can let i_mutex go now that its achieved its purpose
1135 * of protecting us from looking up uninitialized blocks.
1136 */
1137 if (rw == READ && (dio->flags & DIO_LOCKING))
1138 mutex_unlock(&dio->inode->i_mutex);
1139
1140 /*
1141 * The only time we want to leave bios in flight is when a successful
1142 * partial aio read or full aio write have been setup. In that case
1143 * bio completion will call aio_complete. The only time it's safe to
1144 * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
1145 * This had *better* be the only place that raises -EIOCBQUEUED.
1146 */
1147 BUG_ON(ret == -EIOCBQUEUED);
1148 if (dio->is_async && ret == 0 && dio->result &&
1149 ((rw & READ) || (dio->result == dio->size)))
1150 ret = -EIOCBQUEUED;
1151
1152 if (ret != -EIOCBQUEUED)
1153 dio_await_completion(dio);
1154 1050
1155 /* 1051 /*
1156 * Sync will always be dropping the final ref and completing the 1052 * Sync will always be dropping the final ref and completing the
@@ -1166,14 +1062,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1166 spin_lock_irqsave(&dio->bio_lock, flags); 1062 spin_lock_irqsave(&dio->bio_lock, flags);
1167 ret2 = --dio->refcount; 1063 ret2 = --dio->refcount;
1168 spin_unlock_irqrestore(&dio->bio_lock, flags); 1064 spin_unlock_irqrestore(&dio->bio_lock, flags);
1169 1065 return ret2;
1170 if (ret2 == 0) {
1171 ret = dio_complete(dio, offset, ret, false);
1172 kfree(dio);
1173 } else
1174 BUG_ON(ret != -EIOCBQUEUED);
1175
1176 return ret;
1177} 1066}
1178 1067
1179/* 1068/*
@@ -1195,6 +1084,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1195 * expected that filesystem provide exclusion between new direct I/O 1084 * expected that filesystem provide exclusion between new direct I/O
1196 * and truncates. For DIO_LOCKING filesystems this is done by i_mutex, 1085 * and truncates. For DIO_LOCKING filesystems this is done by i_mutex,
1197 * but other filesystems need to take care of this on their own. 1086 * but other filesystems need to take care of this on their own.
1087 *
1088 * NOTE: if you pass "sdio" to anything by pointer make sure that function
1089 * is always inlined. Otherwise gcc is unable to split the structure into
1090 * individual fields and will generate much worse code. This is important
1091 * for the whole file.
1198 */ 1092 */
1199ssize_t 1093ssize_t
1200__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1094__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
@@ -1211,6 +1105,10 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1211 ssize_t retval = -EINVAL; 1105 ssize_t retval = -EINVAL;
1212 loff_t end = offset; 1106 loff_t end = offset;
1213 struct dio *dio; 1107 struct dio *dio;
1108 struct dio_submit sdio = { 0, };
1109 unsigned long user_addr;
1110 size_t bytes;
1111 struct buffer_head map_bh = { 0, };
1214 1112
1215 if (rw & WRITE) 1113 if (rw & WRITE)
1216 rw = WRITE_ODIRECT; 1114 rw = WRITE_ODIRECT;
@@ -1244,7 +1142,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1244 if (rw == READ && end == offset) 1142 if (rw == READ && end == offset)
1245 return 0; 1143 return 0;
1246 1144
1247 dio = kmalloc(sizeof(*dio), GFP_KERNEL); 1145 dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
1248 retval = -ENOMEM; 1146 retval = -ENOMEM;
1249 if (!dio) 1147 if (!dio)
1250 goto out; 1148 goto out;
@@ -1268,7 +1166,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1268 end - 1); 1166 end - 1);
1269 if (retval) { 1167 if (retval) {
1270 mutex_unlock(&inode->i_mutex); 1168 mutex_unlock(&inode->i_mutex);
1271 kfree(dio); 1169 kmem_cache_free(dio_cache, dio);
1272 goto out; 1170 goto out;
1273 } 1171 }
1274 } 1172 }
@@ -1288,11 +1186,141 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1288 dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && 1186 dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
1289 (end > i_size_read(inode))); 1187 (end > i_size_read(inode)));
1290 1188
1291 retval = direct_io_worker(rw, iocb, inode, iov, offset, 1189 retval = 0;
1292 nr_segs, blkbits, get_block, end_io, 1190
1293 submit_io, dio); 1191 dio->inode = inode;
1192 dio->rw = rw;
1193 sdio.blkbits = blkbits;
1194 sdio.blkfactor = inode->i_blkbits - blkbits;
1195 sdio.block_in_file = offset >> blkbits;
1196
1197 sdio.get_block = get_block;
1198 dio->end_io = end_io;
1199 sdio.submit_io = submit_io;
1200 sdio.final_block_in_bio = -1;
1201 sdio.next_block_for_io = -1;
1202
1203 dio->iocb = iocb;
1204 dio->i_size = i_size_read(inode);
1205
1206 spin_lock_init(&dio->bio_lock);
1207 dio->refcount = 1;
1208
1209 /*
1210 * In case of non-aligned buffers, we may need 2 more
1211 * pages since we need to zero out first and last block.
1212 */
1213 if (unlikely(sdio.blkfactor))
1214 sdio.pages_in_io = 2;
1215
1216 for (seg = 0; seg < nr_segs; seg++) {
1217 user_addr = (unsigned long)iov[seg].iov_base;
1218 sdio.pages_in_io +=
1219 ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
1220 PAGE_SIZE - user_addr / PAGE_SIZE);
1221 }
1222
1223 for (seg = 0; seg < nr_segs; seg++) {
1224 user_addr = (unsigned long)iov[seg].iov_base;
1225 sdio.size += bytes = iov[seg].iov_len;
1226
1227 /* Index into the first page of the first block */
1228 sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
1229 sdio.final_block_in_request = sdio.block_in_file +
1230 (bytes >> blkbits);
1231 /* Page fetching state */
1232 sdio.head = 0;
1233 sdio.tail = 0;
1234 sdio.curr_page = 0;
1235
1236 sdio.total_pages = 0;
1237 if (user_addr & (PAGE_SIZE-1)) {
1238 sdio.total_pages++;
1239 bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
1240 }
1241 sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
1242 sdio.curr_user_address = user_addr;
1243
1244 retval = do_direct_IO(dio, &sdio, &map_bh);
1245
1246 dio->result += iov[seg].iov_len -
1247 ((sdio.final_block_in_request - sdio.block_in_file) <<
1248 blkbits);
1249
1250 if (retval) {
1251 dio_cleanup(dio, &sdio);
1252 break;
1253 }
1254 } /* end iovec loop */
1255
1256 if (retval == -ENOTBLK) {
1257 /*
1258 * The remaining part of the request will be
1259 * be handled by buffered I/O when we return
1260 */
1261 retval = 0;
1262 }
1263 /*
1264 * There may be some unwritten disk at the end of a part-written
1265 * fs-block-sized block. Go zero that now.
1266 */
1267 dio_zero_block(dio, &sdio, 1, &map_bh);
1268
1269 if (sdio.cur_page) {
1270 ssize_t ret2;
1271
1272 ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
1273 if (retval == 0)
1274 retval = ret2;
1275 page_cache_release(sdio.cur_page);
1276 sdio.cur_page = NULL;
1277 }
1278 if (sdio.bio)
1279 dio_bio_submit(dio, &sdio);
1280
1281 /*
1282 * It is possible that, we return short IO due to end of file.
1283 * In that case, we need to release all the pages we got hold on.
1284 */
1285 dio_cleanup(dio, &sdio);
1286
1287 /*
1288 * All block lookups have been performed. For READ requests
1289 * we can let i_mutex go now that its achieved its purpose
1290 * of protecting us from looking up uninitialized blocks.
1291 */
1292 if (rw == READ && (dio->flags & DIO_LOCKING))
1293 mutex_unlock(&dio->inode->i_mutex);
1294
1295 /*
1296 * The only time we want to leave bios in flight is when a successful
1297 * partial aio read or full aio write have been setup. In that case
1298 * bio completion will call aio_complete. The only time it's safe to
1299 * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
1300 * This had *better* be the only place that raises -EIOCBQUEUED.
1301 */
1302 BUG_ON(retval == -EIOCBQUEUED);
1303 if (dio->is_async && retval == 0 && dio->result &&
1304 ((rw & READ) || (dio->result == sdio.size)))
1305 retval = -EIOCBQUEUED;
1306
1307 if (retval != -EIOCBQUEUED)
1308 dio_await_completion(dio);
1309
1310 if (drop_refcount(dio) == 0) {
1311 retval = dio_complete(dio, offset, retval, false);
1312 kmem_cache_free(dio_cache, dio);
1313 } else
1314 BUG_ON(retval != -EIOCBQUEUED);
1294 1315
1295out: 1316out:
1296 return retval; 1317 return retval;
1297} 1318}
1298EXPORT_SYMBOL(__blockdev_direct_IO); 1319EXPORT_SYMBOL(__blockdev_direct_IO);
1320
1321static __init int dio_init(void)
1322{
1323 dio_cache = KMEM_CACHE(dio, SLAB_PANIC);
1324 return 0;
1325}
1326module_init(dio_init)
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index b36c5572b3f..54481a3b2c7 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -514,7 +514,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt)
514 514
515#define ecryptfs_printk(type, fmt, arg...) \ 515#define ecryptfs_printk(type, fmt, arg...) \
516 __ecryptfs_printk(type "%s: " fmt, __func__, ## arg); 516 __ecryptfs_printk(type "%s: " fmt, __func__, ## arg);
517__attribute__ ((format(printf, 1, 2))) 517__printf(1, 2)
518void __ecryptfs_printk(const char *fmt, ...); 518void __ecryptfs_printk(const char *fmt, ...);
519 519
520extern const struct file_operations ecryptfs_main_fops; 520extern const struct file_operations ecryptfs_main_fops;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 11f8582d721..a36d327f152 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -474,8 +474,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
474 goto out_lock; 474 goto out_lock;
475 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); 475 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
476 fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); 476 fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode);
477 old_dentry->d_inode->i_nlink = 477 set_nlink(old_dentry->d_inode,
478 ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink; 478 ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink);
479 i_size_write(new_dentry->d_inode, file_size_save); 479 i_size_write(new_dentry->d_inode, file_size_save);
480out_lock: 480out_lock:
481 unlock_dir(lower_dir_dentry); 481 unlock_dir(lower_dir_dentry);
@@ -499,8 +499,8 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
499 goto out_unlock; 499 goto out_unlock;
500 } 500 }
501 fsstack_copy_attr_times(dir, lower_dir_inode); 501 fsstack_copy_attr_times(dir, lower_dir_inode);
502 dentry->d_inode->i_nlink = 502 set_nlink(dentry->d_inode,
503 ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink; 503 ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink);
504 dentry->d_inode->i_ctime = dir->i_ctime; 504 dentry->d_inode->i_ctime = dir->i_ctime;
505 d_drop(dentry); 505 d_drop(dentry);
506out_unlock: 506out_unlock:
@@ -565,7 +565,7 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
565 goto out; 565 goto out;
566 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); 566 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
567 fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); 567 fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode);
568 dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; 568 set_nlink(dir, lower_dir_dentry->d_inode->i_nlink);
569out: 569out:
570 unlock_dir(lower_dir_dentry); 570 unlock_dir(lower_dir_dentry);
571 if (!dentry->d_inode) 571 if (!dentry->d_inode)
@@ -588,7 +588,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
588 if (!rc && dentry->d_inode) 588 if (!rc && dentry->d_inode)
589 clear_nlink(dentry->d_inode); 589 clear_nlink(dentry->d_inode);
590 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); 590 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
591 dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; 591 set_nlink(dir, lower_dir_dentry->d_inode->i_nlink);
592 unlock_dir(lower_dir_dentry); 592 unlock_dir(lower_dir_dentry);
593 if (!rc) 593 if (!rc)
594 d_drop(dentry); 594 d_drop(dentry);
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 9c13412e6c9..bc84f365d75 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -96,7 +96,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
96 efs_inode = (struct efs_dinode *) (bh->b_data + offset); 96 efs_inode = (struct efs_dinode *) (bh->b_data + offset);
97 97
98 inode->i_mode = be16_to_cpu(efs_inode->di_mode); 98 inode->i_mode = be16_to_cpu(efs_inode->di_mode);
99 inode->i_nlink = be16_to_cpu(efs_inode->di_nlink); 99 set_nlink(inode, be16_to_cpu(efs_inode->di_nlink));
100 inode->i_uid = (uid_t)be16_to_cpu(efs_inode->di_uid); 100 inode->i_uid = (uid_t)be16_to_cpu(efs_inode->di_uid);
101 inode->i_gid = (gid_t)be16_to_cpu(efs_inode->di_gid); 101 inode->i_gid = (gid_t)be16_to_cpu(efs_inode->di_gid);
102 inode->i_size = be32_to_cpu(efs_inode->di_size); 102 inode->i_size = be32_to_cpu(efs_inode->di_size);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 9026fc91fe3..828e750af23 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -70,6 +70,15 @@
70 * simultaneous inserts (A into B and B into A) from racing and 70 * simultaneous inserts (A into B and B into A) from racing and
71 * constructing a cycle without either insert observing that it is 71 * constructing a cycle without either insert observing that it is
72 * going to. 72 * going to.
73 * It is necessary to acquire multiple "ep->mtx"es at once in the
74 * case when one epoll fd is added to another. In this case, we
75 * always acquire the locks in the order of nesting (i.e. after
76 * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
77 * before e2->mtx). Since we disallow cycles of epoll file
78 * descriptors, this ensures that the mutexes are well-ordered. In
79 * order to communicate this nesting to lockdep, when walking a tree
80 * of epoll file descriptors, we use the current recursion depth as
81 * the lockdep subkey.
73 * It is possible to drop the "ep->mtx" and to use the global 82 * It is possible to drop the "ep->mtx" and to use the global
74 * mutex "epmutex" (together with "ep->lock") to have it working, 83 * mutex "epmutex" (together with "ep->lock") to have it working,
75 * but having "ep->mtx" will make the interface more scalable. 84 * but having "ep->mtx" will make the interface more scalable.
@@ -464,13 +473,15 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
464 * @ep: Pointer to the epoll private data structure. 473 * @ep: Pointer to the epoll private data structure.
465 * @sproc: Pointer to the scan callback. 474 * @sproc: Pointer to the scan callback.
466 * @priv: Private opaque data passed to the @sproc callback. 475 * @priv: Private opaque data passed to the @sproc callback.
476 * @depth: The current depth of recursive f_op->poll calls.
467 * 477 *
468 * Returns: The same integer error code returned by the @sproc callback. 478 * Returns: The same integer error code returned by the @sproc callback.
469 */ 479 */
470static int ep_scan_ready_list(struct eventpoll *ep, 480static int ep_scan_ready_list(struct eventpoll *ep,
471 int (*sproc)(struct eventpoll *, 481 int (*sproc)(struct eventpoll *,
472 struct list_head *, void *), 482 struct list_head *, void *),
473 void *priv) 483 void *priv,
484 int depth)
474{ 485{
475 int error, pwake = 0; 486 int error, pwake = 0;
476 unsigned long flags; 487 unsigned long flags;
@@ -481,7 +492,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
481 * We need to lock this because we could be hit by 492 * We need to lock this because we could be hit by
482 * eventpoll_release_file() and epoll_ctl(). 493 * eventpoll_release_file() and epoll_ctl().
483 */ 494 */
484 mutex_lock(&ep->mtx); 495 mutex_lock_nested(&ep->mtx, depth);
485 496
486 /* 497 /*
487 * Steal the ready list, and re-init the original one to the 498 * Steal the ready list, and re-init the original one to the
@@ -670,7 +681,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
670 681
671static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) 682static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
672{ 683{
673 return ep_scan_ready_list(priv, ep_read_events_proc, NULL); 684 return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1);
674} 685}
675 686
676static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) 687static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
@@ -737,7 +748,7 @@ void eventpoll_release_file(struct file *file)
737 748
738 ep = epi->ep; 749 ep = epi->ep;
739 list_del_init(&epi->fllink); 750 list_del_init(&epi->fllink);
740 mutex_lock(&ep->mtx); 751 mutex_lock_nested(&ep->mtx, 0);
741 ep_remove(ep, epi); 752 ep_remove(ep, epi);
742 mutex_unlock(&ep->mtx); 753 mutex_unlock(&ep->mtx);
743 } 754 }
@@ -1134,7 +1145,7 @@ static int ep_send_events(struct eventpoll *ep,
1134 esed.maxevents = maxevents; 1145 esed.maxevents = maxevents;
1135 esed.events = events; 1146 esed.events = events;
1136 1147
1137 return ep_scan_ready_list(ep, ep_send_events_proc, &esed); 1148 return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
1138} 1149}
1139 1150
1140static inline struct timespec ep_set_mstimeout(long ms) 1151static inline struct timespec ep_set_mstimeout(long ms)
@@ -1267,7 +1278,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1267 struct rb_node *rbp; 1278 struct rb_node *rbp;
1268 struct epitem *epi; 1279 struct epitem *epi;
1269 1280
1270 mutex_lock(&ep->mtx); 1281 mutex_lock_nested(&ep->mtx, call_nests + 1);
1271 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { 1282 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1272 epi = rb_entry(rbp, struct epitem, rbn); 1283 epi = rb_entry(rbp, struct epitem, rbn);
1273 if (unlikely(is_file_epoll(epi->ffd.file))) { 1284 if (unlikely(is_file_epoll(epi->ffd.file))) {
@@ -1409,7 +1420,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1409 } 1420 }
1410 1421
1411 1422
1412 mutex_lock(&ep->mtx); 1423 mutex_lock_nested(&ep->mtx, 0);
1413 1424
1414 /* 1425 /*
1415 * Try to lookup the file inside our RB tree, Since we grabbed "mtx" 1426 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
diff --git a/fs/exec.c b/fs/exec.c
index 25dcbe5fc35..36254645b7c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -841,10 +841,6 @@ static int exec_mmap(struct mm_struct *mm)
841 tsk->mm = mm; 841 tsk->mm = mm;
842 tsk->active_mm = mm; 842 tsk->active_mm = mm;
843 activate_mm(active_mm, mm); 843 activate_mm(active_mm, mm);
844 if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
845 atomic_dec(&old_mm->oom_disable_count);
846 atomic_inc(&tsk->mm->oom_disable_count);
847 }
848 task_unlock(tsk); 844 task_unlock(tsk);
849 arch_pick_mmap_layout(mm); 845 arch_pick_mmap_layout(mm);
850 if (old_mm) { 846 if (old_mm) {
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index c5a5855a6c4..352ba149d23 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -13,7 +13,8 @@
13# 13#
14 14
15# ore module library 15# ore module library
16obj-$(CONFIG_ORE) += ore.o 16libore-y := ore.o ore_raid.o
17obj-$(CONFIG_ORE) += libore.o
17 18
18exofs-y := inode.o file.o symlink.o namei.o dir.o super.o 19exofs-y := inode.o file.o symlink.o namei.o dir.o super.o
19obj-$(CONFIG_EXOFS_FS) += exofs.o 20obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
index 70bae414929..da42f32c49b 100644
--- a/fs/exofs/Kconfig
+++ b/fs/exofs/Kconfig
@@ -1,10 +1,17 @@
1# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects
2# for every ORE user we do it like this. Any user should add itself here
3# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are
4# selected here, and we default to "ON". So in effect it is like been
5# selected by any of the users.
1config ORE 6config ORE
2 tristate 7 tristate
8 depends on EXOFS_FS || PNFS_OBJLAYOUT
9 select ASYNC_XOR
10 default SCSI_OSD_ULD
3 11
4config EXOFS_FS 12config EXOFS_FS
5 tristate "exofs: OSD based file system support" 13 tristate "exofs: OSD based file system support"
6 depends on SCSI_OSD_ULD 14 depends on SCSI_OSD_ULD
7 select ORE
8 help 15 help
9 EXOFS is a file system that uses an OSD storage device, 16 EXOFS is a file system that uses an OSD storage device,
10 as its backing storage. 17 as its backing storage.
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index f4e442ec744..51f4b4c40f0 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -53,6 +53,10 @@
53/* u64 has problems with printk this will cast it to unsigned long long */ 53/* u64 has problems with printk this will cast it to unsigned long long */
54#define _LLU(x) (unsigned long long)(x) 54#define _LLU(x) (unsigned long long)(x)
55 55
56struct exofs_dev {
57 struct ore_dev ored;
58 unsigned did;
59};
56/* 60/*
57 * our extension to the in-memory superblock 61 * our extension to the in-memory superblock
58 */ 62 */
@@ -66,13 +70,9 @@ struct exofs_sb_info {
66 u32 s_next_generation; /* next gen # to use */ 70 u32 s_next_generation; /* next gen # to use */
67 atomic_t s_curr_pending; /* number of pending commands */ 71 atomic_t s_curr_pending; /* number of pending commands */
68 72
69 struct pnfs_osd_data_map data_map; /* Default raid to use
70 * FIXME: Needed ?
71 */
72 struct ore_layout layout; /* Default files layout */ 73 struct ore_layout layout; /* Default files layout */
73 struct ore_comp one_comp; /* id & cred of partition id=0*/ 74 struct ore_comp one_comp; /* id & cred of partition id=0*/
74 struct ore_components comps; /* comps for the partition */ 75 struct ore_components oc; /* comps for the partition */
75 struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */
76}; 76};
77 77
78/* 78/*
@@ -86,7 +86,7 @@ struct exofs_i_info {
86 uint32_t i_dir_start_lookup; /* which page to start lookup */ 86 uint32_t i_dir_start_lookup; /* which page to start lookup */
87 uint64_t i_commit_size; /* the object's written length */ 87 uint64_t i_commit_size; /* the object's written length */
88 struct ore_comp one_comp; /* same component for all devices */ 88 struct ore_comp one_comp; /* same component for all devices */
89 struct ore_components comps; /* inode view of the device table */ 89 struct ore_components oc; /* inode view of the device table */
90}; 90};
91 91
92static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) 92static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
@@ -207,7 +207,7 @@ extern const struct inode_operations exofs_fast_symlink_inode_operations;
207 * bigger and that the device table repeats twice. 207 * bigger and that the device table repeats twice.
208 * See: exofs_read_lookup_dev_table() 208 * See: exofs_read_lookup_dev_table()
209 */ 209 */
210static inline void exofs_init_comps(struct ore_components *comps, 210static inline void exofs_init_comps(struct ore_components *oc,
211 struct ore_comp *one_comp, 211 struct ore_comp *one_comp,
212 struct exofs_sb_info *sbi, osd_id oid) 212 struct exofs_sb_info *sbi, osd_id oid)
213{ 213{
@@ -217,13 +217,15 @@ static inline void exofs_init_comps(struct ore_components *comps,
217 one_comp->obj.id = oid; 217 one_comp->obj.id = oid;
218 exofs_make_credential(one_comp->cred, &one_comp->obj); 218 exofs_make_credential(one_comp->cred, &one_comp->obj);
219 219
220 comps->numdevs = sbi->comps.numdevs; 220 oc->first_dev = 0;
221 comps->single_comp = EC_SINGLE_COMP; 221 oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 *
222 comps->comps = one_comp; 222 sbi->layout.group_count;
223 oc->single_comp = EC_SINGLE_COMP;
224 oc->comps = one_comp;
223 225
224 /* Round robin device view of the table */ 226 /* Round robin device view of the table */
225 first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->comps.numdevs; 227 first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs;
226 comps->ods = sbi->comps.ods + first_dev; 228 oc->ods = &sbi->oc.ods[first_dev];
227} 229}
228 230
229#endif 231#endif
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index f39a38fc234..f6dbf7768ce 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -37,11 +37,7 @@
37 37
38#define EXOFS_DBGMSG2(M...) do {} while (0) 38#define EXOFS_DBGMSG2(M...) do {} while (0)
39 39
40enum { BIO_MAX_PAGES_KMALLOC = 40enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), };
41 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
42 MAX_PAGES_KMALLOC =
43 PAGE_SIZE / sizeof(struct page *),
44};
45 41
46unsigned exofs_max_io_pages(struct ore_layout *layout, 42unsigned exofs_max_io_pages(struct ore_layout *layout,
47 unsigned expected_pages) 43 unsigned expected_pages)
@@ -49,8 +45,7 @@ unsigned exofs_max_io_pages(struct ore_layout *layout,
49 unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); 45 unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC);
50 46
51 /* TODO: easily support bio chaining */ 47 /* TODO: easily support bio chaining */
52 pages = min_t(unsigned, pages, 48 pages = min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE);
53 layout->group_width * BIO_MAX_PAGES_KMALLOC);
54 return pages; 49 return pages;
55} 50}
56 51
@@ -68,6 +63,7 @@ struct page_collect {
68 bool read_4_write; /* This means two things: that the read is sync 63 bool read_4_write; /* This means two things: that the read is sync
69 * And the pages should not be unlocked. 64 * And the pages should not be unlocked.
70 */ 65 */
66 struct page *that_locked_page;
71}; 67};
72 68
73static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, 69static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
@@ -86,6 +82,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
86 pcol->length = 0; 82 pcol->length = 0;
87 pcol->pg_first = -1; 83 pcol->pg_first = -1;
88 pcol->read_4_write = false; 84 pcol->read_4_write = false;
85 pcol->that_locked_page = NULL;
89} 86}
90 87
91static void _pcol_reset(struct page_collect *pcol) 88static void _pcol_reset(struct page_collect *pcol)
@@ -98,6 +95,7 @@ static void _pcol_reset(struct page_collect *pcol)
98 pcol->length = 0; 95 pcol->length = 0;
99 pcol->pg_first = -1; 96 pcol->pg_first = -1;
100 pcol->ios = NULL; 97 pcol->ios = NULL;
98 pcol->that_locked_page = NULL;
101 99
102 /* this is probably the end of the loop but in writes 100 /* this is probably the end of the loop but in writes
103 * it might not end here. don't be left with nothing 101 * it might not end here. don't be left with nothing
@@ -149,14 +147,17 @@ static int pcol_add_page(struct page_collect *pcol, struct page *page,
149 return 0; 147 return 0;
150} 148}
151 149
150enum {PAGE_WAS_NOT_IN_IO = 17};
152static int update_read_page(struct page *page, int ret) 151static int update_read_page(struct page *page, int ret)
153{ 152{
154 if (ret == 0) { 153 switch (ret) {
154 case 0:
155 /* Everything is OK */ 155 /* Everything is OK */
156 SetPageUptodate(page); 156 SetPageUptodate(page);
157 if (PageError(page)) 157 if (PageError(page))
158 ClearPageError(page); 158 ClearPageError(page);
159 } else if (ret == -EFAULT) { 159 break;
160 case -EFAULT:
160 /* In this case we were trying to read something that wasn't on 161 /* In this case we were trying to read something that wasn't on
161 * disk yet - return a page full of zeroes. This should be OK, 162 * disk yet - return a page full of zeroes. This should be OK,
162 * because the object should be empty (if there was a write 163 * because the object should be empty (if there was a write
@@ -167,16 +168,22 @@ static int update_read_page(struct page *page, int ret)
167 SetPageUptodate(page); 168 SetPageUptodate(page);
168 if (PageError(page)) 169 if (PageError(page))
169 ClearPageError(page); 170 ClearPageError(page);
170 ret = 0; /* recovered error */
171 EXOFS_DBGMSG("recovered read error\n"); 171 EXOFS_DBGMSG("recovered read error\n");
172 } else /* Error */ 172 /* fall through */
173 case PAGE_WAS_NOT_IN_IO:
174 ret = 0; /* recovered error */
175 break;
176 default:
173 SetPageError(page); 177 SetPageError(page);
174 178 }
175 return ret; 179 return ret;
176} 180}
177 181
178static void update_write_page(struct page *page, int ret) 182static void update_write_page(struct page *page, int ret)
179{ 183{
184 if (unlikely(ret == PAGE_WAS_NOT_IN_IO))
185 return; /* don't pass start don't collect $200 */
186
180 if (ret) { 187 if (ret) {
181 mapping_set_error(page->mapping, ret); 188 mapping_set_error(page->mapping, ret);
182 SetPageError(page); 189 SetPageError(page);
@@ -190,15 +197,16 @@ static void update_write_page(struct page *page, int ret)
190static int __readpages_done(struct page_collect *pcol) 197static int __readpages_done(struct page_collect *pcol)
191{ 198{
192 int i; 199 int i;
193 u64 resid;
194 u64 good_bytes; 200 u64 good_bytes;
195 u64 length = 0; 201 u64 length = 0;
196 int ret = ore_check_io(pcol->ios, &resid); 202 int ret = ore_check_io(pcol->ios, NULL);
197 203
198 if (likely(!ret)) 204 if (likely(!ret)) {
199 good_bytes = pcol->length; 205 good_bytes = pcol->length;
200 else 206 ret = PAGE_WAS_NOT_IN_IO;
201 good_bytes = pcol->length - resid; 207 } else {
208 good_bytes = 0;
209 }
202 210
203 EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" 211 EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
204 " length=0x%lx nr_pages=%u\n", 212 " length=0x%lx nr_pages=%u\n",
@@ -259,6 +267,46 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
259 } 267 }
260} 268}
261 269
270static int _maybe_not_all_in_one_io(struct ore_io_state *ios,
271 struct page_collect *pcol_src, struct page_collect *pcol)
272{
273 /* length was wrong or offset was not page aligned */
274 BUG_ON(pcol_src->nr_pages < ios->nr_pages);
275
276 if (pcol_src->nr_pages > ios->nr_pages) {
277 struct page **src_page;
278 unsigned pages_less = pcol_src->nr_pages - ios->nr_pages;
279 unsigned long len_less = pcol_src->length - ios->length;
280 unsigned i;
281 int ret;
282
283 /* This IO was trimmed */
284 pcol_src->nr_pages = ios->nr_pages;
285 pcol_src->length = ios->length;
286
287 /* Left over pages are passed to the next io */
288 pcol->expected_pages += pages_less;
289 pcol->nr_pages = pages_less;
290 pcol->length = len_less;
291 src_page = pcol_src->pages + pcol_src->nr_pages;
292 pcol->pg_first = (*src_page)->index;
293
294 ret = pcol_try_alloc(pcol);
295 if (unlikely(ret))
296 return ret;
297
298 for (i = 0; i < pages_less; ++i)
299 pcol->pages[i] = *src_page++;
300
301 EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x "
302 "pages_less=0x%x expected_pages=0x%x "
303 "next_offset=0x%llx next_len=0x%lx\n",
304 pcol_src->nr_pages, pages_less, pcol->expected_pages,
305 pcol->pg_first * PAGE_SIZE, pcol->length);
306 }
307 return 0;
308}
309
262static int read_exec(struct page_collect *pcol) 310static int read_exec(struct page_collect *pcol)
263{ 311{
264 struct exofs_i_info *oi = exofs_i(pcol->inode); 312 struct exofs_i_info *oi = exofs_i(pcol->inode);
@@ -270,7 +318,7 @@ static int read_exec(struct page_collect *pcol)
270 return 0; 318 return 0;
271 319
272 if (!pcol->ios) { 320 if (!pcol->ios) {
273 int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, true, 321 int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true,
274 pcol->pg_first << PAGE_CACHE_SHIFT, 322 pcol->pg_first << PAGE_CACHE_SHIFT,
275 pcol->length, &pcol->ios); 323 pcol->length, &pcol->ios);
276 324
@@ -280,7 +328,6 @@ static int read_exec(struct page_collect *pcol)
280 328
281 ios = pcol->ios; 329 ios = pcol->ios;
282 ios->pages = pcol->pages; 330 ios->pages = pcol->pages;
283 ios->nr_pages = pcol->nr_pages;
284 331
285 if (pcol->read_4_write) { 332 if (pcol->read_4_write) {
286 ore_read(pcol->ios); 333 ore_read(pcol->ios);
@@ -296,17 +343,23 @@ static int read_exec(struct page_collect *pcol)
296 *pcol_copy = *pcol; 343 *pcol_copy = *pcol;
297 ios->done = readpages_done; 344 ios->done = readpages_done;
298 ios->private = pcol_copy; 345 ios->private = pcol_copy;
346
347 /* pages ownership was passed to pcol_copy */
348 _pcol_reset(pcol);
349
350 ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
351 if (unlikely(ret))
352 goto err;
353
354 EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n",
355 pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
356
299 ret = ore_read(ios); 357 ret = ore_read(ios);
300 if (unlikely(ret)) 358 if (unlikely(ret))
301 goto err; 359 goto err;
302 360
303 atomic_inc(&pcol->sbi->s_curr_pending); 361 atomic_inc(&pcol->sbi->s_curr_pending);
304 362
305 EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
306 oi->one_comp.obj.id, _LLU(ios->offset), pcol->length);
307
308 /* pages ownership was passed to pcol_copy */
309 _pcol_reset(pcol);
310 return 0; 363 return 0;
311 364
312err: 365err:
@@ -341,6 +394,8 @@ static int readpage_strip(void *data, struct page *page)
341 EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, 394 EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
342 page->index); 395 page->index);
343 396
397 pcol->that_locked_page = page;
398
344 if (page->index < end_index) 399 if (page->index < end_index)
345 len = PAGE_CACHE_SIZE; 400 len = PAGE_CACHE_SIZE;
346 else if (page->index == end_index) 401 else if (page->index == end_index)
@@ -429,6 +484,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping,
429 return ret; 484 return ret;
430 } 485 }
431 486
487 ret = read_exec(&pcol);
488 if (unlikely(ret))
489 return ret;
490
432 return read_exec(&pcol); 491 return read_exec(&pcol);
433} 492}
434 493
@@ -462,17 +521,18 @@ static void writepages_done(struct ore_io_state *ios, void *p)
462{ 521{
463 struct page_collect *pcol = p; 522 struct page_collect *pcol = p;
464 int i; 523 int i;
465 u64 resid;
466 u64 good_bytes; 524 u64 good_bytes;
467 u64 length = 0; 525 u64 length = 0;
468 int ret = ore_check_io(ios, &resid); 526 int ret = ore_check_io(ios, NULL);
469 527
470 atomic_dec(&pcol->sbi->s_curr_pending); 528 atomic_dec(&pcol->sbi->s_curr_pending);
471 529
472 if (likely(!ret)) 530 if (likely(!ret)) {
473 good_bytes = pcol->length; 531 good_bytes = pcol->length;
474 else 532 ret = PAGE_WAS_NOT_IN_IO;
475 good_bytes = pcol->length - resid; 533 } else {
534 good_bytes = 0;
535 }
476 536
477 EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" 537 EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
478 " length=0x%lx nr_pages=%u\n", 538 " length=0x%lx nr_pages=%u\n",
@@ -505,6 +565,56 @@ static void writepages_done(struct ore_io_state *ios, void *p)
505 EXOFS_DBGMSG2("writepages_done END\n"); 565 EXOFS_DBGMSG2("writepages_done END\n");
506} 566}
507 567
568static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
569{
570 struct page_collect *pcol = priv;
571 pgoff_t index = offset / PAGE_SIZE;
572
573 if (!pcol->that_locked_page ||
574 (pcol->that_locked_page->index != index)) {
575 struct page *page = find_get_page(pcol->inode->i_mapping, index);
576
577 if (!page) {
578 page = find_or_create_page(pcol->inode->i_mapping,
579 index, GFP_NOFS);
580 if (unlikely(!page)) {
581 EXOFS_DBGMSG("grab_cache_page Failed "
582 "index=0x%llx\n", _LLU(index));
583 return NULL;
584 }
585 unlock_page(page);
586 }
587 if (PageDirty(page) || PageWriteback(page))
588 *uptodate = true;
589 else
590 *uptodate = PageUptodate(page);
591 EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate);
592 return page;
593 } else {
594 EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n",
595 pcol->that_locked_page->index);
596 *uptodate = true;
597 return pcol->that_locked_page;
598 }
599}
600
601static void __r4w_put_page(void *priv, struct page *page)
602{
603 struct page_collect *pcol = priv;
604
605 if (pcol->that_locked_page != page) {
606 EXOFS_DBGMSG("index=0x%lx\n", page->index);
607 page_cache_release(page);
608 return;
609 }
610 EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index);
611}
612
613static const struct _ore_r4w_op _r4w_op = {
614 .get_page = &__r4w_get_page,
615 .put_page = &__r4w_put_page,
616};
617
508static int write_exec(struct page_collect *pcol) 618static int write_exec(struct page_collect *pcol)
509{ 619{
510 struct exofs_i_info *oi = exofs_i(pcol->inode); 620 struct exofs_i_info *oi = exofs_i(pcol->inode);
@@ -516,10 +626,9 @@ static int write_exec(struct page_collect *pcol)
516 return 0; 626 return 0;
517 627
518 BUG_ON(pcol->ios); 628 BUG_ON(pcol->ios);
519 ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, false, 629 ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
520 pcol->pg_first << PAGE_CACHE_SHIFT, 630 pcol->pg_first << PAGE_CACHE_SHIFT,
521 pcol->length, &pcol->ios); 631 pcol->length, &pcol->ios);
522
523 if (unlikely(ret)) 632 if (unlikely(ret))
524 goto err; 633 goto err;
525 634
@@ -534,10 +643,20 @@ static int write_exec(struct page_collect *pcol)
534 643
535 ios = pcol->ios; 644 ios = pcol->ios;
536 ios->pages = pcol_copy->pages; 645 ios->pages = pcol_copy->pages;
537 ios->nr_pages = pcol_copy->nr_pages;
538 ios->done = writepages_done; 646 ios->done = writepages_done;
647 ios->r4w = &_r4w_op;
539 ios->private = pcol_copy; 648 ios->private = pcol_copy;
540 649
650 /* pages ownership was passed to pcol_copy */
651 _pcol_reset(pcol);
652
653 ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
654 if (unlikely(ret))
655 goto err;
656
657 EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n",
658 pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
659
541 ret = ore_write(ios); 660 ret = ore_write(ios);
542 if (unlikely(ret)) { 661 if (unlikely(ret)) {
543 EXOFS_ERR("write_exec: ore_write() Failed\n"); 662 EXOFS_ERR("write_exec: ore_write() Failed\n");
@@ -545,11 +664,6 @@ static int write_exec(struct page_collect *pcol)
545 } 664 }
546 665
547 atomic_inc(&pcol->sbi->s_curr_pending); 666 atomic_inc(&pcol->sbi->s_curr_pending);
548 EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
549 pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
550 pcol->length);
551 /* pages ownership was passed to pcol_copy */
552 _pcol_reset(pcol);
553 return 0; 667 return 0;
554 668
555err: 669err:
@@ -689,14 +803,33 @@ static int exofs_writepages(struct address_space *mapping,
689 _pcol_init(&pcol, expected_pages, mapping->host); 803 _pcol_init(&pcol, expected_pages, mapping->host);
690 804
691 ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); 805 ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
692 if (ret) { 806 if (unlikely(ret)) {
693 EXOFS_ERR("write_cache_pages => %d\n", ret); 807 EXOFS_ERR("write_cache_pages => %d\n", ret);
694 return ret; 808 return ret;
695 } 809 }
696 810
697 return write_exec(&pcol); 811 ret = write_exec(&pcol);
812 if (unlikely(ret))
813 return ret;
814
815 if (wbc->sync_mode == WB_SYNC_ALL) {
816 return write_exec(&pcol); /* pump the last reminder */
817 } else if (pcol.nr_pages) {
818 /* not SYNC let the reminder join the next writeout */
819 unsigned i;
820
821 for (i = 0; i < pcol.nr_pages; i++) {
822 struct page *page = pcol.pages[i];
823
824 end_page_writeback(page);
825 set_page_dirty(page);
826 unlock_page(page);
827 }
828 }
829 return 0;
698} 830}
699 831
832/*
700static int exofs_writepage(struct page *page, struct writeback_control *wbc) 833static int exofs_writepage(struct page *page, struct writeback_control *wbc)
701{ 834{
702 struct page_collect pcol; 835 struct page_collect pcol;
@@ -712,7 +845,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc)
712 845
713 return write_exec(&pcol); 846 return write_exec(&pcol);
714} 847}
715 848*/
716/* i_mutex held using inode->i_size directly */ 849/* i_mutex held using inode->i_size directly */
717static void _write_failed(struct inode *inode, loff_t to) 850static void _write_failed(struct inode *inode, loff_t to)
718{ 851{
@@ -818,7 +951,7 @@ static void exofs_invalidatepage(struct page *page, unsigned long offset)
818const struct address_space_operations exofs_aops = { 951const struct address_space_operations exofs_aops = {
819 .readpage = exofs_readpage, 952 .readpage = exofs_readpage,
820 .readpages = exofs_readpages, 953 .readpages = exofs_readpages,
821 .writepage = exofs_writepage, 954 .writepage = NULL,
822 .writepages = exofs_writepages, 955 .writepages = exofs_writepages,
823 .write_begin = exofs_write_begin_export, 956 .write_begin = exofs_write_begin_export,
824 .write_end = exofs_write_end, 957 .write_end = exofs_write_end,
@@ -860,7 +993,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize)
860 993
861 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 994 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
862 995
863 ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize); 996 ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize);
864 if (likely(!ret)) 997 if (likely(!ret))
865 truncate_setsize(inode, newsize); 998 truncate_setsize(inode, newsize);
866 999
@@ -927,14 +1060,14 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
927 struct exofs_on_disk_inode_layout *layout; 1060 struct exofs_on_disk_inode_layout *layout;
928 int ret; 1061 int ret;
929 1062
930 ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); 1063 ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
931 if (unlikely(ret)) { 1064 if (unlikely(ret)) {
932 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); 1065 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
933 return ret; 1066 return ret;
934 } 1067 }
935 1068
936 attrs[1].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); 1069 attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
937 attrs[2].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); 1070 attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
938 1071
939 ios->in_attr = attrs; 1072 ios->in_attr = attrs;
940 ios->in_attr_len = ARRAY_SIZE(attrs); 1073 ios->in_attr_len = ARRAY_SIZE(attrs);
@@ -1018,7 +1151,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
1018 return inode; 1151 return inode;
1019 oi = exofs_i(inode); 1152 oi = exofs_i(inode);
1020 __oi_init(oi); 1153 __oi_init(oi);
1021 exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, 1154 exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
1022 exofs_oi_objno(oi)); 1155 exofs_oi_objno(oi));
1023 1156
1024 /* read the inode from the osd */ 1157 /* read the inode from the osd */
@@ -1032,7 +1165,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
1032 inode->i_mode = le16_to_cpu(fcb.i_mode); 1165 inode->i_mode = le16_to_cpu(fcb.i_mode);
1033 inode->i_uid = le32_to_cpu(fcb.i_uid); 1166 inode->i_uid = le32_to_cpu(fcb.i_uid);
1034 inode->i_gid = le32_to_cpu(fcb.i_gid); 1167 inode->i_gid = le32_to_cpu(fcb.i_gid);
1035 inode->i_nlink = le16_to_cpu(fcb.i_links_count); 1168 set_nlink(inode, le16_to_cpu(fcb.i_links_count));
1036 inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime); 1169 inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
1037 inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime); 1170 inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
1038 inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime); 1171 inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
@@ -1172,13 +1305,13 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1172 spin_unlock(&sbi->s_next_gen_lock); 1305 spin_unlock(&sbi->s_next_gen_lock);
1173 insert_inode_hash(inode); 1306 insert_inode_hash(inode);
1174 1307
1175 exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, 1308 exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
1176 exofs_oi_objno(oi)); 1309 exofs_oi_objno(oi));
1177 exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ 1310 exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
1178 1311
1179 mark_inode_dirty(inode); 1312 mark_inode_dirty(inode);
1180 1313
1181 ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); 1314 ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
1182 if (unlikely(ret)) { 1315 if (unlikely(ret)) {
1183 EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n"); 1316 EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n");
1184 return ERR_PTR(ret); 1317 return ERR_PTR(ret);
@@ -1267,7 +1400,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1267 } else 1400 } else
1268 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); 1401 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
1269 1402
1270 ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); 1403 ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
1271 if (unlikely(ret)) { 1404 if (unlikely(ret)) {
1272 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); 1405 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
1273 goto free_args; 1406 goto free_args;
@@ -1350,7 +1483,7 @@ void exofs_evict_inode(struct inode *inode)
1350 /* ignore the error, attempt a remove anyway */ 1483 /* ignore the error, attempt a remove anyway */
1351 1484
1352 /* Now Remove the OSD objects */ 1485 /* Now Remove the OSD objects */
1353 ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); 1486 ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
1354 if (unlikely(ret)) { 1487 if (unlikely(ret)) {
1355 EXOFS_ERR("%s: ore_get_io_state failed\n", __func__); 1488 EXOFS_ERR("%s: ore_get_io_state failed\n", __func__);
1356 return; 1489 return;
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 25305af8819..d271ad83720 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -23,77 +23,289 @@
23 */ 23 */
24 24
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/module.h>
26#include <asm/div64.h> 27#include <asm/div64.h>
28#include <linux/lcm.h>
27 29
28#include <scsi/osd_ore.h> 30#include "ore_raid.h"
29 31
30#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) 32MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
33MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
34MODULE_LICENSE("GPL");
35
36/* ore_verify_layout does a couple of things:
37 * 1. Given a minimum number of needed parameters fixes up the rest of the
38 * members to be operatonals for the ore. The needed parameters are those
39 * that are defined by the pnfs-objects layout STD.
40 * 2. Check to see if the current ore code actually supports these parameters
41 * for example stripe_unit must be a multple of the system PAGE_SIZE,
42 * and etc...
43 * 3. Cache some havily used calculations that will be needed by users.
44 */
45
46enum { BIO_MAX_PAGES_KMALLOC =
47 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),};
31 48
32#ifdef CONFIG_EXOFS_DEBUG 49int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
33#define ORE_DBGMSG(fmt, a...) \ 50{
34 printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) 51 u64 stripe_length;
35#else 52
36#define ORE_DBGMSG(fmt, a...) \ 53 switch (layout->raid_algorithm) {
37 do { if (0) printk(fmt, ##a); } while (0) 54 case PNFS_OSD_RAID_0:
38#endif 55 layout->parity = 0;
56 break;
57 case PNFS_OSD_RAID_5:
58 layout->parity = 1;
59 break;
60 case PNFS_OSD_RAID_PQ:
61 case PNFS_OSD_RAID_4:
62 default:
63 ORE_ERR("Only RAID_0/5 for now\n");
64 return -EINVAL;
65 }
66 if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
67 ORE_ERR("Stripe Unit(0x%llx)"
68 " must be Multples of PAGE_SIZE(0x%lx)\n",
69 _LLU(layout->stripe_unit), PAGE_SIZE);
70 return -EINVAL;
71 }
72 if (layout->group_width) {
73 if (!layout->group_depth) {
74 ORE_ERR("group_depth == 0 && group_width != 0\n");
75 return -EINVAL;
76 }
77 if (total_comps < (layout->group_width * layout->mirrors_p1)) {
78 ORE_ERR("Data Map wrong, "
79 "numdevs=%d < group_width=%d * mirrors=%d\n",
80 total_comps, layout->group_width,
81 layout->mirrors_p1);
82 return -EINVAL;
83 }
84 layout->group_count = total_comps / layout->mirrors_p1 /
85 layout->group_width;
86 } else {
87 if (layout->group_depth) {
88 printk(KERN_NOTICE "Warning: group_depth ignored "
89 "group_width == 0 && group_depth == %lld\n",
90 _LLU(layout->group_depth));
91 }
92 layout->group_width = total_comps / layout->mirrors_p1;
93 layout->group_depth = -1;
94 layout->group_count = 1;
95 }
39 96
40/* u64 has problems with printk this will cast it to unsigned long long */ 97 stripe_length = (u64)layout->group_width * layout->stripe_unit;
41#define _LLU(x) (unsigned long long)(x) 98 if (stripe_length >= (1ULL << 32)) {
99 ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n",
100 _LLU(stripe_length));
101 return -EINVAL;
102 }
42 103
43#define ORE_DBGMSG2(M...) do {} while (0) 104 layout->max_io_length =
44/* #define ORE_DBGMSG2 ORE_DBGMSG */ 105 (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
106 layout->group_width;
107 if (layout->parity) {
108 unsigned stripe_length =
109 (layout->group_width - layout->parity) *
110 layout->stripe_unit;
45 111
46MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); 112 layout->max_io_length /= stripe_length;
47MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); 113 layout->max_io_length *= stripe_length;
48MODULE_LICENSE("GPL"); 114 }
115 return 0;
116}
117EXPORT_SYMBOL(ore_verify_layout);
49 118
50static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) 119static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
51{ 120{
52 return ios->comps->comps[index & ios->comps->single_comp].cred; 121 return ios->oc->comps[index & ios->oc->single_comp].cred;
53} 122}
54 123
55static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) 124static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)
56{ 125{
57 return &ios->comps->comps[index & ios->comps->single_comp].obj; 126 return &ios->oc->comps[index & ios->oc->single_comp].obj;
58} 127}
59 128
60static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) 129static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
61{ 130{
62 return ios->comps->ods[index]; 131 ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n",
132 ios->oc->first_dev, ios->oc->numdevs, index,
133 ios->oc->ods);
134
135 return ore_comp_dev(ios->oc, index);
63} 136}
64 137
65int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, 138int _ore_get_io_state(struct ore_layout *layout,
139 struct ore_components *oc, unsigned numdevs,
140 unsigned sgs_per_dev, unsigned num_par_pages,
141 struct ore_io_state **pios)
142{
143 struct ore_io_state *ios;
144 struct page **pages;
145 struct osd_sg_entry *sgilist;
146 struct __alloc_all_io_state {
147 struct ore_io_state ios;
148 struct ore_per_dev_state per_dev[numdevs];
149 union {
150 struct osd_sg_entry sglist[sgs_per_dev * numdevs];
151 struct page *pages[num_par_pages];
152 };
153 } *_aios;
154
155 if (likely(sizeof(*_aios) <= PAGE_SIZE)) {
156 _aios = kzalloc(sizeof(*_aios), GFP_KERNEL);
157 if (unlikely(!_aios)) {
158 ORE_DBGMSG("Failed kzalloc bytes=%zd\n",
159 sizeof(*_aios));
160 *pios = NULL;
161 return -ENOMEM;
162 }
163 pages = num_par_pages ? _aios->pages : NULL;
164 sgilist = sgs_per_dev ? _aios->sglist : NULL;
165 ios = &_aios->ios;
166 } else {
167 struct __alloc_small_io_state {
168 struct ore_io_state ios;
169 struct ore_per_dev_state per_dev[numdevs];
170 } *_aio_small;
171 union __extra_part {
172 struct osd_sg_entry sglist[sgs_per_dev * numdevs];
173 struct page *pages[num_par_pages];
174 } *extra_part;
175
176 _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL);
177 if (unlikely(!_aio_small)) {
178 ORE_DBGMSG("Failed alloc first part bytes=%zd\n",
179 sizeof(*_aio_small));
180 *pios = NULL;
181 return -ENOMEM;
182 }
183 extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL);
184 if (unlikely(!extra_part)) {
185 ORE_DBGMSG("Failed alloc second part bytes=%zd\n",
186 sizeof(*extra_part));
187 kfree(_aio_small);
188 *pios = NULL;
189 return -ENOMEM;
190 }
191
192 pages = num_par_pages ? extra_part->pages : NULL;
193 sgilist = sgs_per_dev ? extra_part->sglist : NULL;
194 /* In this case the per_dev[0].sgilist holds the pointer to
195 * be freed
196 */
197 ios = &_aio_small->ios;
198 ios->extra_part_alloc = true;
199 }
200
201 if (pages) {
202 ios->parity_pages = pages;
203 ios->max_par_pages = num_par_pages;
204 }
205 if (sgilist) {
206 unsigned d;
207
208 for (d = 0; d < numdevs; ++d) {
209 ios->per_dev[d].sglist = sgilist;
210 sgilist += sgs_per_dev;
211 }
212 ios->sgs_per_dev = sgs_per_dev;
213 }
214
215 ios->layout = layout;
216 ios->oc = oc;
217 *pios = ios;
218 return 0;
219}
220
221/* Allocate an io_state for only a single group of devices
222 *
223 * If a user needs to call ore_read/write() this version must be used becase it
224 * allocates extra stuff for striping and raid.
225 * The ore might decide to only IO less then @length bytes do to alignmets
226 * and constrains as follows:
227 * - The IO cannot cross group boundary.
228 * - In raid5/6 The end of the IO must align at end of a stripe eg.
229 * (@offset + @length) % strip_size == 0. Or the complete range is within a
230 * single stripe.
231 * - Memory condition only permitted a shorter IO. (A user can use @length=~0
232 * And check the returned ios->length for max_io_size.)
233 *
234 * The caller must check returned ios->length (and/or ios->nr_pages) and
235 * re-issue these pages that fall outside of ios->length
236 */
237int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
66 bool is_reading, u64 offset, u64 length, 238 bool is_reading, u64 offset, u64 length,
67 struct ore_io_state **pios) 239 struct ore_io_state **pios)
68{ 240{
69 struct ore_io_state *ios; 241 struct ore_io_state *ios;
242 unsigned numdevs = layout->group_width * layout->mirrors_p1;
243 unsigned sgs_per_dev = 0, max_par_pages = 0;
244 int ret;
70 245
71 /*TODO: Maybe use kmem_cach per sbi of size 246 if (layout->parity && length) {
72 * exofs_io_state_size(layout->s_numdevs) 247 unsigned data_devs = layout->group_width - layout->parity;
73 */ 248 unsigned stripe_size = layout->stripe_unit * data_devs;
74 ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL); 249 unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
75 if (unlikely(!ios)) { 250 u32 remainder;
76 ORE_DBGMSG("Failed kzalloc bytes=%d\n", 251 u64 num_stripes;
77 ore_io_state_size(comps->numdevs)); 252 u64 num_raid_units;
78 *pios = NULL; 253
79 return -ENOMEM; 254 num_stripes = div_u64_rem(length, stripe_size, &remainder);
255 if (remainder)
256 ++num_stripes;
257
258 num_raid_units = num_stripes * layout->parity;
259
260 if (is_reading) {
261 /* For reads add per_dev sglist array */
262 /* TODO: Raid 6 we need twice more. Actually:
263 * num_stripes / LCMdP(W,P);
264 * if (W%P != 0) num_stripes *= parity;
265 */
266
267 /* first/last seg is split */
268 num_raid_units += layout->group_width;
269 sgs_per_dev = div_u64(num_raid_units, data_devs);
270 } else {
271 /* For Writes add parity pages array. */
272 max_par_pages = num_raid_units * pages_in_unit *
273 sizeof(struct page *);
274 }
80 } 275 }
81 276
82 ios->layout = layout; 277 ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages,
83 ios->comps = comps; 278 pios);
84 ios->offset = offset; 279 if (unlikely(ret))
85 ios->length = length; 280 return ret;
281
282 ios = *pios;
86 ios->reading = is_reading; 283 ios->reading = is_reading;
284 ios->offset = offset;
285
286 if (length) {
287 ore_calc_stripe_info(layout, offset, length, &ios->si);
288 ios->length = ios->si.length;
289 ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
290 if (layout->parity)
291 _ore_post_alloc_raid_stuff(ios);
292 }
87 293
88 *pios = ios;
89 return 0; 294 return 0;
90} 295}
91EXPORT_SYMBOL(ore_get_rw_state); 296EXPORT_SYMBOL(ore_get_rw_state);
92 297
93int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps, 298/* Allocate an io_state for all the devices in the comps array
94 struct ore_io_state **ios) 299 *
300 * This version of io_state allocation is used mostly by create/remove
301 * and trunc where we currently need all the devices. The only wastful
302 * bit is the read/write_attributes with no IO. Those sites should
303 * be converted to use ore_get_rw_state() with length=0
304 */
305int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
306 struct ore_io_state **pios)
95{ 307{
96 return ore_get_rw_state(layout, comps, true, 0, 0, ios); 308 return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios);
97} 309}
98EXPORT_SYMBOL(ore_get_io_state); 310EXPORT_SYMBOL(ore_get_io_state);
99 311
@@ -111,6 +323,7 @@ void ore_put_io_state(struct ore_io_state *ios)
111 bio_put(per_dev->bio); 323 bio_put(per_dev->bio);
112 } 324 }
113 325
326 _ore_free_raid_stuff(ios);
114 kfree(ios); 327 kfree(ios);
115 } 328 }
116} 329}
@@ -138,7 +351,7 @@ static void _done_io(struct osd_request *or, void *p)
138 kref_put(&ios->kref, _last_io); 351 kref_put(&ios->kref, _last_io);
139} 352}
140 353
141static int ore_io_execute(struct ore_io_state *ios) 354int ore_io_execute(struct ore_io_state *ios)
142{ 355{
143 DECLARE_COMPLETION_ONSTACK(wait); 356 DECLARE_COMPLETION_ONSTACK(wait);
144 bool sync = (ios->done == NULL); 357 bool sync = (ios->done == NULL);
@@ -198,7 +411,7 @@ static void _clear_bio(struct bio *bio)
198 } 411 }
199} 412}
200 413
201int ore_check_io(struct ore_io_state *ios, u64 *resid) 414int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
202{ 415{
203 enum osd_err_priority acumulated_osd_err = 0; 416 enum osd_err_priority acumulated_osd_err = 0;
204 int acumulated_lin_err = 0; 417 int acumulated_lin_err = 0;
@@ -206,7 +419,8 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid)
206 419
207 for (i = 0; i < ios->numdevs; i++) { 420 for (i = 0; i < ios->numdevs; i++) {
208 struct osd_sense_info osi; 421 struct osd_sense_info osi;
209 struct osd_request *or = ios->per_dev[i].or; 422 struct ore_per_dev_state *per_dev = &ios->per_dev[i];
423 struct osd_request *or = per_dev->or;
210 int ret; 424 int ret;
211 425
212 if (unlikely(!or)) 426 if (unlikely(!or))
@@ -218,29 +432,31 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid)
218 432
219 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { 433 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
220 /* start read offset passed endof file */ 434 /* start read offset passed endof file */
221 _clear_bio(ios->per_dev[i].bio); 435 _clear_bio(per_dev->bio);
222 ORE_DBGMSG("start read offset passed end of file " 436 ORE_DBGMSG("start read offset passed end of file "
223 "offset=0x%llx, length=0x%llx\n", 437 "offset=0x%llx, length=0x%llx\n",
224 _LLU(ios->per_dev[i].offset), 438 _LLU(per_dev->offset),
225 _LLU(ios->per_dev[i].length)); 439 _LLU(per_dev->length));
226 440
227 continue; /* we recovered */ 441 continue; /* we recovered */
228 } 442 }
229 443
444 if (on_dev_error) {
445 u64 residual = ios->reading ?
446 or->in.residual : or->out.residual;
447 u64 offset = (ios->offset + ios->length) - residual;
448 struct ore_dev *od = ios->oc->ods[
449 per_dev->dev - ios->oc->first_dev];
450
451 on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri,
452 offset, residual);
453 }
230 if (osi.osd_err_pri >= acumulated_osd_err) { 454 if (osi.osd_err_pri >= acumulated_osd_err) {
231 acumulated_osd_err = osi.osd_err_pri; 455 acumulated_osd_err = osi.osd_err_pri;
232 acumulated_lin_err = ret; 456 acumulated_lin_err = ret;
233 } 457 }
234 } 458 }
235 459
236 /* TODO: raid specific residual calculations */
237 if (resid) {
238 if (likely(!acumulated_lin_err))
239 *resid = 0;
240 else
241 *resid = ios->length;
242 }
243
244 return acumulated_lin_err; 460 return acumulated_lin_err;
245} 461}
246EXPORT_SYMBOL(ore_check_io); 462EXPORT_SYMBOL(ore_check_io);
@@ -248,61 +464,65 @@ EXPORT_SYMBOL(ore_check_io);
248/* 464/*
249 * L - logical offset into the file 465 * L - logical offset into the file
250 * 466 *
251 * U - The number of bytes in a stripe within a group 467 * D - number of Data devices
468 * D = group_width - parity
252 * 469 *
253 * U = stripe_unit * group_width 470 * U - The number of bytes in a stripe within a group
471 * U = stripe_unit * D
254 * 472 *
255 * T - The number of bytes striped within a group of component objects 473 * T - The number of bytes striped within a group of component objects
256 * (before advancing to the next group) 474 * (before advancing to the next group)
257 * 475 * T = U * group_depth
258 * T = stripe_unit * group_width * group_depth
259 * 476 *
260 * S - The number of bytes striped across all component objects 477 * S - The number of bytes striped across all component objects
261 * before the pattern repeats 478 * before the pattern repeats
479 * S = T * group_count
262 * 480 *
263 * S = stripe_unit * group_width * group_depth * group_count 481 * M - The "major" (i.e., across all components) cycle number
264 *
265 * M - The "major" (i.e., across all components) stripe number
266 *
267 * M = L / S 482 * M = L / S
268 * 483 *
269 * G - Counts the groups from the beginning of the major stripe 484 * G - Counts the groups from the beginning of the major cycle
270 *
271 * G = (L - (M * S)) / T [or (L % S) / T] 485 * G = (L - (M * S)) / T [or (L % S) / T]
272 * 486 *
273 * H - The byte offset within the group 487 * H - The byte offset within the group
274 *
275 * H = (L - (M * S)) % T [or (L % S) % T] 488 * H = (L - (M * S)) % T [or (L % S) % T]
276 * 489 *
277 * N - The "minor" (i.e., across the group) stripe number 490 * N - The "minor" (i.e., across the group) stripe number
278 *
279 * N = H / U 491 * N = H / U
280 * 492 *
281 * C - The component index coresponding to L 493 * C - The component index coresponding to L
282 * 494 *
283 * C = (H - (N * U)) / stripe_unit + G * group_width 495 * C = (H - (N * U)) / stripe_unit + G * D
284 * [or (L % U) / stripe_unit + G * group_width] 496 * [or (L % U) / stripe_unit + G * D]
285 * 497 *
286 * O - The component offset coresponding to L 498 * O - The component offset coresponding to L
287 *
288 * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit 499 * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
500 *
501 * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity
502 * divide by parity
503 * LCMdP = lcm(group_width, parity) / parity
504 *
505 * R - The parity Rotation stripe
506 * (Note parity cycle always starts at a group's boundary)
507 * R = N % LCMdP
508 *
509 * I = the first parity device index
510 * I = (group_width + group_width - R*parity - parity) % group_width
511 *
512 * Craid - The component index Rotated
513 * Craid = (group_width + C - R*parity) % group_width
514 * (We add the group_width to avoid negative numbers modulo math)
289 */ 515 */
290struct _striping_info { 516void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
291 u64 obj_offset; 517 u64 length, struct ore_striping_info *si)
292 u64 group_length;
293 u64 M; /* for truncate */
294 unsigned dev;
295 unsigned unit_off;
296};
297
298static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset,
299 struct _striping_info *si)
300{ 518{
301 u32 stripe_unit = layout->stripe_unit; 519 u32 stripe_unit = layout->stripe_unit;
302 u32 group_width = layout->group_width; 520 u32 group_width = layout->group_width;
303 u64 group_depth = layout->group_depth; 521 u64 group_depth = layout->group_depth;
522 u32 parity = layout->parity;
304 523
305 u32 U = stripe_unit * group_width; 524 u32 D = group_width - parity;
525 u32 U = D * stripe_unit;
306 u64 T = U * group_depth; 526 u64 T = U * group_depth;
307 u64 S = T * layout->group_count; 527 u64 S = T * layout->group_count;
308 u64 M = div64_u64(file_offset, S); 528 u64 M = div64_u64(file_offset, S);
@@ -318,39 +538,65 @@ static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset,
318 u32 N = div_u64(H, U); 538 u32 N = div_u64(H, U);
319 539
320 /* "H - (N * U)" is just "H % U" so it's bound to u32 */ 540 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
321 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; 541 u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
322 si->dev *= layout->mirrors_p1;
323 542
324 div_u64_rem(file_offset, stripe_unit, &si->unit_off); 543 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
325 544
326 si->obj_offset = si->unit_off + (N * stripe_unit) + 545 si->obj_offset = si->unit_off + (N * stripe_unit) +
327 (M * group_depth * stripe_unit); 546 (M * group_depth * stripe_unit);
328 547
329 si->group_length = T - H; 548 if (parity) {
549 u32 LCMdP = lcm(group_width, parity) / parity;
550 /* R = N % LCMdP; */
551 u32 RxP = (N % LCMdP) * parity;
552 u32 first_dev = C - C % group_width;
553
554 si->par_dev = (group_width + group_width - parity - RxP) %
555 group_width + first_dev;
556 si->dev = (group_width + C - RxP) % group_width + first_dev;
557 si->bytes_in_stripe = U;
558 si->first_stripe_start = M * S + G * T + N * U;
559 } else {
560 /* Make the math correct see _prepare_one_group */
561 si->par_dev = group_width;
562 si->dev = C;
563 }
564
565 si->dev *= layout->mirrors_p1;
566 si->par_dev *= layout->mirrors_p1;
567 si->offset = file_offset;
568 si->length = T - H;
569 if (si->length > length)
570 si->length = length;
330 si->M = M; 571 si->M = M;
331} 572}
573EXPORT_SYMBOL(ore_calc_stripe_info);
332 574
333static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, 575int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
334 unsigned pgbase, struct ore_per_dev_state *per_dev, 576 unsigned pgbase, struct page **pages,
335 int cur_len) 577 struct ore_per_dev_state *per_dev, int cur_len)
336{ 578{
337 unsigned pg = *cur_pg; 579 unsigned pg = *cur_pg;
338 struct request_queue *q = 580 struct request_queue *q =
339 osd_request_queue(_ios_od(ios, per_dev->dev)); 581 osd_request_queue(_ios_od(ios, per_dev->dev));
340 582 unsigned len = cur_len;
341 per_dev->length += cur_len; 583 int ret;
342 584
343 if (per_dev->bio == NULL) { 585 if (per_dev->bio == NULL) {
344 unsigned pages_in_stripe = ios->layout->group_width * 586 unsigned pages_in_stripe = ios->layout->group_width *
345 (ios->layout->stripe_unit / PAGE_SIZE); 587 (ios->layout->stripe_unit / PAGE_SIZE);
346 unsigned bio_size = (ios->nr_pages + pages_in_stripe) / 588 unsigned nr_pages = ios->nr_pages * ios->layout->group_width /
347 ios->layout->group_width; 589 (ios->layout->group_width -
590 ios->layout->parity);
591 unsigned bio_size = (nr_pages + pages_in_stripe) /
592 ios->layout->group_width;
348 593
349 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); 594 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
350 if (unlikely(!per_dev->bio)) { 595 if (unlikely(!per_dev->bio)) {
351 ORE_DBGMSG("Failed to allocate BIO size=%u\n", 596 ORE_DBGMSG("Failed to allocate BIO size=%u\n",
352 bio_size); 597 bio_size);
353 return -ENOMEM; 598 ret = -ENOMEM;
599 goto out;
354 } 600 }
355 } 601 }
356 602
@@ -358,64 +604,90 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
358 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); 604 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
359 unsigned added_len; 605 unsigned added_len;
360 606
361 BUG_ON(ios->nr_pages <= pg);
362 cur_len -= pglen; 607 cur_len -= pglen;
363 608
364 added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], 609 added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
365 pglen, pgbase); 610 pglen, pgbase);
366 if (unlikely(pglen != added_len)) 611 if (unlikely(pglen != added_len)) {
367 return -ENOMEM; 612 ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n",
613 per_dev->bio->bi_vcnt);
614 ret = -ENOMEM;
615 goto out;
616 }
617 _add_stripe_page(ios->sp2d, &ios->si, pages[pg]);
618
368 pgbase = 0; 619 pgbase = 0;
369 ++pg; 620 ++pg;
370 } 621 }
371 BUG_ON(cur_len); 622 BUG_ON(cur_len);
372 623
624 per_dev->length += len;
373 *cur_pg = pg; 625 *cur_pg = pg;
374 return 0; 626 ret = 0;
627out: /* we fail the complete unit on an error eg don't advance
628 * per_dev->length and cur_pg. This means that we might have a bigger
629 * bio than the CDB requested length (per_dev->length). That's fine
630 * only the oposite is fatal.
631 */
632 return ret;
375} 633}
376 634
377static int _prepare_one_group(struct ore_io_state *ios, u64 length, 635static int _prepare_for_striping(struct ore_io_state *ios)
378 struct _striping_info *si)
379{ 636{
637 struct ore_striping_info *si = &ios->si;
380 unsigned stripe_unit = ios->layout->stripe_unit; 638 unsigned stripe_unit = ios->layout->stripe_unit;
381 unsigned mirrors_p1 = ios->layout->mirrors_p1; 639 unsigned mirrors_p1 = ios->layout->mirrors_p1;
382 unsigned devs_in_group = ios->layout->group_width * mirrors_p1; 640 unsigned group_width = ios->layout->group_width;
641 unsigned devs_in_group = group_width * mirrors_p1;
383 unsigned dev = si->dev; 642 unsigned dev = si->dev;
384 unsigned first_dev = dev - (dev % devs_in_group); 643 unsigned first_dev = dev - (dev % devs_in_group);
385 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; 644 unsigned dev_order;
386 unsigned cur_pg = ios->pages_consumed; 645 unsigned cur_pg = ios->pages_consumed;
646 u64 length = ios->length;
387 int ret = 0; 647 int ret = 0;
388 648
649 if (!ios->pages) {
650 ios->numdevs = ios->layout->mirrors_p1;
651 return 0;
652 }
653
654 BUG_ON(length > si->length);
655
656 dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev);
657 si->cur_comp = dev_order;
658 si->cur_pg = si->unit_off / PAGE_SIZE;
659
389 while (length) { 660 while (length) {
390 struct ore_per_dev_state *per_dev = &ios->per_dev[dev]; 661 unsigned comp = dev - first_dev;
662 struct ore_per_dev_state *per_dev = &ios->per_dev[comp];
391 unsigned cur_len, page_off = 0; 663 unsigned cur_len, page_off = 0;
392 664
393 if (!per_dev->length) { 665 if (!per_dev->length) {
394 per_dev->dev = dev; 666 per_dev->dev = dev;
395 if (dev < si->dev) { 667 if (dev == si->dev) {
396 per_dev->offset = si->obj_offset + stripe_unit - 668 WARN_ON(dev == si->par_dev);
397 si->unit_off;
398 cur_len = stripe_unit;
399 } else if (dev == si->dev) {
400 per_dev->offset = si->obj_offset; 669 per_dev->offset = si->obj_offset;
401 cur_len = stripe_unit - si->unit_off; 670 cur_len = stripe_unit - si->unit_off;
402 page_off = si->unit_off & ~PAGE_MASK; 671 page_off = si->unit_off & ~PAGE_MASK;
403 BUG_ON(page_off && (page_off != ios->pgbase)); 672 BUG_ON(page_off && (page_off != ios->pgbase));
404 } else { /* dev > si->dev */ 673 } else {
405 per_dev->offset = si->obj_offset - si->unit_off; 674 if (si->cur_comp > dev_order)
675 per_dev->offset =
676 si->obj_offset - si->unit_off;
677 else /* si->cur_comp < dev_order */
678 per_dev->offset =
679 si->obj_offset + stripe_unit -
680 si->unit_off;
406 cur_len = stripe_unit; 681 cur_len = stripe_unit;
407 } 682 }
408
409 if (max_comp < dev)
410 max_comp = dev;
411 } else { 683 } else {
412 cur_len = stripe_unit; 684 cur_len = stripe_unit;
413 } 685 }
414 if (cur_len >= length) 686 if (cur_len >= length)
415 cur_len = length; 687 cur_len = length;
416 688
417 ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, 689 ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages,
418 cur_len); 690 per_dev, cur_len);
419 if (unlikely(ret)) 691 if (unlikely(ret))
420 goto out; 692 goto out;
421 693
@@ -423,60 +695,60 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length,
423 dev = (dev % devs_in_group) + first_dev; 695 dev = (dev % devs_in_group) + first_dev;
424 696
425 length -= cur_len; 697 length -= cur_len;
426 }
427out:
428 ios->numdevs = max_comp + mirrors_p1;
429 ios->pages_consumed = cur_pg;
430 return ret;
431}
432
433static int _prepare_for_striping(struct ore_io_state *ios)
434{
435 u64 length = ios->length;
436 u64 offset = ios->offset;
437 struct _striping_info si;
438 int ret = 0;
439 698
440 if (!ios->pages) { 699 si->cur_comp = (si->cur_comp + 1) % group_width;
441 if (ios->kern_buff) { 700 if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) {
442 struct ore_per_dev_state *per_dev = &ios->per_dev[0]; 701 if (!length && ios->sp2d) {
702 /* If we are writing and this is the very last
703 * stripe. then operate on parity dev.
704 */
705 dev = si->par_dev;
706 }
707 if (ios->sp2d)
708 /* In writes cur_len just means if it's the
709 * last one. See _ore_add_parity_unit.
710 */
711 cur_len = length;
712 per_dev = &ios->per_dev[dev - first_dev];
713 if (!per_dev->length) {
714 /* Only/always the parity unit of the first
715 * stripe will be empty. So this is a chance to
716 * initialize the per_dev info.
717 */
718 per_dev->dev = dev;
719 per_dev->offset = si->obj_offset - si->unit_off;
720 }
443 721
444 _calc_stripe_info(ios->layout, ios->offset, &si); 722 ret = _ore_add_parity_unit(ios, si, per_dev, cur_len);
445 per_dev->offset = si.obj_offset; 723 if (unlikely(ret))
446 per_dev->dev = si.dev; 724 goto out;
447 725
448 /* no cross device without page array */ 726 /* Rotate next par_dev backwards with wraping */
449 BUG_ON((ios->layout->group_width > 1) && 727 si->par_dev = (devs_in_group + si->par_dev -
450 (si.unit_off + ios->length > 728 ios->layout->parity * mirrors_p1) %
451 ios->layout->stripe_unit)); 729 devs_in_group + first_dev;
730 /* Next stripe, start fresh */
731 si->cur_comp = 0;
732 si->cur_pg = 0;
452 } 733 }
453 ios->numdevs = ios->layout->mirrors_p1;
454 return 0;
455 }
456
457 while (length) {
458 _calc_stripe_info(ios->layout, offset, &si);
459
460 if (length < si.group_length)
461 si.group_length = length;
462
463 ret = _prepare_one_group(ios, si.group_length, &si);
464 if (unlikely(ret))
465 goto out;
466
467 offset += si.group_length;
468 length -= si.group_length;
469 } 734 }
470
471out: 735out:
472 return ret; 736 ios->numdevs = devs_in_group;
737 ios->pages_consumed = cur_pg;
738 if (unlikely(ret)) {
739 if (length == ios->length)
740 return ret;
741 else
742 ios->length -= length;
743 }
744 return 0;
473} 745}
474 746
475int ore_create(struct ore_io_state *ios) 747int ore_create(struct ore_io_state *ios)
476{ 748{
477 int i, ret; 749 int i, ret;
478 750
479 for (i = 0; i < ios->comps->numdevs; i++) { 751 for (i = 0; i < ios->oc->numdevs; i++) {
480 struct osd_request *or; 752 struct osd_request *or;
481 753
482 or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); 754 or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
@@ -501,7 +773,7 @@ int ore_remove(struct ore_io_state *ios)
501{ 773{
502 int i, ret; 774 int i, ret;
503 775
504 for (i = 0; i < ios->comps->numdevs; i++) { 776 for (i = 0; i < ios->oc->numdevs; i++) {
505 struct osd_request *or; 777 struct osd_request *or;
506 778
507 or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); 779 or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
@@ -543,7 +815,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
543 goto out; 815 goto out;
544 } 816 }
545 per_dev->or = or; 817 per_dev->or = or;
546 per_dev->offset = master_dev->offset;
547 818
548 if (ios->pages) { 819 if (ios->pages) {
549 struct bio *bio; 820 struct bio *bio;
@@ -562,6 +833,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
562 __bio_clone(bio, master_dev->bio); 833 __bio_clone(bio, master_dev->bio);
563 bio->bi_bdev = NULL; 834 bio->bi_bdev = NULL;
564 bio->bi_next = NULL; 835 bio->bi_next = NULL;
836 per_dev->offset = master_dev->offset;
565 per_dev->length = master_dev->length; 837 per_dev->length = master_dev->length;
566 per_dev->bio = bio; 838 per_dev->bio = bio;
567 per_dev->dev = dev; 839 per_dev->dev = dev;
@@ -579,7 +851,15 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
579 _LLU(per_dev->offset), 851 _LLU(per_dev->offset),
580 _LLU(per_dev->length), dev); 852 _LLU(per_dev->length), dev);
581 } else if (ios->kern_buff) { 853 } else if (ios->kern_buff) {
582 ret = osd_req_write_kern(or, _ios_obj(ios, dev), 854 per_dev->offset = ios->si.obj_offset;
855 per_dev->dev = ios->si.dev + dev;
856
857 /* no cross device without page array */
858 BUG_ON((ios->layout->group_width > 1) &&
859 (ios->si.unit_off + ios->length >
860 ios->layout->stripe_unit));
861
862 ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev),
583 per_dev->offset, 863 per_dev->offset,
584 ios->kern_buff, ios->length); 864 ios->kern_buff, ios->length);
585 if (unlikely(ret)) 865 if (unlikely(ret))
@@ -588,7 +868,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
588 "length=0x%llx dev=%d\n", 868 "length=0x%llx dev=%d\n",
589 _LLU(_ios_obj(ios, dev)->id), 869 _LLU(_ios_obj(ios, dev)->id),
590 _LLU(per_dev->offset), 870 _LLU(per_dev->offset),
591 _LLU(ios->length), dev); 871 _LLU(ios->length), per_dev->dev);
592 } else { 872 } else {
593 osd_req_set_attributes(or, _ios_obj(ios, dev)); 873 osd_req_set_attributes(or, _ios_obj(ios, dev));
594 ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", 874 ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
@@ -614,6 +894,14 @@ int ore_write(struct ore_io_state *ios)
614 int i; 894 int i;
615 int ret; 895 int ret;
616 896
897 if (unlikely(ios->sp2d && !ios->r4w)) {
898 /* A library is attempting a RAID-write without providing
899 * a pages lock interface.
900 */
901 WARN_ON_ONCE(1);
902 return -ENOTSUPP;
903 }
904
617 ret = _prepare_for_striping(ios); 905 ret = _prepare_for_striping(ios);
618 if (unlikely(ret)) 906 if (unlikely(ret))
619 return ret; 907 return ret;
@@ -629,7 +917,7 @@ int ore_write(struct ore_io_state *ios)
629} 917}
630EXPORT_SYMBOL(ore_write); 918EXPORT_SYMBOL(ore_write);
631 919
632static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) 920int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp)
633{ 921{
634 struct osd_request *or; 922 struct osd_request *or;
635 struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; 923 struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
@@ -648,22 +936,27 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp)
648 per_dev->or = or; 936 per_dev->or = or;
649 937
650 if (ios->pages) { 938 if (ios->pages) {
651 osd_req_read(or, obj, per_dev->offset, 939 if (per_dev->cur_sg) {
652 per_dev->bio, per_dev->length); 940 /* finalize the last sg_entry */
941 _ore_add_sg_seg(per_dev, 0, false);
942 if (unlikely(!per_dev->cur_sg))
943 return 0; /* Skip parity only device */
944
945 osd_req_read_sg(or, obj, per_dev->bio,
946 per_dev->sglist, per_dev->cur_sg);
947 } else {
948 /* The no raid case */
949 osd_req_read(or, obj, per_dev->offset,
950 per_dev->bio, per_dev->length);
951 }
952
653 ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" 953 ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
654 " dev=%d\n", _LLU(obj->id), 954 " dev=%d sg_len=%d\n", _LLU(obj->id),
655 _LLU(per_dev->offset), _LLU(per_dev->length), 955 _LLU(per_dev->offset), _LLU(per_dev->length),
656 first_dev); 956 first_dev, per_dev->cur_sg);
657 } else if (ios->kern_buff) {
658 int ret = osd_req_read_kern(or, obj, per_dev->offset,
659 ios->kern_buff, ios->length);
660 ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
661 "length=0x%llx dev=%d ret=>%d\n",
662 _LLU(obj->id), _LLU(per_dev->offset),
663 _LLU(ios->length), first_dev, ret);
664 if (unlikely(ret))
665 return ret;
666 } else { 957 } else {
958 BUG_ON(ios->kern_buff);
959
667 osd_req_get_attributes(or, obj); 960 osd_req_get_attributes(or, obj);
668 ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", 961 ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
669 _LLU(obj->id), 962 _LLU(obj->id),
@@ -688,7 +981,7 @@ int ore_read(struct ore_io_state *ios)
688 return ret; 981 return ret;
689 982
690 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { 983 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
691 ret = _read_mirror(ios, i); 984 ret = _ore_read_mirror(ios, i);
692 if (unlikely(ret)) 985 if (unlikely(ret))
693 return ret; 986 return ret;
694 } 987 }
@@ -744,31 +1037,29 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
744} 1037}
745 1038
746struct _trunc_info { 1039struct _trunc_info {
747 struct _striping_info si; 1040 struct ore_striping_info si;
748 u64 prev_group_obj_off; 1041 u64 prev_group_obj_off;
749 u64 next_group_obj_off; 1042 u64 next_group_obj_off;
750 1043
751 unsigned first_group_dev; 1044 unsigned first_group_dev;
752 unsigned nex_group_dev; 1045 unsigned nex_group_dev;
753 unsigned max_devs;
754}; 1046};
755 1047
756void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, 1048static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
757 struct _trunc_info *ti) 1049 struct _trunc_info *ti)
758{ 1050{
759 unsigned stripe_unit = layout->stripe_unit; 1051 unsigned stripe_unit = layout->stripe_unit;
760 1052
761 _calc_stripe_info(layout, file_offset, &ti->si); 1053 ore_calc_stripe_info(layout, file_offset, 0, &ti->si);
762 1054
763 ti->prev_group_obj_off = ti->si.M * stripe_unit; 1055 ti->prev_group_obj_off = ti->si.M * stripe_unit;
764 ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; 1056 ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;
765 1057
766 ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); 1058 ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
767 ti->nex_group_dev = ti->first_group_dev + layout->group_width; 1059 ti->nex_group_dev = ti->first_group_dev + layout->group_width;
768 ti->max_devs = layout->group_width * layout->group_count;
769} 1060}
770 1061
771int ore_truncate(struct ore_layout *layout, struct ore_components *comps, 1062int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
772 u64 size) 1063 u64 size)
773{ 1064{
774 struct ore_io_state *ios; 1065 struct ore_io_state *ios;
@@ -779,22 +1070,22 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
779 struct _trunc_info ti; 1070 struct _trunc_info ti;
780 int i, ret; 1071 int i, ret;
781 1072
782 ret = ore_get_io_state(layout, comps, &ios); 1073 ret = ore_get_io_state(layout, oc, &ios);
783 if (unlikely(ret)) 1074 if (unlikely(ret))
784 return ret; 1075 return ret;
785 1076
786 _calc_trunk_info(ios->layout, size, &ti); 1077 _calc_trunk_info(ios->layout, size, &ti);
787 1078
788 size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs), 1079 size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs),
789 GFP_KERNEL); 1080 GFP_KERNEL);
790 if (unlikely(!size_attrs)) { 1081 if (unlikely(!size_attrs)) {
791 ret = -ENOMEM; 1082 ret = -ENOMEM;
792 goto out; 1083 goto out;
793 } 1084 }
794 1085
795 ios->numdevs = ios->comps->numdevs; 1086 ios->numdevs = ios->oc->numdevs;
796 1087
797 for (i = 0; i < ti.max_devs; ++i) { 1088 for (i = 0; i < ios->numdevs; ++i) {
798 struct exofs_trunc_attr *size_attr = &size_attrs[i]; 1089 struct exofs_trunc_attr *size_attr = &size_attrs[i];
799 u64 obj_size; 1090 u64 obj_size;
800 1091
@@ -815,7 +1106,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
815 size_attr->attr.val_ptr = &size_attr->newsize; 1106 size_attr->attr.val_ptr = &size_attr->newsize;
816 1107
817 ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", 1108 ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
818 _LLU(comps->comps->obj.id), _LLU(obj_size), i); 1109 _LLU(oc->comps->obj.id), _LLU(obj_size), i);
819 ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, 1110 ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
820 &size_attr->attr); 1111 &size_attr->attr);
821 if (unlikely(ret)) 1112 if (unlikely(ret))
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
new file mode 100644
index 00000000000..29c47e5c4a8
--- /dev/null
+++ b/fs/exofs/ore_raid.c
@@ -0,0 +1,660 @@
1/*
2 * Copyright (C) 2011
3 * Boaz Harrosh <bharrosh@panasas.com>
4 *
5 * This file is part of the objects raid engine (ore).
6 *
7 * It is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as published
9 * by the Free Software Foundation.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with "ore". If not, write to the Free Software Foundation, Inc:
13 * "Free Software Foundation <info@fsf.org>"
14 */
15
16#include <linux/gfp.h>
17#include <linux/async_tx.h>
18
19#include "ore_raid.h"
20
21#undef ORE_DBGMSG2
22#define ORE_DBGMSG2 ORE_DBGMSG
23
24struct page *_raid_page_alloc(void)
25{
26 return alloc_page(GFP_KERNEL);
27}
28
29void _raid_page_free(struct page *p)
30{
31 __free_page(p);
32}
33
34/* This struct is forward declare in ore_io_state, but is private to here.
35 * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit.
36 *
37 * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn.
38 * Ascending page index access is sp2d(p-minor, c-major). But storage is
39 * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor
40 * API.
41 */
42struct __stripe_pages_2d {
43 /* Cache some hot path repeated calculations */
44 unsigned parity;
45 unsigned data_devs;
46 unsigned pages_in_unit;
47
48 bool needed ;
49
50 /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */
51 struct __1_page_stripe {
52 bool alloc;
53 unsigned write_count;
54 struct async_submit_ctl submit;
55 struct dma_async_tx_descriptor *tx;
56
57 /* The size of this array is data_devs + parity */
58 struct page **pages;
59 struct page **scribble;
60 /* bool array, size of this array is data_devs */
61 char *page_is_read;
62 } _1p_stripes[];
63};
64
65/* This can get bigger then a page. So support multiple page allocations
66 * _sp2d_free should be called even if _sp2d_alloc fails (by returning
67 * none-zero).
68 */
69static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width,
70 unsigned parity, struct __stripe_pages_2d **psp2d)
71{
72 struct __stripe_pages_2d *sp2d;
73 unsigned data_devs = group_width - parity;
74 struct _alloc_all_bytes {
75 struct __alloc_stripe_pages_2d {
76 struct __stripe_pages_2d sp2d;
77 struct __1_page_stripe _1p_stripes[pages_in_unit];
78 } __asp2d;
79 struct __alloc_1p_arrays {
80 struct page *pages[group_width];
81 struct page *scribble[group_width];
82 char page_is_read[data_devs];
83 } __a1pa[pages_in_unit];
84 } *_aab;
85 struct __alloc_1p_arrays *__a1pa;
86 struct __alloc_1p_arrays *__a1pa_end;
87 const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]);
88 unsigned num_a1pa, alloc_size, i;
89
90 /* FIXME: check these numbers in ore_verify_layout */
91 BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE);
92 BUG_ON(sizeof__a1pa > PAGE_SIZE);
93
94 if (sizeof(*_aab) > PAGE_SIZE) {
95 num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa;
96 alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa;
97 } else {
98 num_a1pa = pages_in_unit;
99 alloc_size = sizeof(*_aab);
100 }
101
102 _aab = kzalloc(alloc_size, GFP_KERNEL);
103 if (unlikely(!_aab)) {
104 ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size);
105 return -ENOMEM;
106 }
107
108 sp2d = &_aab->__asp2d.sp2d;
109 *psp2d = sp2d; /* From here Just call _sp2d_free */
110
111 __a1pa = _aab->__a1pa;
112 __a1pa_end = __a1pa + num_a1pa;
113
114 for (i = 0; i < pages_in_unit; ++i) {
115 if (unlikely(__a1pa >= __a1pa_end)) {
116 num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa,
117 pages_in_unit - i);
118
119 __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL);
120 if (unlikely(!__a1pa)) {
121 ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n",
122 num_a1pa);
123 return -ENOMEM;
124 }
125 __a1pa_end = __a1pa + num_a1pa;
126 /* First *pages is marked for kfree of the buffer */
127 sp2d->_1p_stripes[i].alloc = true;
128 }
129
130 sp2d->_1p_stripes[i].pages = __a1pa->pages;
131 sp2d->_1p_stripes[i].scribble = __a1pa->scribble ;
132 sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read;
133 ++__a1pa;
134 }
135
136 sp2d->parity = parity;
137 sp2d->data_devs = data_devs;
138 sp2d->pages_in_unit = pages_in_unit;
139 return 0;
140}
141
/* Undo one stripe's worth of page accounting: hand back any pages that
 * were fetched for parity calculation (via r4w->put_page) and clear the
 * per-page-stripe bookkeeping so the sp2d can be reused for the next
 * stripe. No-op if nothing was added since the last reset.
 */
static void _sp2d_reset(struct __stripe_pages_2d *sp2d,
			const struct _ore_r4w_op *r4w, void *priv)
{
	unsigned data_devs = sp2d->data_devs;
	unsigned group_width = data_devs + sp2d->parity;
	unsigned p;

	if (!sp2d->needed)
		return;

	for (p = 0; p < sp2d->pages_in_unit; p++) {
		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];

		if (_1ps->write_count < group_width) {
			unsigned c;

			/* A partially-written page-stripe may still hold
			 * read-4-write pages; return them to their owner.
			 */
			for (c = 0; c < data_devs; c++)
				if (_1ps->page_is_read[c]) {
					struct page *page = _1ps->pages[c];

					r4w->put_page(priv, page);
					_1ps->page_is_read[c] = false;
				}
		}

		memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages));
		_1ps->write_count = 0;
		_1ps->tx = NULL;
	}

	sp2d->needed = false;
}
174
175static void _sp2d_free(struct __stripe_pages_2d *sp2d)
176{
177 unsigned i;
178
179 if (!sp2d)
180 return;
181
182 for (i = 0; i < sp2d->pages_in_unit; ++i) {
183 if (sp2d->_1p_stripes[i].alloc)
184 kfree(sp2d->_1p_stripes[i].pages);
185 }
186
187 kfree(sp2d);
188}
189
190static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d)
191{
192 unsigned p;
193
194 for (p = 0; p < sp2d->pages_in_unit; p++) {
195 struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
196
197 if (_1ps->write_count)
198 return p;
199 }
200
201 return ~0;
202}
203
204static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
205{
206 unsigned p;
207
208 for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
209 struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
210
211 if (_1ps->write_count)
212 return p;
213 }
214
215 return ~0;
216}
217
/* Generate the parity page of every non-empty page-stripe in the unit
 * by xor-ing its data pages through the async_tx framework. The parity
 * destination is pages[data_devs] (the component right after the data
 * devices — see _ore_add_parity_unit which sets cur_comp = data_devs).
 * Submission is done in a second pass for all stripes.
 */
static void _gen_xor_unit(struct __stripe_pages_2d *sp2d)
{
	unsigned p;
	for (p = 0; p < sp2d->pages_in_unit; p++) {
		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];

		if (!_1ps->write_count)
			continue;

		init_async_submit(&_1ps->submit,
			ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK,
			NULL,
			NULL, NULL,
			(addr_conv_t *)_1ps->scribble);

		/* TODO: raid6 */
		_1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages,
				     0, sp2d->data_devs, PAGE_SIZE,
				     &_1ps->submit);
	}

	for (p = 0; p < sp2d->pages_in_unit; p++) {
		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
		/* NOTE: We wait for HW synchronously (I don't have such HW
		 * to test with.) Is parallelism needed with today's multi
		 * cores?
		 */
		async_tx_issue_pending(_1ps->tx);
	}
}
248
249void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
250 struct ore_striping_info *si, struct page *page)
251{
252 struct __1_page_stripe *_1ps;
253
254 sp2d->needed = true;
255
256 _1ps = &sp2d->_1p_stripes[si->cur_pg];
257 _1ps->pages[si->cur_comp] = page;
258 ++_1ps->write_count;
259
260 si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit;
261 /* si->cur_comp is advanced outside at main loop */
262}
263
/* Append a scatter-gather segment of @cur_len bytes to @per_dev's sglist.
 * The sg entries describe the extents this device must skip (the parity
 * units). @not_last means a further segment will follow, so the next
 * entry is partly prepared here and finalized on the next call.
 */
void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
		     bool not_last)
{
	struct osd_sg_entry *sge;

	ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d "
		     "offset=0x%llx length=0x%x last_sgs_total=0x%x\n",
		     per_dev->dev, cur_len, not_last, per_dev->cur_sg,
		     _LLU(per_dev->offset), per_dev->length,
		     per_dev->last_sgs_total);

	if (!per_dev->cur_sg) {
		sge = per_dev->sglist;

		/* First time we prepare two entries */
		if (per_dev->length) {
			++per_dev->cur_sg;
			sge->offset = per_dev->offset;
			sge->len = per_dev->length;
		} else {
			/* Here the parity is the first unit of this object.
			 * This happens every time we reach a parity device on
			 * the same stripe as the per_dev->offset. We need to
			 * just skip this unit.
			 */
			per_dev->offset += cur_len;
			return;
		}
	} else {
		/* finalize the last one */
		sge = &per_dev->sglist[per_dev->cur_sg - 1];
		sge->len = per_dev->length - per_dev->last_sgs_total;
	}

	if (not_last) {
		/* Partly prepare the next one */
		struct osd_sg_entry *next_sge = sge + 1;

		++per_dev->cur_sg;
		next_sge->offset = sge->offset + sge->len + cur_len;
		/* Save cur len so we know how much was added next time */
		per_dev->last_sgs_total = per_dev->length;
		next_sge->len = 0;
	} else if (!sge->len) {
		/* Optimize for when the last unit is a parity */
		--per_dev->cur_sg;
	}
}
312
313static int _alloc_read_4_write(struct ore_io_state *ios)
314{
315 struct ore_layout *layout = ios->layout;
316 int ret;
317 /* We want to only read those pages not in cache so worst case
318 * is a stripe populated with every other page
319 */
320 unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2;
321
322 ret = _ore_get_io_state(layout, ios->oc,
323 layout->group_width * layout->mirrors_p1,
324 sgs_per_dev, 0, &ios->ios_read_4_write);
325 return ret;
326}
327
/* @si contains info of the to-be-inserted page. Update of @si should be
 * maintained by caller. Specifically si->dev, si->obj_offset, ...
 *
 * Queue one page for reading into the read_4_write io_state, allocating
 * that io_state (and this device's bio) on first use. If the new page is
 * not contiguous with what was queued so far on this device an sg-gap
 * segment is recorded. Returns 0 or -ENOMEM.
 */
static int _add_to_read_4_write(struct ore_io_state *ios,
				struct ore_striping_info *si, struct page *page)
{
	struct request_queue *q;
	struct ore_per_dev_state *per_dev;
	struct ore_io_state *read_ios;
	/* Map the absolute device index to its position within the group */
	unsigned first_dev = si->dev - (si->dev %
			  (ios->layout->group_width * ios->layout->mirrors_p1));
	unsigned comp = si->dev - first_dev;
	unsigned added_len;

	if (!ios->ios_read_4_write) {
		int ret = _alloc_read_4_write(ios);

		if (unlikely(ret))
			return ret;
	}

	read_ios = ios->ios_read_4_write;
	read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1;

	per_dev = &read_ios->per_dev[comp];
	if (!per_dev->length) {
		/* First page queued on this device: set up its bio */
		per_dev->bio = bio_kmalloc(GFP_KERNEL,
					   ios->sp2d->pages_in_unit);
		if (unlikely(!per_dev->bio)) {
			ORE_DBGMSG("Failed to allocate BIO size=%u\n",
				   ios->sp2d->pages_in_unit);
			return -ENOMEM;
		}
		per_dev->offset = si->obj_offset;
		per_dev->dev = si->dev;
	} else if (si->obj_offset != (per_dev->offset + per_dev->length)) {
		/* Not contiguous with the previous page: record the gap */
		u64 gap = si->obj_offset - (per_dev->offset + per_dev->length);

		_ore_add_sg_seg(per_dev, gap, true);
	}
	q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev));
	added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0);
	if (unlikely(added_len != PAGE_SIZE)) {
		ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n",
			   per_dev->bio->bi_vcnt);
		return -ENOMEM;
	}

	per_dev->length += PAGE_SIZE;
	return 0;
}
379
/* After the read-4-write IO completed, mark every page that was read as
 * uptodate and clear any stale error flag.
 * NOTE(review): @ret is currently unused; the only caller (_read_4_write)
 * invokes us strictly after a successful ore_io_execute().
 */
static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
{
	struct bio_vec *bv;
	unsigned i, d;

	/* loop on all devices all pages */
	for (d = 0; d < ios->numdevs; d++) {
		struct bio *bio = ios->per_dev[d].bio;

		/* Devices that needed no reads have no bio */
		if (!bio)
			continue;

		__bio_for_each_segment(bv, bio, i, 0) {
			struct page *page = bv->bv_page;

			SetPageUptodate(page);
			if (PageError(page))
				ClearPageError(page);
		}
	}
}
401
/* read_4_write is hacked to read the start of the first stripe and/or
 * the end of the last stripe. If needed, with an sg-gap at each device/page.
 * It is assumed to be called after the to_be_written pages of the first stripe
 * are populating ios->sp2d[][]
 *
 * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations
 * These pages are held at sp2d[p].pages[c] but with
 * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are
 * returned via ios->r4w->put_page. The ios->r4w->get_page might signal that
 * the page is @uptodate=true, so we don't need to read it, only unlock,
 * after IO.
 *
 * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger
 * than to-be-written count, we should consider the xor-in-place mode.
 * need_to_read_pages_count is the actual number of pages not present in cache.
 * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough
 * approximation? In this mode the read pages are put in the empty places of
 * ios->sp2d[p][*], xor is calculated the same way. These pages are
 * allocated/freed and don't go through cache
 */
static int _read_4_write(struct ore_io_state *ios)
{
	struct ore_io_state *ios_read;
	struct ore_striping_info read_si;
	struct __stripe_pages_2d *sp2d = ios->sp2d;
	u64 offset = ios->si.first_stripe_start;
	u64 last_stripe_end;
	unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
	unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1;
	int ret;

	if (offset == ios->offset) /* Go to start collect $200 */
		goto read_last_stripe;

	min_p = _sp2d_min_pg(sp2d);
	max_p = _sp2d_max_pg(sp2d);

	/* Phase 1: fetch the head of the first stripe — everything between
	 * first_stripe_start and ios->offset — walking device by device
	 * until we hit the first to-be-written page.
	 */
	for (c = 0; ; c++) {
		ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
		read_si.obj_offset += min_p * PAGE_SIZE;
		offset += min_p * PAGE_SIZE;
		for (p = min_p; p <= max_p; p++) {
			struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
			struct page **pp = &_1ps->pages[c];
			bool uptodate;

			if (*pp)
				/* to-be-written pages start here */
				goto read_last_stripe;

			*pp = ios->r4w->get_page(ios->private, offset,
						 &uptodate);
			if (unlikely(!*pp))
				return -ENOMEM;

			if (!uptodate)
				_add_to_read_4_write(ios, &read_si, *pp);

			/* Mark read-pages to be cache_released */
			_1ps->page_is_read[c] = true;
			read_si.obj_offset += PAGE_SIZE;
			offset += PAGE_SIZE;
		}
		offset += (sp2d->pages_in_unit - p) * PAGE_SIZE;
	}

read_last_stripe:
	/* Phase 2: fetch the tail of the last stripe — from the page-aligned
	 * end of the IO up to the stripe boundary.
	 */
	offset = ios->offset + (ios->length + PAGE_SIZE - 1) /
				PAGE_SIZE * PAGE_SIZE;
	last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe)
				 * bytes_in_stripe;
	if (offset == last_stripe_end) /* Optimize for the aligned case */
		goto read_it;

	ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
	p = read_si.unit_off / PAGE_SIZE;
	c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1,
		       ios->layout->mirrors_p1, read_si.par_dev, read_si.dev);

	BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end);
	/* unaligned IO must be within a single stripe */

	if (min_p == sp2d->pages_in_unit) {
		/* Didn't do it yet */
		min_p = _sp2d_min_pg(sp2d);
		max_p = _sp2d_max_pg(sp2d);
	}

	while (offset < last_stripe_end) {
		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];

		if ((min_p <= p) && (p <= max_p)) {
			struct page *page;
			bool uptodate;

			BUG_ON(_1ps->pages[c]);
			page = ios->r4w->get_page(ios->private, offset,
						  &uptodate);
			if (unlikely(!page))
				return -ENOMEM;

			_1ps->pages[c] = page;
			/* Mark read-pages to be cache_released */
			_1ps->page_is_read[c] = true;
			if (!uptodate)
				_add_to_read_4_write(ios, &read_si, page);
		}

		offset += PAGE_SIZE;
		if (p == (sp2d->pages_in_unit - 1)) {
			/* Wrapped to the next device's unit */
			++c;
			p = 0;
			ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
		} else {
			read_si.obj_offset += PAGE_SIZE;
			++p;
		}
	}

read_it:
	/* Phase 3: actually execute the reads queued above (if any) */
	ios_read = ios->ios_read_4_write;
	if (!ios_read)
		return 0;

	/* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change
	 * to check for per_dev->bio
	 */
	ios_read->pages = ios->pages;

	/* Now read these devices */
	for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) {
		ret = _ore_read_mirror(ios_read, i);
		if (unlikely(ret))
			return ret;
	}

	ret = ore_io_execute(ios_read); /* Synchronous execution */
	if (unlikely(ret)) {
		ORE_DBGMSG("!! ore_io_execute => %d\n", ret);
		return ret;
	}

	_mark_read4write_pages_uptodate(ios_read, ret);
	return 0;
}
546
547/* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */
548int _ore_add_parity_unit(struct ore_io_state *ios,
549 struct ore_striping_info *si,
550 struct ore_per_dev_state *per_dev,
551 unsigned cur_len)
552{
553 if (ios->reading) {
554 BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev);
555 _ore_add_sg_seg(per_dev, cur_len, true);
556 } else {
557 struct __stripe_pages_2d *sp2d = ios->sp2d;
558 struct page **pages = ios->parity_pages + ios->cur_par_page;
559 unsigned num_pages;
560 unsigned array_start = 0;
561 unsigned i;
562 int ret;
563
564 si->cur_pg = _sp2d_min_pg(sp2d);
565 num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg;
566
567 if (!cur_len) /* If last stripe operate on parity comp */
568 si->cur_comp = sp2d->data_devs;
569
570 if (!per_dev->length) {
571 per_dev->offset += si->cur_pg * PAGE_SIZE;
572 /* If first stripe, Read in all read4write pages
573 * (if needed) before we calculate the first parity.
574 */
575 _read_4_write(ios);
576 }
577
578 for (i = 0; i < num_pages; i++) {
579 pages[i] = _raid_page_alloc();
580 if (unlikely(!pages[i]))
581 return -ENOMEM;
582
583 ++(ios->cur_par_page);
584 }
585
586 BUG_ON(si->cur_comp != sp2d->data_devs);
587 BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit);
588
589 ret = _ore_add_stripe_unit(ios, &array_start, 0, pages,
590 per_dev, num_pages * PAGE_SIZE);
591 if (unlikely(ret))
592 return ret;
593
594 /* TODO: raid6 if (last_parity_dev) */
595 _gen_xor_unit(sp2d);
596 _sp2d_reset(sp2d, ios->r4w, ios->private);
597 }
598 return 0;
599}
600
/* Called after the io_state's pages are set up. For raid writes (when
 * parity_pages were allocated) build the stripe_pages_2d helper and trim
 * a multi-stripe IO so it ends on a stripe boundary; the remainder is
 * left to the next IO.
 */
int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
{
	struct ore_layout *layout = ios->layout;

	if (ios->parity_pages) {
		unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
		unsigned stripe_size = ios->si.bytes_in_stripe;
		u64 last_stripe, first_stripe;

		if (_sp2d_alloc(pages_in_unit, layout->group_width,
				layout->parity, &ios->sp2d)) {
			return -ENOMEM;
		}

		BUG_ON(ios->offset % PAGE_SIZE);

		/* Round io down to last full stripe */
		first_stripe = div_u64(ios->offset, stripe_size);
		last_stripe = div_u64(ios->offset + ios->length, stripe_size);

		/* If an IO spans more than a single stripe it must end at
		 * a stripe boundary. The remainder at the end is pushed into
		 * the next IO.
		 */
		if (last_stripe != first_stripe) {
			ios->length = last_stripe * stripe_size - ios->offset;

			BUG_ON(!ios->length);
			ios->nr_pages = (ios->length + PAGE_SIZE - 1) /
					PAGE_SIZE;
			ios->si.length = ios->length; /*make it consistent */
		}
	}
	return 0;
}
636
637void _ore_free_raid_stuff(struct ore_io_state *ios)
638{
639 if (ios->sp2d) { /* writing and raid */
640 unsigned i;
641
642 for (i = 0; i < ios->cur_par_page; i++) {
643 struct page *page = ios->parity_pages[i];
644
645 if (page)
646 _raid_page_free(page);
647 }
648 if (ios->extra_part_alloc)
649 kfree(ios->parity_pages);
650 /* If IO returned an error pages might need unlocking */
651 _sp2d_reset(ios->sp2d, ios->r4w, ios->private);
652 _sp2d_free(ios->sp2d);
653 } else {
654 /* Will only be set if raid reading && sglist is big */
655 if (ios->extra_part_alloc)
656 kfree(ios->per_dev[0].sglist);
657 }
658 if (ios->ios_read_4_write)
659 ore_put_io_state(ios->ios_read_4_write);
660}
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
new file mode 100644
index 00000000000..2ffd2c3c6e4
--- /dev/null
+++ b/fs/exofs/ore_raid.h
@@ -0,0 +1,79 @@
1/*
2 * Copyright (C) from 2011
3 * Boaz Harrosh <bharrosh@panasas.com>
4 *
5 * This file is part of the objects raid engine (ore).
6 *
7 * It is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as published
9 * by the Free Software Foundation.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with "ore". If not, write to the Free Software Foundation, Inc:
13 * "Free Software Foundation <info@fsf.org>"
14 */
15
16#include <scsi/osd_ore.h>
17
18#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
19
20#ifdef CONFIG_EXOFS_DEBUG
21#define ORE_DBGMSG(fmt, a...) \
22 printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
23#else
24#define ORE_DBGMSG(fmt, a...) \
25 do { if (0) printk(fmt, ##a); } while (0)
26#endif
27
28/* u64 has problems with printk this will cast it to unsigned long long */
29#define _LLU(x) (unsigned long long)(x)
30
31#define ORE_DBGMSG2(M...) do {} while (0)
32/* #define ORE_DBGMSG2 ORE_DBGMSG */
33
34/* Calculate the component order in a stripe. eg the logical data unit
35 * address within the stripe of @dev given the @par_dev of this stripe.
36 */
/* Calculate the component order in a stripe. eg the logical data unit
 * address within the stripe of @dev given the @par_dev of this stripe.
 */
static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1,
				  unsigned par_dev, unsigned dev)
{
	unsigned group_start = dev - (dev % devs_in_group);
	unsigned rel_dev = dev - group_start;
	unsigned rel_par = par_dev - group_start;

	/* raid0: parity index points one past the group, no shift needed */
	if (rel_par == devs_in_group)
		return rel_dev / mirrors_p1;

	/* raid4/5/6: data units start right after the parity device */
	return ((devs_in_group + rel_dev - rel_par - mirrors_p1) %
		devs_in_group) / mirrors_p1;
}
51
52/* ios_raid.c stuff needed by ios.c */
53int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);
54void _ore_free_raid_stuff(struct ore_io_state *ios);
55
56void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
57 bool not_last);
58int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
59 struct ore_per_dev_state *per_dev, unsigned cur_len);
60void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
61 struct ore_striping_info *si, struct page *page);
62static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
63 struct ore_striping_info *si, struct page *page)
64{
65 if (!sp2d) /* Inline the fast path */
66 return; /* Hay no raid stuff */
67 _ore_add_stripe_page(sp2d, si, page);
68}
69
70/* ios.c stuff needed by ios_raid.c */
71int _ore_get_io_state(struct ore_layout *layout,
72 struct ore_components *oc, unsigned numdevs,
73 unsigned sgs_per_dev, unsigned num_par_pages,
74 struct ore_io_state **pios);
75int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
76 unsigned pgbase, struct page **pages,
77 struct ore_per_dev_state *per_dev, int cur_len);
78int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp);
79int ore_io_execute(struct ore_io_state *ios);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 274894053b0..e6085ec192d 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -35,6 +35,7 @@
35#include <linux/parser.h> 35#include <linux/parser.h>
36#include <linux/vfs.h> 36#include <linux/vfs.h>
37#include <linux/random.h> 37#include <linux/random.h>
38#include <linux/module.h>
38#include <linux/exportfs.h> 39#include <linux/exportfs.h>
39#include <linux/slab.h> 40#include <linux/slab.h>
40 41
@@ -266,7 +267,7 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi)
266 struct ore_io_state *ios; 267 struct ore_io_state *ios;
267 int ret; 268 int ret;
268 269
269 ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); 270 ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
270 if (unlikely(ret)) { 271 if (unlikely(ret)) {
271 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); 272 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
272 return ret; 273 return ret;
@@ -321,7 +322,7 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
321 struct ore_io_state *ios; 322 struct ore_io_state *ios;
322 int ret; 323 int ret;
323 324
324 ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); 325 ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
325 if (unlikely(ret)) { 326 if (unlikely(ret)) {
326 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); 327 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
327 return ret; 328 return ret;
@@ -355,12 +356,12 @@ static const struct export_operations exofs_export_ops;
355/* 356/*
356 * Write the superblock to the OSD 357 * Write the superblock to the OSD
357 */ 358 */
358int exofs_sync_fs(struct super_block *sb, int wait) 359static int exofs_sync_fs(struct super_block *sb, int wait)
359{ 360{
360 struct exofs_sb_info *sbi; 361 struct exofs_sb_info *sbi;
361 struct exofs_fscb *fscb; 362 struct exofs_fscb *fscb;
362 struct ore_comp one_comp; 363 struct ore_comp one_comp;
363 struct ore_components comps; 364 struct ore_components oc;
364 struct ore_io_state *ios; 365 struct ore_io_state *ios;
365 int ret = -ENOMEM; 366 int ret = -ENOMEM;
366 367
@@ -378,9 +379,9 @@ int exofs_sync_fs(struct super_block *sb, int wait)
378 * the writeable info is set in exofs_sbi_write_stats() above. 379 * the writeable info is set in exofs_sbi_write_stats() above.
379 */ 380 */
380 381
381 exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID); 382 exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID);
382 383
383 ret = ore_get_io_state(&sbi->layout, &comps, &ios); 384 ret = ore_get_io_state(&sbi->layout, &oc, &ios);
384 if (unlikely(ret)) 385 if (unlikely(ret))
385 goto out; 386 goto out;
386 387
@@ -429,19 +430,20 @@ static void _exofs_print_device(const char *msg, const char *dev_path,
429 msg, dev_path ?: "", odi->osdname, _LLU(pid)); 430 msg, dev_path ?: "", odi->osdname, _LLU(pid));
430} 431}
431 432
432void exofs_free_sbi(struct exofs_sb_info *sbi) 433static void exofs_free_sbi(struct exofs_sb_info *sbi)
433{ 434{
434 while (sbi->comps.numdevs) { 435 unsigned numdevs = sbi->oc.numdevs;
435 int i = --sbi->comps.numdevs; 436
436 struct osd_dev *od = sbi->comps.ods[i]; 437 while (numdevs) {
438 unsigned i = --numdevs;
439 struct osd_dev *od = ore_comp_dev(&sbi->oc, i);
437 440
438 if (od) { 441 if (od) {
439 sbi->comps.ods[i] = NULL; 442 ore_comp_set_dev(&sbi->oc, i, NULL);
440 osduld_put_device(od); 443 osduld_put_device(od);
441 } 444 }
442 } 445 }
443 if (sbi->comps.ods != sbi->_min_one_dev) 446 kfree(sbi->oc.ods);
444 kfree(sbi->comps.ods);
445 kfree(sbi); 447 kfree(sbi);
446} 448}
447 449
@@ -468,7 +470,7 @@ static void exofs_put_super(struct super_block *sb)
468 msecs_to_jiffies(100)); 470 msecs_to_jiffies(100));
469 } 471 }
470 472
471 _exofs_print_device("Unmounting", NULL, sbi->comps.ods[0], 473 _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0),
472 sbi->one_comp.obj.partition); 474 sbi->one_comp.obj.partition);
473 475
474 bdi_destroy(&sbi->bdi); 476 bdi_destroy(&sbi->bdi);
@@ -479,76 +481,20 @@ static void exofs_put_super(struct super_block *sb)
479static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, 481static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
480 struct exofs_device_table *dt) 482 struct exofs_device_table *dt)
481{ 483{
482 u64 stripe_length; 484 int ret;
483 485
484 sbi->data_map.odm_num_comps = 486 sbi->layout.stripe_unit =
485 le32_to_cpu(dt->dt_data_map.cb_num_comps);
486 sbi->data_map.odm_stripe_unit =
487 le64_to_cpu(dt->dt_data_map.cb_stripe_unit); 487 le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
488 sbi->data_map.odm_group_width = 488 sbi->layout.group_width =
489 le32_to_cpu(dt->dt_data_map.cb_group_width); 489 le32_to_cpu(dt->dt_data_map.cb_group_width);
490 sbi->data_map.odm_group_depth = 490 sbi->layout.group_depth =
491 le32_to_cpu(dt->dt_data_map.cb_group_depth); 491 le32_to_cpu(dt->dt_data_map.cb_group_depth);
492 sbi->data_map.odm_mirror_cnt = 492 sbi->layout.mirrors_p1 =
493 le32_to_cpu(dt->dt_data_map.cb_mirror_cnt); 493 le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1;
494 sbi->data_map.odm_raid_algorithm = 494 sbi->layout.raid_algorithm =
495 le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); 495 le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
496 496
497/* FIXME: Only raid0 for now. if not so, do not mount */ 497 ret = ore_verify_layout(numdevs, &sbi->layout);
498 if (sbi->data_map.odm_num_comps != numdevs) {
499 EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n",
500 sbi->data_map.odm_num_comps, numdevs);
501 return -EINVAL;
502 }
503 if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) {
504 EXOFS_ERR("Only RAID_0 for now\n");
505 return -EINVAL;
506 }
507 if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) {
508 EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n",
509 numdevs, sbi->data_map.odm_mirror_cnt);
510 return -EINVAL;
511 }
512
513 if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) {
514 EXOFS_ERR("Stripe Unit(0x%llx)"
515 " must be Multples of PAGE_SIZE(0x%lx)\n",
516 _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE);
517 return -EINVAL;
518 }
519
520 sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit;
521 sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1;
522
523 if (sbi->data_map.odm_group_width) {
524 sbi->layout.group_width = sbi->data_map.odm_group_width;
525 sbi->layout.group_depth = sbi->data_map.odm_group_depth;
526 if (!sbi->layout.group_depth) {
527 EXOFS_ERR("group_depth == 0 && group_width != 0\n");
528 return -EINVAL;
529 }
530 sbi->layout.group_count = sbi->data_map.odm_num_comps /
531 sbi->layout.mirrors_p1 /
532 sbi->data_map.odm_group_width;
533 } else {
534 if (sbi->data_map.odm_group_depth) {
535 printk(KERN_NOTICE "Warning: group_depth ignored "
536 "group_width == 0 && group_depth == %d\n",
537 sbi->data_map.odm_group_depth);
538 sbi->data_map.odm_group_depth = 0;
539 }
540 sbi->layout.group_width = sbi->data_map.odm_num_comps /
541 sbi->layout.mirrors_p1;
542 sbi->layout.group_depth = -1;
543 sbi->layout.group_count = 1;
544 }
545
546 stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit;
547 if (stripe_length >= (1ULL << 32)) {
548 EXOFS_ERR("Total Stripe length(0x%llx)"
549 " >= 32bit is not supported\n", _LLU(stripe_length));
550 return -EINVAL;
551 }
552 498
553 EXOFS_DBGMSG("exofs: layout: " 499 EXOFS_DBGMSG("exofs: layout: "
554 "num_comps=%u stripe_unit=0x%x group_width=%u " 500 "num_comps=%u stripe_unit=0x%x group_width=%u "
@@ -558,8 +504,8 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
558 sbi->layout.group_width, 504 sbi->layout.group_width,
559 _LLU(sbi->layout.group_depth), 505 _LLU(sbi->layout.group_depth),
560 sbi->layout.mirrors_p1, 506 sbi->layout.mirrors_p1,
561 sbi->data_map.odm_raid_algorithm); 507 sbi->layout.raid_algorithm);
562 return 0; 508 return ret;
563} 509}
564 510
565static unsigned __ra_pages(struct ore_layout *layout) 511static unsigned __ra_pages(struct ore_layout *layout)
@@ -605,12 +551,40 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
605 return !(odi->systemid_len || odi->osdname_len); 551 return !(odi->systemid_len || odi->osdname_len);
606} 552}
607 553
554int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
555 struct exofs_dev **peds)
556{
557 struct __alloc_ore_devs_and_exofs_devs {
558 /* Twice bigger table: See exofs_init_comps() and comment at
559 * exofs_read_lookup_dev_table()
560 */
561 struct ore_dev *oreds[numdevs * 2 - 1];
562 struct exofs_dev eds[numdevs];
563 } *aoded;
564 struct exofs_dev *eds;
565 unsigned i;
566
567 aoded = kzalloc(sizeof(*aoded), GFP_KERNEL);
568 if (unlikely(!aoded)) {
569 EXOFS_ERR("ERROR: faild allocating Device array[%d]\n",
570 numdevs);
571 return -ENOMEM;
572 }
573
574 sbi->oc.ods = aoded->oreds;
575 *peds = eds = aoded->eds;
576 for (i = 0; i < numdevs; ++i)
577 aoded->oreds[i] = &eds[i].ored;
578 return 0;
579}
580
608static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, 581static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
609 struct osd_dev *fscb_od, 582 struct osd_dev *fscb_od,
610 unsigned table_count) 583 unsigned table_count)
611{ 584{
612 struct ore_comp comp; 585 struct ore_comp comp;
613 struct exofs_device_table *dt; 586 struct exofs_device_table *dt;
587 struct exofs_dev *eds;
614 unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + 588 unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
615 sizeof(*dt); 589 sizeof(*dt);
616 unsigned numdevs, i; 590 unsigned numdevs, i;
@@ -623,7 +597,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
623 return -ENOMEM; 597 return -ENOMEM;
624 } 598 }
625 599
626 sbi->comps.numdevs = 0; 600 sbi->oc.numdevs = 0;
627 601
628 comp.obj.partition = sbi->one_comp.obj.partition; 602 comp.obj.partition = sbi->one_comp.obj.partition;
629 comp.obj.id = EXOFS_DEVTABLE_ID; 603 comp.obj.id = EXOFS_DEVTABLE_ID;
@@ -647,20 +621,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
647 if (unlikely(ret)) 621 if (unlikely(ret))
648 goto out; 622 goto out;
649 623
650 if (likely(numdevs > 1)) { 624 ret = __alloc_dev_table(sbi, numdevs, &eds);
651 unsigned size = numdevs * sizeof(sbi->comps.ods[0]); 625 if (unlikely(ret))
652 626 goto out;
653 /* Twice bigger table: See exofs_init_comps() and below 627 /* exofs round-robins the device table view according to inode
654 * comment 628 * number. We hold a: twice bigger table hence inodes can point
655 */ 629 * to any device and have a sequential view of the table
656 sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL); 630 * starting at this device. See exofs_init_comps()
657 if (unlikely(!sbi->comps.ods)) { 631 */
658 EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", 632 memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0],
659 numdevs); 633 (numdevs - 1) * sizeof(sbi->oc.ods[0]));
660 ret = -ENOMEM;
661 goto out;
662 }
663 }
664 634
665 for (i = 0; i < numdevs; i++) { 635 for (i = 0; i < numdevs; i++) {
666 struct exofs_fscb fscb; 636 struct exofs_fscb fscb;
@@ -676,13 +646,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
676 printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", 646 printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
677 i, odi.osdname); 647 i, odi.osdname);
678 648
649 /* the exofs id is currently the table index */
650 eds[i].did = i;
651
679 /* On all devices the device table is identical. The user can 652 /* On all devices the device table is identical. The user can
680 * specify any one of the participating devices on the command 653 * specify any one of the participating devices on the command
681 * line. We always keep them in device-table order. 654 * line. We always keep them in device-table order.
682 */ 655 */
683 if (fscb_od && osduld_device_same(fscb_od, &odi)) { 656 if (fscb_od && osduld_device_same(fscb_od, &odi)) {
684 sbi->comps.ods[i] = fscb_od; 657 eds[i].ored.od = fscb_od;
685 ++sbi->comps.numdevs; 658 ++sbi->oc.numdevs;
686 fscb_od = NULL; 659 fscb_od = NULL;
687 continue; 660 continue;
688 } 661 }
@@ -695,8 +668,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
695 goto out; 668 goto out;
696 } 669 }
697 670
698 sbi->comps.ods[i] = od; 671 eds[i].ored.od = od;
699 ++sbi->comps.numdevs; 672 ++sbi->oc.numdevs;
700 673
701 /* Read the fscb of the other devices to make sure the FS 674 /* Read the fscb of the other devices to make sure the FS
702 * partition is there. 675 * partition is there.
@@ -718,21 +691,10 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
718 691
719out: 692out:
720 kfree(dt); 693 kfree(dt);
721 if (likely(!ret)) { 694 if (unlikely(fscb_od && !ret)) {
722 unsigned numdevs = sbi->comps.numdevs;
723
724 if (unlikely(fscb_od)) {
725 EXOFS_ERR("ERROR: Bad device-table container device not present\n"); 695 EXOFS_ERR("ERROR: Bad device-table container device not present\n");
726 osduld_put_device(fscb_od); 696 osduld_put_device(fscb_od);
727 return -EINVAL; 697 return -EINVAL;
728 }
729 /* exofs round-robins the device table view according to inode
730 * number. We hold a: twice bigger table hence inodes can point
731 * to any device and have a sequential view of the table
732 * starting at this device. See exofs_init_comps()
733 */
734 for (i = 0; i < numdevs - 1; ++i)
735 sbi->comps.ods[i + numdevs] = sbi->comps.ods[i];
736 } 698 }
737 return ret; 699 return ret;
738} 700}
@@ -783,10 +745,9 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
783 sbi->one_comp.obj.partition = opts->pid; 745 sbi->one_comp.obj.partition = opts->pid;
784 sbi->one_comp.obj.id = 0; 746 sbi->one_comp.obj.id = 0;
785 exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); 747 exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj);
786 sbi->comps.numdevs = 1; 748 sbi->oc.numdevs = 1;
787 sbi->comps.single_comp = EC_SINGLE_COMP; 749 sbi->oc.single_comp = EC_SINGLE_COMP;
788 sbi->comps.comps = &sbi->one_comp; 750 sbi->oc.comps = &sbi->one_comp;
789 sbi->comps.ods = sbi->_min_one_dev;
790 751
791 /* fill in some other data by hand */ 752 /* fill in some other data by hand */
792 memset(sb->s_id, 0, sizeof(sb->s_id)); 753 memset(sb->s_id, 0, sizeof(sb->s_id));
@@ -835,7 +796,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
835 if (unlikely(ret)) 796 if (unlikely(ret))
836 goto free_sbi; 797 goto free_sbi;
837 } else { 798 } else {
838 sbi->comps.ods[0] = od; 799 struct exofs_dev *eds;
800
801 ret = __alloc_dev_table(sbi, 1, &eds);
802 if (unlikely(ret))
803 goto free_sbi;
804
805 ore_comp_set_dev(&sbi->oc, 0, od);
839 } 806 }
840 807
841 __sbi_read_stats(sbi); 808 __sbi_read_stats(sbi);
@@ -875,7 +842,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
875 goto free_sbi; 842 goto free_sbi;
876 } 843 }
877 844
878 _exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0], 845 _exofs_print_device("Mounting", opts->dev_name,
846 ore_comp_dev(&sbi->oc, 0),
879 sbi->one_comp.obj.partition); 847 sbi->one_comp.obj.partition);
880 return 0; 848 return 0;
881 849
@@ -924,7 +892,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
924 uint64_t used = ULLONG_MAX; 892 uint64_t used = ULLONG_MAX;
925 int ret; 893 int ret;
926 894
927 ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); 895 ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
928 if (ret) { 896 if (ret) {
929 EXOFS_DBGMSG("ore_get_io_state failed.\n"); 897 EXOFS_DBGMSG("ore_get_io_state failed.\n");
930 return ret; 898 return ret;
@@ -981,7 +949,7 @@ static const struct super_operations exofs_sops = {
981 * EXPORT OPERATIONS 949 * EXPORT OPERATIONS
982 *****************************************************************************/ 950 *****************************************************************************/
983 951
984struct dentry *exofs_get_parent(struct dentry *child) 952static struct dentry *exofs_get_parent(struct dentry *child)
985{ 953{
986 unsigned long ino = exofs_parent_ino(child); 954 unsigned long ino = exofs_parent_ino(child);
987 955
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 8f44cef1b3e..a8cbe1bc6ad 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -421,7 +421,7 @@ static inline int rsv_is_empty(struct ext2_reserve_window *rsv)
421void ext2_init_block_alloc_info(struct inode *inode) 421void ext2_init_block_alloc_info(struct inode *inode)
422{ 422{
423 struct ext2_inode_info *ei = EXT2_I(inode); 423 struct ext2_inode_info *ei = EXT2_I(inode);
424 struct ext2_block_alloc_info *block_i = ei->i_block_alloc_info; 424 struct ext2_block_alloc_info *block_i;
425 struct super_block *sb = inode->i_sb; 425 struct super_block *sb = inode->i_sb;
426 426
427 block_i = kmalloc(sizeof(*block_i), GFP_NOFS); 427 block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index af9fc89b1b2..9a4e5e206d0 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -135,10 +135,10 @@ extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long);
135struct dentry *ext2_get_parent(struct dentry *child); 135struct dentry *ext2_get_parent(struct dentry *child);
136 136
137/* super.c */ 137/* super.c */
138extern void ext2_error (struct super_block *, const char *, const char *, ...) 138extern __printf(3, 4)
139 __attribute__ ((format (printf, 3, 4))); 139void ext2_error(struct super_block *, const char *, const char *, ...);
140extern void ext2_msg(struct super_block *, const char *, const char *, ...) 140extern __printf(3, 4)
141 __attribute__ ((format (printf, 3, 4))); 141void ext2_msg(struct super_block *, const char *, const char *, ...);
142extern void ext2_update_dynamic_rev (struct super_block *sb); 142extern void ext2_update_dynamic_rev (struct super_block *sb);
143extern void ext2_write_super (struct super_block *); 143extern void ext2_write_super (struct super_block *);
144 144
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index ee9ed31948e..c4e81dfb74b 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -601,7 +601,7 @@ fail_free_drop:
601fail_drop: 601fail_drop:
602 dquot_drop(inode); 602 dquot_drop(inode);
603 inode->i_flags |= S_NOQUOTA; 603 inode->i_flags |= S_NOQUOTA;
604 inode->i_nlink = 0; 604 clear_nlink(inode);
605 unlock_new_inode(inode); 605 unlock_new_inode(inode);
606 iput(inode); 606 iput(inode);
607 return ERR_PTR(err); 607 return ERR_PTR(err);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index a8a58f63f07..91a6945af6d 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1321,7 +1321,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
1321 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 1321 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
1322 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 1322 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
1323 } 1323 }
1324 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 1324 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
1325 inode->i_size = le32_to_cpu(raw_inode->i_size); 1325 inode->i_size = le32_to_cpu(raw_inode->i_size);
1326 inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); 1326 inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
1327 inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); 1327 inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 1dd62ed35b8..bd8ac164a3b 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -327,10 +327,10 @@ static struct inode *ext2_nfs_get_inode(struct super_block *sb,
327 if (ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count)) 327 if (ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count))
328 return ERR_PTR(-ESTALE); 328 return ERR_PTR(-ESTALE);
329 329
330 /* iget isn't really right if the inode is currently unallocated!! 330 /*
331 * ext2_read_inode currently does appropriate checks, but 331 * ext2_iget isn't quite right if the inode is currently unallocated!
332 * it might be "neater" to call ext2_get_inode first and check 332 * However ext2_iget currently does appropriate checks to handle stale
333 * if the inode is valid..... 333 * inodes so everything is OK.
334 */ 334 */
335 inode = ext2_iget(sb, ino); 335 inode = ext2_iget(sb, ino);
336 if (IS_ERR(inode)) 336 if (IS_ERR(inode))
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 6386d76f44a..a2038928f9a 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -427,7 +427,7 @@ static inline int rsv_is_empty(struct ext3_reserve_window *rsv)
427void ext3_init_block_alloc_info(struct inode *inode) 427void ext3_init_block_alloc_info(struct inode *inode)
428{ 428{
429 struct ext3_inode_info *ei = EXT3_I(inode); 429 struct ext3_inode_info *ei = EXT3_I(inode);
430 struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info; 430 struct ext3_block_alloc_info *block_i;
431 struct super_block *sb = inode->i_sb; 431 struct super_block *sb = inode->i_sb;
432 432
433 block_i = kmalloc(sizeof(*block_i), GFP_NOFS); 433 block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
@@ -1440,14 +1440,14 @@ out:
1440 * 1440 *
1441 * Check if filesystem has at least 1 free block available for allocation. 1441 * Check if filesystem has at least 1 free block available for allocation.
1442 */ 1442 */
1443static int ext3_has_free_blocks(struct ext3_sb_info *sbi) 1443static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation)
1444{ 1444{
1445 ext3_fsblk_t free_blocks, root_blocks; 1445 ext3_fsblk_t free_blocks, root_blocks;
1446 1446
1447 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 1447 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
1448 root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); 1448 root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
1449 if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && 1449 if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
1450 sbi->s_resuid != current_fsuid() && 1450 !use_reservation && sbi->s_resuid != current_fsuid() &&
1451 (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { 1451 (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
1452 return 0; 1452 return 0;
1453 } 1453 }
@@ -1468,7 +1468,7 @@ static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
1468 */ 1468 */
1469int ext3_should_retry_alloc(struct super_block *sb, int *retries) 1469int ext3_should_retry_alloc(struct super_block *sb, int *retries)
1470{ 1470{
1471 if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3) 1471 if (!ext3_has_free_blocks(EXT3_SB(sb), 0) || (*retries)++ > 3)
1472 return 0; 1472 return 0;
1473 1473
1474 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); 1474 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1546,7 +1546,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
1546 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) 1546 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
1547 my_rsv = &block_i->rsv_window_node; 1547 my_rsv = &block_i->rsv_window_node;
1548 1548
1549 if (!ext3_has_free_blocks(sbi)) { 1549 if (!ext3_has_free_blocks(sbi, IS_NOQUOTA(inode))) {
1550 *errp = -ENOSPC; 1550 *errp = -ENOSPC;
1551 goto out; 1551 goto out;
1552 } 1552 }
@@ -1924,9 +1924,10 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
1924 * reaches any used block. Then issue a TRIM command on this extent and free 1924 * reaches any used block. Then issue a TRIM command on this extent and free
1925 * the extent in the block bitmap. This is done until whole group is scanned. 1925 * the extent in the block bitmap. This is done until whole group is scanned.
1926 */ 1926 */
1927ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group, 1927static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb,
1928 ext3_grpblk_t start, ext3_grpblk_t max, 1928 unsigned int group,
1929 ext3_grpblk_t minblocks) 1929 ext3_grpblk_t start, ext3_grpblk_t max,
1930 ext3_grpblk_t minblocks)
1930{ 1931{
1931 handle_t *handle; 1932 handle_t *handle;
1932 ext3_grpblk_t next, free_blocks, bit, freed, count = 0; 1933 ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d494c554c6e..1860ed35632 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -61,13 +61,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
61 if (ret) 61 if (ret)
62 goto out; 62 goto out;
63 63
64 /*
65 * Taking the mutex here just to keep consistent with how fsync was
66 * called previously, however it looks like we don't need to take
67 * i_mutex at all.
68 */
69 mutex_lock(&inode->i_mutex);
70
71 J_ASSERT(ext3_journal_current_handle() == NULL); 64 J_ASSERT(ext3_journal_current_handle() == NULL);
72 65
73 /* 66 /*
@@ -85,7 +78,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
85 * safe in-journal, which is all fsync() needs to ensure. 78 * safe in-journal, which is all fsync() needs to ensure.
86 */ 79 */
87 if (ext3_should_journal_data(inode)) { 80 if (ext3_should_journal_data(inode)) {
88 mutex_unlock(&inode->i_mutex);
89 ret = ext3_force_commit(inode->i_sb); 81 ret = ext3_force_commit(inode->i_sb);
90 goto out; 82 goto out;
91 } 83 }
@@ -108,8 +100,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
108 */ 100 */
109 if (needs_barrier) 101 if (needs_barrier)
110 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 102 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
111
112 mutex_unlock(&inode->i_mutex);
113out: 103out:
114 trace_ext3_sync_file_exit(inode, ret); 104 trace_ext3_sync_file_exit(inode, ret);
115 return ret; 105 return ret;
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index bf09cbf938c..5c866e06e7a 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -178,42 +178,6 @@ error_return:
178} 178}
179 179
180/* 180/*
181 * There are two policies for allocating an inode. If the new inode is
182 * a directory, then a forward search is made for a block group with both
183 * free space and a low directory-to-inode ratio; if that fails, then of
184 * the groups with above-average free space, that group with the fewest
185 * directories already is chosen.
186 *
187 * For other inodes, search forward from the parent directory\'s block
188 * group to find a free inode.
189 */
190static int find_group_dir(struct super_block *sb, struct inode *parent)
191{
192 int ngroups = EXT3_SB(sb)->s_groups_count;
193 unsigned int freei, avefreei;
194 struct ext3_group_desc *desc, *best_desc = NULL;
195 int group, best_group = -1;
196
197 freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter);
198 avefreei = freei / ngroups;
199
200 for (group = 0; group < ngroups; group++) {
201 desc = ext3_get_group_desc (sb, group, NULL);
202 if (!desc || !desc->bg_free_inodes_count)
203 continue;
204 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
205 continue;
206 if (!best_desc ||
207 (le16_to_cpu(desc->bg_free_blocks_count) >
208 le16_to_cpu(best_desc->bg_free_blocks_count))) {
209 best_group = group;
210 best_desc = desc;
211 }
212 }
213 return best_group;
214}
215
216/*
217 * Orlov's allocator for directories. 181 * Orlov's allocator for directories.
218 * 182 *
219 * We always try to spread first-level directories. 183 * We always try to spread first-level directories.
@@ -436,12 +400,9 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir,
436 400
437 sbi = EXT3_SB(sb); 401 sbi = EXT3_SB(sb);
438 es = sbi->s_es; 402 es = sbi->s_es;
439 if (S_ISDIR(mode)) { 403 if (S_ISDIR(mode))
440 if (test_opt (sb, OLDALLOC)) 404 group = find_group_orlov(sb, dir);
441 group = find_group_dir(sb, dir); 405 else
442 else
443 group = find_group_orlov(sb, dir);
444 } else
445 group = find_group_other(sb, dir); 406 group = find_group_other(sb, dir);
446 407
447 err = -ENOSPC; 408 err = -ENOSPC;
@@ -621,7 +582,7 @@ fail_free_drop:
621fail_drop: 582fail_drop:
622 dquot_drop(inode); 583 dquot_drop(inode);
623 inode->i_flags |= S_NOQUOTA; 584 inode->i_flags |= S_NOQUOTA;
624 inode->i_nlink = 0; 585 clear_nlink(inode);
625 unlock_new_inode(inode); 586 unlock_new_inode(inode);
626 iput(inode); 587 iput(inode);
627 brelse(bitmap_bh); 588 brelse(bitmap_bh);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 12661e1deed..85fe655fe3e 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2899,7 +2899,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2899 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 2899 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2900 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 2900 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2901 } 2901 }
2902 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 2902 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
2903 inode->i_size = le32_to_cpu(raw_inode->i_size); 2903 inode->i_size = le32_to_cpu(raw_inode->i_size);
2904 inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); 2904 inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
2905 inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); 2905 inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index c7f43944f16..ba1b54e23ca 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -150,30 +150,6 @@ setversion_out:
150 mnt_drop_write(filp->f_path.mnt); 150 mnt_drop_write(filp->f_path.mnt);
151 return err; 151 return err;
152 } 152 }
153#ifdef CONFIG_JBD_DEBUG
154 case EXT3_IOC_WAIT_FOR_READONLY:
155 /*
156 * This is racy - by the time we're woken up and running,
157 * the superblock could be released. And the module could
158 * have been unloaded. So sue me.
159 *
160 * Returns 1 if it slept, else zero.
161 */
162 {
163 struct super_block *sb = inode->i_sb;
164 DECLARE_WAITQUEUE(wait, current);
165 int ret = 0;
166
167 set_current_state(TASK_INTERRUPTIBLE);
168 add_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait);
169 if (timer_pending(&EXT3_SB(sb)->turn_ro_timer)) {
170 schedule();
171 ret = 1;
172 }
173 remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait);
174 return ret;
175 }
176#endif
177 case EXT3_IOC_GETRSVSZ: 153 case EXT3_IOC_GETRSVSZ:
178 if (test_opt(inode->i_sb, RESERVATION) 154 if (test_opt(inode->i_sb, RESERVATION)
179 && S_ISREG(inode->i_mode) 155 && S_ISREG(inode->i_mode)
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 0629e09f651..642dc6d66df 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1821,7 +1821,7 @@ retry:
1821 de->name_len = 2; 1821 de->name_len = 2;
1822 strcpy (de->name, ".."); 1822 strcpy (de->name, "..");
1823 ext3_set_de_type(dir->i_sb, de, S_IFDIR); 1823 ext3_set_de_type(dir->i_sb, de, S_IFDIR);
1824 inode->i_nlink = 2; 1824 set_nlink(inode, 2);
1825 BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); 1825 BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
1826 err = ext3_journal_dirty_metadata(handle, dir_block); 1826 err = ext3_journal_dirty_metadata(handle, dir_block);
1827 if (err) 1827 if (err)
@@ -1833,7 +1833,7 @@ retry:
1833 1833
1834 if (err) { 1834 if (err) {
1835out_clear_inode: 1835out_clear_inode:
1836 inode->i_nlink = 0; 1836 clear_nlink(inode);
1837 unlock_new_inode(inode); 1837 unlock_new_inode(inode);
1838 ext3_mark_inode_dirty(handle, inode); 1838 ext3_mark_inode_dirty(handle, inode);
1839 iput (inode); 1839 iput (inode);
@@ -2170,7 +2170,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
2170 ext3_warning (inode->i_sb, "ext3_unlink", 2170 ext3_warning (inode->i_sb, "ext3_unlink",
2171 "Deleting nonexistent file (%lu), %d", 2171 "Deleting nonexistent file (%lu), %d",
2172 inode->i_ino, inode->i_nlink); 2172 inode->i_ino, inode->i_nlink);
2173 inode->i_nlink = 1; 2173 set_nlink(inode, 1);
2174 } 2174 }
2175 retval = ext3_delete_entry(handle, dir, de, bh); 2175 retval = ext3_delete_entry(handle, dir, de, bh);
2176 if (retval) 2176 if (retval)
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 7beb69ae001..922d289aeeb 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -652,8 +652,6 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
652 seq_puts(seq, ",nouid32"); 652 seq_puts(seq, ",nouid32");
653 if (test_opt(sb, DEBUG)) 653 if (test_opt(sb, DEBUG))
654 seq_puts(seq, ",debug"); 654 seq_puts(seq, ",debug");
655 if (test_opt(sb, OLDALLOC))
656 seq_puts(seq, ",oldalloc");
657#ifdef CONFIG_EXT3_FS_XATTR 655#ifdef CONFIG_EXT3_FS_XATTR
658 if (test_opt(sb, XATTR_USER)) 656 if (test_opt(sb, XATTR_USER))
659 seq_puts(seq, ",user_xattr"); 657 seq_puts(seq, ",user_xattr");
@@ -1049,10 +1047,12 @@ static int parse_options (char *options, struct super_block *sb,
1049 set_opt (sbi->s_mount_opt, DEBUG); 1047 set_opt (sbi->s_mount_opt, DEBUG);
1050 break; 1048 break;
1051 case Opt_oldalloc: 1049 case Opt_oldalloc:
1052 set_opt (sbi->s_mount_opt, OLDALLOC); 1050 ext3_msg(sb, KERN_WARNING,
1051 "Ignoring deprecated oldalloc option");
1053 break; 1052 break;
1054 case Opt_orlov: 1053 case Opt_orlov:
1055 clear_opt (sbi->s_mount_opt, OLDALLOC); 1054 ext3_msg(sb, KERN_WARNING,
1055 "Ignoring deprecated orlov option");
1056 break; 1056 break;
1057#ifdef CONFIG_EXT3_FS_XATTR 1057#ifdef CONFIG_EXT3_FS_XATTR
1058 case Opt_user_xattr: 1058 case Opt_user_xattr:
@@ -2669,13 +2669,13 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2669 /* 2669 /*
2670 * If we have an unprocessed orphan list hanging 2670 * If we have an unprocessed orphan list hanging
2671 * around from a previously readonly bdev mount, 2671 * around from a previously readonly bdev mount,
2672 * require a full umount/remount for now. 2672 * require a full umount & mount for now.
2673 */ 2673 */
2674 if (es->s_last_orphan) { 2674 if (es->s_last_orphan) {
2675 ext3_msg(sb, KERN_WARNING, "warning: couldn't " 2675 ext3_msg(sb, KERN_WARNING, "warning: couldn't "
2676 "remount RDWR because of unprocessed " 2676 "remount RDWR because of unprocessed "
2677 "orphan inode list. Please " 2677 "orphan inode list. Please "
2678 "umount/remount instead."); 2678 "umount & mount instead.");
2679 err = -EINVAL; 2679 err = -EINVAL;
2680 goto restore_opts; 2680 goto restore_opts;
2681 } 2681 }
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index f8224adf496..f6dba4505f1 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -28,7 +28,8 @@
28 */ 28 */
29 29
30/* 30/*
31 * Calculate the block group number and offset, given a block number 31 * Calculate the block group number and offset into the block/cluster
32 * allocation bitmap, given a block number
32 */ 33 */
33void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, 34void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
34 ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) 35 ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
@@ -37,7 +38,8 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
37 ext4_grpblk_t offset; 38 ext4_grpblk_t offset;
38 39
39 blocknr = blocknr - le32_to_cpu(es->s_first_data_block); 40 blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
40 offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)); 41 offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >>
42 EXT4_SB(sb)->s_cluster_bits;
41 if (offsetp) 43 if (offsetp)
42 *offsetp = offset; 44 *offsetp = offset;
43 if (blockgrpp) 45 if (blockgrpp)
@@ -55,130 +57,169 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
55 return 0; 57 return 0;
56} 58}
57 59
58static int ext4_group_used_meta_blocks(struct super_block *sb, 60/* Return the number of clusters used for file system metadata; this
59 ext4_group_t block_group, 61 * represents the overhead needed by the file system.
60 struct ext4_group_desc *gdp) 62 */
63unsigned ext4_num_overhead_clusters(struct super_block *sb,
64 ext4_group_t block_group,
65 struct ext4_group_desc *gdp)
61{ 66{
62 ext4_fsblk_t tmp; 67 unsigned num_clusters;
68 int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c;
69 ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group);
70 ext4_fsblk_t itbl_blk;
63 struct ext4_sb_info *sbi = EXT4_SB(sb); 71 struct ext4_sb_info *sbi = EXT4_SB(sb);
64 /* block bitmap, inode bitmap, and inode table blocks */
65 int used_blocks = sbi->s_itb_per_group + 2;
66 72
67 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { 73 /* This is the number of clusters used by the superblock,
68 if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), 74 * block group descriptors, and reserved block group
69 block_group)) 75 * descriptor blocks */
70 used_blocks--; 76 num_clusters = ext4_num_base_meta_clusters(sb, block_group);
71 77
72 if (!ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), 78 /*
73 block_group)) 79 * For the allocation bitmaps and inode table, we first need
74 used_blocks--; 80 * to check to see if the block is in the block group. If it
75 81 * is, then check to see if the cluster is already accounted
76 tmp = ext4_inode_table(sb, gdp); 82 * for in the clusters used for the base metadata cluster, or
77 for (; tmp < ext4_inode_table(sb, gdp) + 83 * if we can increment the base metadata cluster to include
78 sbi->s_itb_per_group; tmp++) { 84 * that block. Otherwise, we will have to track the cluster
79 if (!ext4_block_in_group(sb, tmp, block_group)) 85 * used for the allocation bitmap or inode table explicitly.
80 used_blocks -= 1; 86 * Normally all of these blocks are contiguous, so the special
87 * case handling shouldn't be necessary except for *very*
88 * unusual file system layouts.
89 */
90 if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) {
91 block_cluster = EXT4_B2C(sbi, (start -
92 ext4_block_bitmap(sb, gdp)));
93 if (block_cluster < num_clusters)
94 block_cluster = -1;
95 else if (block_cluster == num_clusters) {
96 num_clusters++;
97 block_cluster = -1;
81 } 98 }
82 } 99 }
83 return used_blocks;
84}
85 100
86/* Initializes an uninitialized block bitmap if given, and returns the 101 if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) {
87 * number of blocks free in the group. */ 102 inode_cluster = EXT4_B2C(sbi,
88unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, 103 start - ext4_inode_bitmap(sb, gdp));
89 ext4_group_t block_group, struct ext4_group_desc *gdp) 104 if (inode_cluster < num_clusters)
90{ 105 inode_cluster = -1;
91 int bit, bit_max; 106 else if (inode_cluster == num_clusters) {
92 ext4_group_t ngroups = ext4_get_groups_count(sb); 107 num_clusters++;
93 unsigned free_blocks, group_blocks; 108 inode_cluster = -1;
94 struct ext4_sb_info *sbi = EXT4_SB(sb);
95
96 if (bh) {
97 J_ASSERT_BH(bh, buffer_locked(bh));
98
99 /* If checksum is bad mark all blocks used to prevent allocation
100 * essentially implementing a per-group read-only flag. */
101 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
102 ext4_error(sb, "Checksum bad for group %u",
103 block_group);
104 ext4_free_blks_set(sb, gdp, 0);
105 ext4_free_inodes_set(sb, gdp, 0);
106 ext4_itable_unused_set(sb, gdp, 0);
107 memset(bh->b_data, 0xff, sb->s_blocksize);
108 return 0;
109 } 109 }
110 memset(bh->b_data, 0, sb->s_blocksize);
111 } 110 }
112 111
113 /* Check for superblock and gdt backups in this group */ 112 itbl_blk = ext4_inode_table(sb, gdp);
114 bit_max = ext4_bg_has_super(sb, block_group); 113 for (i = 0; i < sbi->s_itb_per_group; i++) {
115 114 if (ext4_block_in_group(sb, itbl_blk + i, block_group)) {
116 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || 115 c = EXT4_B2C(sbi, start - itbl_blk + i);
117 block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * 116 if ((c < num_clusters) || (c == inode_cluster) ||
118 sbi->s_desc_per_block) { 117 (c == block_cluster) || (c == itbl_cluster))
119 if (bit_max) { 118 continue;
120 bit_max += ext4_bg_num_gdb(sb, block_group); 119 if (c == num_clusters) {
121 bit_max += 120 num_clusters++;
122 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); 121 continue;
122 }
123 num_clusters++;
124 itbl_cluster = c;
123 } 125 }
124 } else { /* For META_BG_BLOCK_GROUPS */
125 bit_max += ext4_bg_num_gdb(sb, block_group);
126 } 126 }
127 127
128 if (block_group == ngroups - 1) { 128 if (block_cluster != -1)
129 num_clusters++;
130 if (inode_cluster != -1)
131 num_clusters++;
132
133 return num_clusters;
134}
135
136static unsigned int num_clusters_in_group(struct super_block *sb,
137 ext4_group_t block_group)
138{
139 unsigned int blocks;
140
141 if (block_group == ext4_get_groups_count(sb) - 1) {
129 /* 142 /*
130 * Even though mke2fs always initialize first and last group 143 * Even though mke2fs always initializes the first and
131 * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need 144 * last group, just in case some other tool was used,
132 * to make sure we calculate the right free blocks 145 * we need to make sure we calculate the right free
146 * blocks.
133 */ 147 */
134 group_blocks = ext4_blocks_count(sbi->s_es) - 148 blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) -
135 ext4_group_first_block_no(sb, ngroups - 1); 149 ext4_group_first_block_no(sb, block_group);
136 } else { 150 } else
137 group_blocks = EXT4_BLOCKS_PER_GROUP(sb); 151 blocks = EXT4_BLOCKS_PER_GROUP(sb);
138 } 152 return EXT4_NUM_B2C(EXT4_SB(sb), blocks);
153}
139 154
140 free_blocks = group_blocks - bit_max; 155/* Initializes an uninitialized block bitmap */
156void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
157 ext4_group_t block_group,
158 struct ext4_group_desc *gdp)
159{
160 unsigned int bit, bit_max;
161 struct ext4_sb_info *sbi = EXT4_SB(sb);
162 ext4_fsblk_t start, tmp;
163 int flex_bg = 0;
164
165 J_ASSERT_BH(bh, buffer_locked(bh));
166
167 /* If checksum is bad mark all blocks used to prevent allocation
168 * essentially implementing a per-group read-only flag. */
169 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
170 ext4_error(sb, "Checksum bad for group %u", block_group);
171 ext4_free_group_clusters_set(sb, gdp, 0);
172 ext4_free_inodes_set(sb, gdp, 0);
173 ext4_itable_unused_set(sb, gdp, 0);
174 memset(bh->b_data, 0xff, sb->s_blocksize);
175 return;
176 }
177 memset(bh->b_data, 0, sb->s_blocksize);
141 178
142 if (bh) { 179 bit_max = ext4_num_base_meta_clusters(sb, block_group);
143 ext4_fsblk_t start, tmp; 180 for (bit = 0; bit < bit_max; bit++)
144 int flex_bg = 0; 181 ext4_set_bit(bit, bh->b_data);
145 182
146 for (bit = 0; bit < bit_max; bit++) 183 start = ext4_group_first_block_no(sb, block_group);
147 ext4_set_bit(bit, bh->b_data);
148 184
149 start = ext4_group_first_block_no(sb, block_group); 185 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
186 flex_bg = 1;
150 187
151 if (EXT4_HAS_INCOMPAT_FEATURE(sb, 188 /* Set bits for block and inode bitmaps, and inode table */
152 EXT4_FEATURE_INCOMPAT_FLEX_BG)) 189 tmp = ext4_block_bitmap(sb, gdp);
153 flex_bg = 1; 190 if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
191 ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
154 192
155 /* Set bits for block and inode bitmaps, and inode table */ 193 tmp = ext4_inode_bitmap(sb, gdp);
156 tmp = ext4_block_bitmap(sb, gdp); 194 if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
157 if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) 195 ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
158 ext4_set_bit(tmp - start, bh->b_data);
159 196
160 tmp = ext4_inode_bitmap(sb, gdp); 197 tmp = ext4_inode_table(sb, gdp);
198 for (; tmp < ext4_inode_table(sb, gdp) +
199 sbi->s_itb_per_group; tmp++) {
161 if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) 200 if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
162 ext4_set_bit(tmp - start, bh->b_data); 201 ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
163
164 tmp = ext4_inode_table(sb, gdp);
165 for (; tmp < ext4_inode_table(sb, gdp) +
166 sbi->s_itb_per_group; tmp++) {
167 if (!flex_bg ||
168 ext4_block_in_group(sb, tmp, block_group))
169 ext4_set_bit(tmp - start, bh->b_data);
170 }
171 /*
172 * Also if the number of blocks within the group is
173 * less than the blocksize * 8 ( which is the size
174 * of bitmap ), set rest of the block bitmap to 1
175 */
176 ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8,
177 bh->b_data);
178 } 202 }
179 return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp); 203
204 /*
205 * Also if the number of blocks within the group is less than
206 * the blocksize * 8 ( which is the size of bitmap ), set rest
207 * of the block bitmap to 1
208 */
209 ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group),
210 sb->s_blocksize * 8, bh->b_data);
180} 211}
181 212
213/* Return the number of free blocks in a block group. It is used when
214 * the block bitmap is uninitialized, so we can't just count the bits
215 * in the bitmap. */
216unsigned ext4_free_clusters_after_init(struct super_block *sb,
217 ext4_group_t block_group,
218 struct ext4_group_desc *gdp)
219{
220 return num_clusters_in_group(sb, block_group) -
221 ext4_num_overhead_clusters(sb, block_group, gdp);
222}
182 223
183/* 224/*
184 * The free blocks are managed by bitmaps. A file system contains several 225 * The free blocks are managed by bitmaps. A file system contains several
@@ -362,53 +403,54 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
362} 403}
363 404
364/** 405/**
365 * ext4_has_free_blocks() 406 * ext4_has_free_clusters()
366 * @sbi: in-core super block structure. 407 * @sbi: in-core super block structure.
367 * @nblocks: number of needed blocks 408 * @nclusters: number of needed blocks
409 * @flags: flags from ext4_mb_new_blocks()
368 * 410 *
369 * Check if filesystem has nblocks free & available for allocation. 411 * Check if filesystem has nclusters free & available for allocation.
370 * On success return 1, return 0 on failure. 412 * On success return 1, return 0 on failure.
371 */ 413 */
372static int ext4_has_free_blocks(struct ext4_sb_info *sbi, 414static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
373 s64 nblocks, unsigned int flags) 415 s64 nclusters, unsigned int flags)
374{ 416{
375 s64 free_blocks, dirty_blocks, root_blocks; 417 s64 free_clusters, dirty_clusters, root_clusters;
376 struct percpu_counter *fbc = &sbi->s_freeblocks_counter; 418 struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
377 struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter; 419 struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
378 420
379 free_blocks = percpu_counter_read_positive(fbc); 421 free_clusters = percpu_counter_read_positive(fcc);
380 dirty_blocks = percpu_counter_read_positive(dbc); 422 dirty_clusters = percpu_counter_read_positive(dcc);
381 root_blocks = ext4_r_blocks_count(sbi->s_es); 423 root_clusters = EXT4_B2C(sbi, ext4_r_blocks_count(sbi->s_es));
382 424
383 if (free_blocks - (nblocks + root_blocks + dirty_blocks) < 425 if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
384 EXT4_FREEBLOCKS_WATERMARK) { 426 EXT4_FREECLUSTERS_WATERMARK) {
385 free_blocks = percpu_counter_sum_positive(fbc); 427 free_clusters = EXT4_C2B(sbi, percpu_counter_sum_positive(fcc));
386 dirty_blocks = percpu_counter_sum_positive(dbc); 428 dirty_clusters = percpu_counter_sum_positive(dcc);
387 } 429 }
388 /* Check whether we have space after 430 /* Check whether we have space after accounting for current
389 * accounting for current dirty blocks & root reserved blocks. 431 * dirty clusters & root reserved clusters.
390 */ 432 */
391 if (free_blocks >= ((root_blocks + nblocks) + dirty_blocks)) 433 if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters))
392 return 1; 434 return 1;
393 435
394 /* Hm, nope. Are (enough) root reserved blocks available? */ 436 /* Hm, nope. Are (enough) root reserved clusters available? */
395 if (sbi->s_resuid == current_fsuid() || 437 if (sbi->s_resuid == current_fsuid() ||
396 ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || 438 ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
397 capable(CAP_SYS_RESOURCE) || 439 capable(CAP_SYS_RESOURCE) ||
398 (flags & EXT4_MB_USE_ROOT_BLOCKS)) { 440 (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
399 441
400 if (free_blocks >= (nblocks + dirty_blocks)) 442 if (free_clusters >= (nclusters + dirty_clusters))
401 return 1; 443 return 1;
402 } 444 }
403 445
404 return 0; 446 return 0;
405} 447}
406 448
407int ext4_claim_free_blocks(struct ext4_sb_info *sbi, 449int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
408 s64 nblocks, unsigned int flags) 450 s64 nclusters, unsigned int flags)
409{ 451{
410 if (ext4_has_free_blocks(sbi, nblocks, flags)) { 452 if (ext4_has_free_clusters(sbi, nclusters, flags)) {
411 percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks); 453 percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters);
412 return 0; 454 return 0;
413 } else 455 } else
414 return -ENOSPC; 456 return -ENOSPC;
@@ -428,7 +470,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
428 */ 470 */
429int ext4_should_retry_alloc(struct super_block *sb, int *retries) 471int ext4_should_retry_alloc(struct super_block *sb, int *retries)
430{ 472{
431 if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) || 473 if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
432 (*retries)++ > 3 || 474 (*retries)++ > 3 ||
433 !EXT4_SB(sb)->s_journal) 475 !EXT4_SB(sb)->s_journal)
434 return 0; 476 return 0;
@@ -444,7 +486,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
444 * @handle: handle to this transaction 486 * @handle: handle to this transaction
445 * @inode: file inode 487 * @inode: file inode
446 * @goal: given target block(filesystem wide) 488 * @goal: given target block(filesystem wide)
447 * @count: pointer to total number of blocks needed 489 * @count: pointer to total number of clusters needed
448 * @errp: error code 490 * @errp: error code
449 * 491 *
450 * Return 1st allocated block number on success, *count stores total account 492 * Return 1st allocated block number on success, *count stores total account
@@ -476,18 +518,19 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
476 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 518 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
477 EXT4_I(inode)->i_allocated_meta_blocks += ar.len; 519 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
478 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 520 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
479 dquot_alloc_block_nofail(inode, ar.len); 521 dquot_alloc_block_nofail(inode,
522 EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
480 } 523 }
481 return ret; 524 return ret;
482} 525}
483 526
484/** 527/**
485 * ext4_count_free_blocks() -- count filesystem free blocks 528 * ext4_count_free_clusters() -- count filesystem free clusters
486 * @sb: superblock 529 * @sb: superblock
487 * 530 *
488 * Adds up the number of free blocks from each block group. 531 * Adds up the number of free clusters from each block group.
489 */ 532 */
490ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) 533ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
491{ 534{
492 ext4_fsblk_t desc_count; 535 ext4_fsblk_t desc_count;
493 struct ext4_group_desc *gdp; 536 struct ext4_group_desc *gdp;
@@ -508,7 +551,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
508 gdp = ext4_get_group_desc(sb, i, NULL); 551 gdp = ext4_get_group_desc(sb, i, NULL);
509 if (!gdp) 552 if (!gdp)
510 continue; 553 continue;
511 desc_count += ext4_free_blks_count(sb, gdp); 554 desc_count += ext4_free_group_clusters(sb, gdp);
512 brelse(bitmap_bh); 555 brelse(bitmap_bh);
513 bitmap_bh = ext4_read_block_bitmap(sb, i); 556 bitmap_bh = ext4_read_block_bitmap(sb, i);
514 if (bitmap_bh == NULL) 557 if (bitmap_bh == NULL)
@@ -516,12 +559,13 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
516 559
517 x = ext4_count_free(bitmap_bh, sb->s_blocksize); 560 x = ext4_count_free(bitmap_bh, sb->s_blocksize);
518 printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", 561 printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
519 i, ext4_free_blks_count(sb, gdp), x); 562 i, ext4_free_group_clusters(sb, gdp), x);
520 bitmap_count += x; 563 bitmap_count += x;
521 } 564 }
522 brelse(bitmap_bh); 565 brelse(bitmap_bh);
523 printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu" 566 printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
524 ", computed = %llu, %llu\n", ext4_free_blocks_count(es), 567 ", computed = %llu, %llu\n",
568 EXT4_B2C(sbi, ext4_free_blocks_count(es)),
525 desc_count, bitmap_count); 569 desc_count, bitmap_count);
526 return bitmap_count; 570 return bitmap_count;
527#else 571#else
@@ -530,7 +574,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
530 gdp = ext4_get_group_desc(sb, i, NULL); 574 gdp = ext4_get_group_desc(sb, i, NULL);
531 if (!gdp) 575 if (!gdp)
532 continue; 576 continue;
533 desc_count += ext4_free_blks_count(sb, gdp); 577 desc_count += ext4_free_group_clusters(sb, gdp);
534 } 578 }
535 579
536 return desc_count; 580 return desc_count;
@@ -620,6 +664,31 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
620 664
621} 665}
622 666
667/*
668 * This function returns the number of file system metadata clusters at
669 * the beginning of a block group, including the reserved gdt blocks.
670 */
671unsigned ext4_num_base_meta_clusters(struct super_block *sb,
672 ext4_group_t block_group)
673{
674 struct ext4_sb_info *sbi = EXT4_SB(sb);
675 unsigned num;
676
677 /* Check for superblock and gdt backups in this group */
678 num = ext4_bg_has_super(sb, block_group);
679
680 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
681 block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
682 sbi->s_desc_per_block) {
683 if (num) {
684 num += ext4_bg_num_gdb(sb, block_group);
685 num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
686 }
687 } else { /* For META_BG_BLOCK_GROUPS */
688 num += ext4_bg_num_gdb(sb, block_group);
689 }
690 return EXT4_NUM_B2C(sbi, num);
691}
623/** 692/**
624 * ext4_inode_to_goal_block - return a hint for block allocation 693 * ext4_inode_to_goal_block - return a hint for block allocation
625 * @inode: inode for block allocation 694 * @inode: inode for block allocation
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b7d7bd0f066..5b0e26a1272 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -144,9 +144,17 @@ struct ext4_allocation_request {
144#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) 144#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
145#define EXT4_MAP_BOUNDARY (1 << BH_Boundary) 145#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
146#define EXT4_MAP_UNINIT (1 << BH_Uninit) 146#define EXT4_MAP_UNINIT (1 << BH_Uninit)
147/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of
148 * ext4_map_blocks wants to know whether or not the underlying cluster has
149 * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that
150 * the requested mapping was from previously mapped (or delayed allocated)
151 * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster
152 * should never appear on buffer_head's state flags.
153 */
154#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster)
147#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ 155#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
148 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ 156 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
149 EXT4_MAP_UNINIT) 157 EXT4_MAP_UNINIT | EXT4_MAP_FROM_CLUSTER)
150 158
151struct ext4_map_blocks { 159struct ext4_map_blocks {
152 ext4_fsblk_t m_pblk; 160 ext4_fsblk_t m_pblk;
@@ -239,8 +247,11 @@ struct ext4_io_submit {
239# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) 247# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
240#endif 248#endif
241#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) 249#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32))
250#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \
251 EXT4_SB(s)->s_cluster_bits)
242#ifdef __KERNEL__ 252#ifdef __KERNEL__
243# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) 253# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
254# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits)
244#else 255#else
245# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) 256# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10)
246#endif 257#endif
@@ -258,6 +269,14 @@ struct ext4_io_submit {
258#endif 269#endif
259#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) 270#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits)))
260 271
272/* Translate a block number to a cluster number */
273#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits)
274/* Translate a cluster number to a block number */
275#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits)
276/* Translate # of blks to # of clusters */
277#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \
278 (sbi)->s_cluster_bits)
279
261/* 280/*
262 * Structure of a blocks group descriptor 281 * Structure of a blocks group descriptor
263 */ 282 */
@@ -289,7 +308,7 @@ struct ext4_group_desc
289 308
290struct flex_groups { 309struct flex_groups {
291 atomic_t free_inodes; 310 atomic_t free_inodes;
292 atomic_t free_blocks; 311 atomic_t free_clusters;
293 atomic_t used_dirs; 312 atomic_t used_dirs;
294}; 313};
295 314
@@ -306,6 +325,7 @@ struct flex_groups {
306#define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) 325#define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size)
307#ifdef __KERNEL__ 326#ifdef __KERNEL__
308# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) 327# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group)
328# define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group)
309# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) 329# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block)
310# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) 330# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group)
311# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) 331# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits)
@@ -358,8 +378,7 @@ struct flex_groups {
358 378
359/* Flags that should be inherited by new inodes from their parent. */ 379/* Flags that should be inherited by new inodes from their parent. */
360#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ 380#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
361 EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\ 381 EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
362 EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
363 EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ 382 EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
364 EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) 383 EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
365 384
@@ -520,6 +539,8 @@ struct ext4_new_group_data {
520#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020 539#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020
521 /* Don't normalize allocation size (used for fallocate) */ 540 /* Don't normalize allocation size (used for fallocate) */
522#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 541#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
542 /* Request will not result in inode size update (user for fallocate) */
543#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080
523 544
524/* 545/*
525 * Flags used by ext4_free_blocks 546 * Flags used by ext4_free_blocks
@@ -528,6 +549,13 @@ struct ext4_new_group_data {
528#define EXT4_FREE_BLOCKS_FORGET 0x0002 549#define EXT4_FREE_BLOCKS_FORGET 0x0002
529#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 550#define EXT4_FREE_BLOCKS_VALIDATED 0x0004
530#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 551#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008
552#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
553#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
554
555/*
556 * Flags used by ext4_discard_partial_page_buffers
557 */
558#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001
531 559
532/* 560/*
533 * ioctl commands 561 * ioctl commands
@@ -538,9 +566,6 @@ struct ext4_new_group_data {
538#define EXT4_IOC_SETVERSION _IOW('f', 4, long) 566#define EXT4_IOC_SETVERSION _IOW('f', 4, long)
539#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION 567#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION
540#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION 568#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION
541#ifdef CONFIG_JBD2_DEBUG
542#define EXT4_IOC_WAIT_FOR_READONLY _IOR('f', 99, long)
543#endif
544#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) 569#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long)
545#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) 570#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long)
546#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) 571#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
@@ -563,9 +588,6 @@ struct ext4_new_group_data {
563#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) 588#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int)
564#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) 589#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
565#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) 590#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input)
566#ifdef CONFIG_JBD2_DEBUG
567#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
568#endif
569#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION 591#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
570#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION 592#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
571#endif 593#endif
@@ -837,6 +859,7 @@ struct ext4_inode_info {
837 ext4_group_t i_last_alloc_group; 859 ext4_group_t i_last_alloc_group;
838 860
839 /* allocation reservation info for delalloc */ 861 /* allocation reservation info for delalloc */
862 /* In case of bigalloc, these refer to clusters rather than blocks */
840 unsigned int i_reserved_data_blocks; 863 unsigned int i_reserved_data_blocks;
841 unsigned int i_reserved_meta_blocks; 864 unsigned int i_reserved_meta_blocks;
842 unsigned int i_allocated_meta_blocks; 865 unsigned int i_allocated_meta_blocks;
@@ -886,7 +909,6 @@ struct ext4_inode_info {
886/* 909/*
887 * Mount flags 910 * Mount flags
888 */ 911 */
889#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */
890#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ 912#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
891#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ 913#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
892#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ 914#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
@@ -918,6 +940,9 @@ struct ext4_inode_info {
918#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ 940#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
919#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ 941#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */
920 942
943#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly
944 specified delalloc */
945
921#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ 946#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
922 ~EXT4_MOUNT_##opt 947 ~EXT4_MOUNT_##opt
923#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ 948#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \
@@ -968,9 +993,9 @@ struct ext4_super_block {
968/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ 993/*10*/ __le32 s_free_inodes_count; /* Free inodes count */
969 __le32 s_first_data_block; /* First Data Block */ 994 __le32 s_first_data_block; /* First Data Block */
970 __le32 s_log_block_size; /* Block size */ 995 __le32 s_log_block_size; /* Block size */
971 __le32 s_obso_log_frag_size; /* Obsoleted fragment size */ 996 __le32 s_log_cluster_size; /* Allocation cluster size */
972/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ 997/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */
973 __le32 s_obso_frags_per_group; /* Obsoleted fragments per group */ 998 __le32 s_clusters_per_group; /* # Clusters per group */
974 __le32 s_inodes_per_group; /* # Inodes per group */ 999 __le32 s_inodes_per_group; /* # Inodes per group */
975 __le32 s_mtime; /* Mount time */ 1000 __le32 s_mtime; /* Mount time */
976/*30*/ __le32 s_wtime; /* Write time */ 1001/*30*/ __le32 s_wtime; /* Write time */
@@ -1066,7 +1091,10 @@ struct ext4_super_block {
1066 __u8 s_last_error_func[32]; /* function where the error happened */ 1091 __u8 s_last_error_func[32]; /* function where the error happened */
1067#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) 1092#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
1068 __u8 s_mount_opts[64]; 1093 __u8 s_mount_opts[64];
1069 __le32 s_reserved[112]; /* Padding to the end of the block */ 1094 __le32 s_usr_quota_inum; /* inode for tracking user quota */
1095 __le32 s_grp_quota_inum; /* inode for tracking group quota */
1096 __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */
1097 __le32 s_reserved[109]; /* Padding to the end of the block */
1070}; 1098};
1071 1099
1072#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) 1100#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)
@@ -1086,6 +1114,7 @@ struct ext4_sb_info {
1086 unsigned long s_desc_size; /* Size of a group descriptor in bytes */ 1114 unsigned long s_desc_size; /* Size of a group descriptor in bytes */
1087 unsigned long s_inodes_per_block;/* Number of inodes per block */ 1115 unsigned long s_inodes_per_block;/* Number of inodes per block */
1088 unsigned long s_blocks_per_group;/* Number of blocks in a group */ 1116 unsigned long s_blocks_per_group;/* Number of blocks in a group */
1117 unsigned long s_clusters_per_group; /* Number of clusters in a group */
1089 unsigned long s_inodes_per_group;/* Number of inodes in a group */ 1118 unsigned long s_inodes_per_group;/* Number of inodes in a group */
1090 unsigned long s_itb_per_group; /* Number of inode table blocks per group */ 1119 unsigned long s_itb_per_group; /* Number of inode table blocks per group */
1091 unsigned long s_gdb_count; /* Number of group descriptor blocks */ 1120 unsigned long s_gdb_count; /* Number of group descriptor blocks */
@@ -1094,6 +1123,8 @@ struct ext4_sb_info {
1094 ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ 1123 ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
1095 unsigned long s_overhead_last; /* Last calculated overhead */ 1124 unsigned long s_overhead_last; /* Last calculated overhead */
1096 unsigned long s_blocks_last; /* Last seen block count */ 1125 unsigned long s_blocks_last; /* Last seen block count */
1126 unsigned int s_cluster_ratio; /* Number of blocks per cluster */
1127 unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */
1097 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ 1128 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
1098 struct buffer_head * s_sbh; /* Buffer containing the super block */ 1129 struct buffer_head * s_sbh; /* Buffer containing the super block */
1099 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ 1130 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
@@ -1117,10 +1148,10 @@ struct ext4_sb_info {
1117 u32 s_hash_seed[4]; 1148 u32 s_hash_seed[4];
1118 int s_def_hash_version; 1149 int s_def_hash_version;
1119 int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ 1150 int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
1120 struct percpu_counter s_freeblocks_counter; 1151 struct percpu_counter s_freeclusters_counter;
1121 struct percpu_counter s_freeinodes_counter; 1152 struct percpu_counter s_freeinodes_counter;
1122 struct percpu_counter s_dirs_counter; 1153 struct percpu_counter s_dirs_counter;
1123 struct percpu_counter s_dirtyblocks_counter; 1154 struct percpu_counter s_dirtyclusters_counter;
1124 struct blockgroup_lock *s_blockgroup_lock; 1155 struct blockgroup_lock *s_blockgroup_lock;
1125 struct proc_dir_entry *s_proc; 1156 struct proc_dir_entry *s_proc;
1126 struct kobject s_kobj; 1157 struct kobject s_kobj;
@@ -1136,10 +1167,6 @@ struct ext4_sb_info {
1136 u32 s_max_batch_time; 1167 u32 s_max_batch_time;
1137 u32 s_min_batch_time; 1168 u32 s_min_batch_time;
1138 struct block_device *journal_bdev; 1169 struct block_device *journal_bdev;
1139#ifdef CONFIG_JBD2_DEBUG
1140 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
1141 wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
1142#endif
1143#ifdef CONFIG_QUOTA 1170#ifdef CONFIG_QUOTA
1144 char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ 1171 char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
1145 int s_jquota_fmt; /* Format of quota to use */ 1172 int s_jquota_fmt; /* Format of quota to use */
@@ -1248,6 +1275,15 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
1248 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); 1275 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
1249} 1276}
1250 1277
1278static inline void ext4_set_io_unwritten_flag(struct inode *inode,
1279 struct ext4_io_end *io_end)
1280{
1281 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
1282 io_end->flag |= EXT4_IO_END_UNWRITTEN;
1283 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
1284 }
1285}
1286
1251/* 1287/*
1252 * Inode dynamic state flags 1288 * Inode dynamic state flags
1253 */ 1289 */
@@ -1360,6 +1396,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1360#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 1396#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020
1361#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 1397#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
1362#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 1398#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100
1399#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200
1363 1400
1364#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 1401#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
1365#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 1402#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1402,7 +1439,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1402 EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ 1439 EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
1403 EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ 1440 EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
1404 EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ 1441 EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
1405 EXT4_FEATURE_RO_COMPAT_HUGE_FILE) 1442 EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
1443 EXT4_FEATURE_RO_COMPAT_BIGALLOC)
1406 1444
1407/* 1445/*
1408 * Default values for user and/or group using reserved blocks 1446 * Default values for user and/or group using reserved blocks
@@ -1735,9 +1773,9 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
1735 unsigned int flags, 1773 unsigned int flags,
1736 unsigned long *count, 1774 unsigned long *count,
1737 int *errp); 1775 int *errp);
1738extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, 1776extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
1739 s64 nblocks, unsigned int flags); 1777 s64 nclusters, unsigned int flags);
1740extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); 1778extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *);
1741extern void ext4_check_blocks_bitmap(struct super_block *); 1779extern void ext4_check_blocks_bitmap(struct super_block *);
1742extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, 1780extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
1743 ext4_group_t block_group, 1781 ext4_group_t block_group,
@@ -1745,12 +1783,18 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
1745extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); 1783extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
1746struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, 1784struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
1747 ext4_group_t block_group); 1785 ext4_group_t block_group);
1748extern unsigned ext4_init_block_bitmap(struct super_block *sb, 1786extern void ext4_init_block_bitmap(struct super_block *sb,
1749 struct buffer_head *bh, 1787 struct buffer_head *bh,
1750 ext4_group_t group, 1788 ext4_group_t group,
1751 struct ext4_group_desc *desc); 1789 struct ext4_group_desc *desc);
1752#define ext4_free_blocks_after_init(sb, group, desc) \ 1790extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
1753 ext4_init_block_bitmap(sb, NULL, group, desc) 1791 ext4_group_t block_group,
1792 struct ext4_group_desc *gdp);
1793extern unsigned ext4_num_base_meta_clusters(struct super_block *sb,
1794 ext4_group_t block_group);
1795extern unsigned ext4_num_overhead_clusters(struct super_block *sb,
1796 ext4_group_t block_group,
1797 struct ext4_group_desc *gdp);
1754ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); 1798ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
1755 1799
1756/* dir.c */ 1800/* dir.c */
@@ -1776,7 +1820,8 @@ extern int ext4fs_dirhash(const char *name, int len, struct
1776 1820
1777/* ialloc.c */ 1821/* ialloc.c */
1778extern struct inode *ext4_new_inode(handle_t *, struct inode *, int, 1822extern struct inode *ext4_new_inode(handle_t *, struct inode *, int,
1779 const struct qstr *qstr, __u32 goal); 1823 const struct qstr *qstr, __u32 goal,
1824 uid_t *owner);
1780extern void ext4_free_inode(handle_t *, struct inode *); 1825extern void ext4_free_inode(handle_t *, struct inode *);
1781extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); 1826extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
1782extern unsigned long ext4_count_free_inodes(struct super_block *); 1827extern unsigned long ext4_count_free_inodes(struct super_block *);
@@ -1839,6 +1884,12 @@ extern int ext4_block_truncate_page(handle_t *handle,
1839 struct address_space *mapping, loff_t from); 1884 struct address_space *mapping, loff_t from);
1840extern int ext4_block_zero_page_range(handle_t *handle, 1885extern int ext4_block_zero_page_range(handle_t *handle,
1841 struct address_space *mapping, loff_t from, loff_t length); 1886 struct address_space *mapping, loff_t from, loff_t length);
1887extern int ext4_discard_partial_page_buffers(handle_t *handle,
1888 struct address_space *mapping, loff_t from,
1889 loff_t length, int flags);
1890extern int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
1891 struct inode *inode, struct page *page, loff_t from,
1892 loff_t length, int flags);
1842extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 1893extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1843extern qsize_t *ext4_get_reserved_space(struct inode *inode); 1894extern qsize_t *ext4_get_reserved_space(struct inode *inode);
1844extern void ext4_da_update_reserve_space(struct inode *inode, 1895extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -1878,40 +1929,40 @@ extern int ext4_group_extend(struct super_block *sb,
1878extern void *ext4_kvmalloc(size_t size, gfp_t flags); 1929extern void *ext4_kvmalloc(size_t size, gfp_t flags);
1879extern void *ext4_kvzalloc(size_t size, gfp_t flags); 1930extern void *ext4_kvzalloc(size_t size, gfp_t flags);
1880extern void ext4_kvfree(void *ptr); 1931extern void ext4_kvfree(void *ptr);
1881extern void __ext4_error(struct super_block *, const char *, unsigned int, 1932extern __printf(4, 5)
1882 const char *, ...) 1933void __ext4_error(struct super_block *, const char *, unsigned int,
1883 __attribute__ ((format (printf, 4, 5))); 1934 const char *, ...);
1884#define ext4_error(sb, message...) __ext4_error(sb, __func__, \ 1935#define ext4_error(sb, message...) __ext4_error(sb, __func__, \
1885 __LINE__, ## message) 1936 __LINE__, ## message)
1886extern void ext4_error_inode(struct inode *, const char *, unsigned int, 1937extern __printf(5, 6)
1887 ext4_fsblk_t, const char *, ...) 1938void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
1888 __attribute__ ((format (printf, 5, 6))); 1939 const char *, ...);
1889extern void ext4_error_file(struct file *, const char *, unsigned int, 1940extern __printf(5, 6)
1890 ext4_fsblk_t, const char *, ...) 1941void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
1891 __attribute__ ((format (printf, 5, 6))); 1942 const char *, ...);
1892extern void __ext4_std_error(struct super_block *, const char *, 1943extern void __ext4_std_error(struct super_block *, const char *,
1893 unsigned int, int); 1944 unsigned int, int);
1894extern void __ext4_abort(struct super_block *, const char *, unsigned int, 1945extern __printf(4, 5)
1895 const char *, ...) 1946void __ext4_abort(struct super_block *, const char *, unsigned int,
1896 __attribute__ ((format (printf, 4, 5))); 1947 const char *, ...);
1897#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \ 1948#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \
1898 __LINE__, ## message) 1949 __LINE__, ## message)
1899extern void __ext4_warning(struct super_block *, const char *, unsigned int, 1950extern __printf(4, 5)
1900 const char *, ...) 1951void __ext4_warning(struct super_block *, const char *, unsigned int,
1901 __attribute__ ((format (printf, 4, 5))); 1952 const char *, ...);
1902#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \ 1953#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \
1903 __LINE__, ## message) 1954 __LINE__, ## message)
1904extern void ext4_msg(struct super_block *, const char *, const char *, ...) 1955extern __printf(3, 4)
1905 __attribute__ ((format (printf, 3, 4))); 1956void ext4_msg(struct super_block *, const char *, const char *, ...);
1906extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, 1957extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
1907 const char *, unsigned int, const char *); 1958 const char *, unsigned int, const char *);
1908#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ 1959#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \
1909 __LINE__, msg) 1960 __LINE__, msg)
1910extern void __ext4_grp_locked_error(const char *, unsigned int, \ 1961extern __printf(7, 8)
1911 struct super_block *, ext4_group_t, \ 1962void __ext4_grp_locked_error(const char *, unsigned int,
1912 unsigned long, ext4_fsblk_t, \ 1963 struct super_block *, ext4_group_t,
1913 const char *, ...) 1964 unsigned long, ext4_fsblk_t,
1914 __attribute__ ((format (printf, 7, 8))); 1965 const char *, ...);
1915#define ext4_grp_locked_error(sb, grp, message...) \ 1966#define ext4_grp_locked_error(sb, grp, message...) \
1916 __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message) 1967 __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
1917extern void ext4_update_dynamic_rev(struct super_block *sb); 1968extern void ext4_update_dynamic_rev(struct super_block *sb);
@@ -1927,8 +1978,8 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
1927 struct ext4_group_desc *bg); 1978 struct ext4_group_desc *bg);
1928extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, 1979extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
1929 struct ext4_group_desc *bg); 1980 struct ext4_group_desc *bg);
1930extern __u32 ext4_free_blks_count(struct super_block *sb, 1981extern __u32 ext4_free_group_clusters(struct super_block *sb,
1931 struct ext4_group_desc *bg); 1982 struct ext4_group_desc *bg);
1932extern __u32 ext4_free_inodes_count(struct super_block *sb, 1983extern __u32 ext4_free_inodes_count(struct super_block *sb,
1933 struct ext4_group_desc *bg); 1984 struct ext4_group_desc *bg);
1934extern __u32 ext4_used_dirs_count(struct super_block *sb, 1985extern __u32 ext4_used_dirs_count(struct super_block *sb,
@@ -1941,8 +1992,9 @@ extern void ext4_inode_bitmap_set(struct super_block *sb,
1941 struct ext4_group_desc *bg, ext4_fsblk_t blk); 1992 struct ext4_group_desc *bg, ext4_fsblk_t blk);
1942extern void ext4_inode_table_set(struct super_block *sb, 1993extern void ext4_inode_table_set(struct super_block *sb,
1943 struct ext4_group_desc *bg, ext4_fsblk_t blk); 1994 struct ext4_group_desc *bg, ext4_fsblk_t blk);
1944extern void ext4_free_blks_set(struct super_block *sb, 1995extern void ext4_free_group_clusters_set(struct super_block *sb,
1945 struct ext4_group_desc *bg, __u32 count); 1996 struct ext4_group_desc *bg,
1997 __u32 count);
1946extern void ext4_free_inodes_set(struct super_block *sb, 1998extern void ext4_free_inodes_set(struct super_block *sb,
1947 struct ext4_group_desc *bg, __u32 count); 1999 struct ext4_group_desc *bg, __u32 count);
1948extern void ext4_used_dirs_set(struct super_block *sb, 2000extern void ext4_used_dirs_set(struct super_block *sb,
@@ -2051,13 +2103,13 @@ do { \
2051} while (0) 2103} while (0)
2052 2104
2053#ifdef CONFIG_SMP 2105#ifdef CONFIG_SMP
2054/* Each CPU can accumulate percpu_counter_batch blocks in their local 2106/* Each CPU can accumulate percpu_counter_batch clusters in their local
2055 * counters. So we need to make sure we have free blocks more 2107 * counters. So we need to make sure we have free clusters more
2056 * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. 2108 * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times.
2057 */ 2109 */
2058#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) 2110#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
2059#else 2111#else
2060#define EXT4_FREEBLOCKS_WATERMARK 0 2112#define EXT4_FREECLUSTERS_WATERMARK 0
2061#endif 2113#endif
2062 2114
2063static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) 2115static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
@@ -2243,10 +2295,19 @@ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
2243enum ext4_state_bits { 2295enum ext4_state_bits {
2244 BH_Uninit /* blocks are allocated but uninitialized on disk */ 2296 BH_Uninit /* blocks are allocated but uninitialized on disk */
2245 = BH_JBDPrivateStart, 2297 = BH_JBDPrivateStart,
2298 BH_AllocFromCluster, /* allocated blocks were part of already
2299 * allocated cluster. Note that this flag will
2300 * never, ever appear in a buffer_head's state
2301 * flag. See EXT4_MAP_FROM_CLUSTER to see where
2302 * this is used. */
2303 BH_Da_Mapped, /* Delayed allocated block that now has a mapping. This
2304 * flag is set when ext4_map_blocks is called on a
2305 * delayed allocated block to get its real mapping. */
2246}; 2306};
2247 2307
2248BUFFER_FNS(Uninit, uninit) 2308BUFFER_FNS(Uninit, uninit)
2249TAS_BUFFER_FNS(Uninit, uninit) 2309TAS_BUFFER_FNS(Uninit, uninit)
2310BUFFER_FNS(Da_Mapped, da_mapped)
2250 2311
2251/* 2312/*
2252 * Add new method to test wether block and inode bitmaps are properly 2313 * Add new method to test wether block and inode bitmaps are properly
@@ -2282,4 +2343,6 @@ extern void ext4_resize_end(struct super_block *sb);
2282 2343
2283#endif /* __KERNEL__ */ 2344#endif /* __KERNEL__ */
2284 2345
2346#include "ext4_extents.h"
2347
2285#endif /* _EXT4_H */ 2348#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 095c36f3b61..a52db3a69a3 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -290,5 +290,7 @@ extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
290 struct ext4_ext_path *); 290 struct ext4_ext_path *);
291extern void ext4_ext_drop_refs(struct ext4_ext_path *); 291extern void ext4_ext_drop_refs(struct ext4_ext_path *);
292extern int ext4_ext_check_inode(struct inode *inode); 292extern int ext4_ext_check_inode(struct inode *inode);
293extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
294 int search_hint_reverse);
293#endif /* _EXT4_EXTENTS */ 295#endif /* _EXT4_EXTENTS */
294 296
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index f5240aa1560..aca17901758 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -109,9 +109,11 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
109 109
110 if (ext4_handle_valid(handle)) { 110 if (ext4_handle_valid(handle)) {
111 err = jbd2_journal_dirty_metadata(handle, bh); 111 err = jbd2_journal_dirty_metadata(handle, bh);
112 if (err) 112 if (err) {
113 ext4_journal_abort_handle(where, line, __func__, 113 /* Errors can only happen if there is a bug */
114 bh, handle, err); 114 handle->h_err = err;
115 __ext4_journal_stop(where, line, handle);
116 }
115 } else { 117 } else {
116 if (inode) 118 if (inode)
117 mark_buffer_dirty_inode(bh, inode); 119 mark_buffer_dirty_inode(bh, inode);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 57cf568a98a..61fa9e1614a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -42,7 +42,6 @@
42#include <asm/uaccess.h> 42#include <asm/uaccess.h>
43#include <linux/fiemap.h> 43#include <linux/fiemap.h>
44#include "ext4_jbd2.h" 44#include "ext4_jbd2.h"
45#include "ext4_extents.h"
46 45
47#include <trace/events/ext4.h> 46#include <trace/events/ext4.h>
48 47
@@ -96,13 +95,17 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
96 * - ENOMEM 95 * - ENOMEM
97 * - EIO 96 * - EIO
98 */ 97 */
99static int ext4_ext_dirty(handle_t *handle, struct inode *inode, 98#define ext4_ext_dirty(handle, inode, path) \
100 struct ext4_ext_path *path) 99 __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
100static int __ext4_ext_dirty(const char *where, unsigned int line,
101 handle_t *handle, struct inode *inode,
102 struct ext4_ext_path *path)
101{ 103{
102 int err; 104 int err;
103 if (path->p_bh) { 105 if (path->p_bh) {
104 /* path points to block */ 106 /* path points to block */
105 err = ext4_handle_dirty_metadata(handle, inode, path->p_bh); 107 err = __ext4_handle_dirty_metadata(where, line, handle,
108 inode, path->p_bh);
106 } else { 109 } else {
107 /* path points to leaf/index in inode body */ 110 /* path points to leaf/index in inode body */
108 err = ext4_mark_inode_dirty(handle, inode); 111 err = ext4_mark_inode_dirty(handle, inode);
@@ -114,11 +117,9 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
114 struct ext4_ext_path *path, 117 struct ext4_ext_path *path,
115 ext4_lblk_t block) 118 ext4_lblk_t block)
116{ 119{
117 int depth;
118
119 if (path) { 120 if (path) {
121 int depth = path->p_depth;
120 struct ext4_extent *ex; 122 struct ext4_extent *ex;
121 depth = path->p_depth;
122 123
123 /* 124 /*
124 * Try to predict block placement assuming that we are 125 * Try to predict block placement assuming that we are
@@ -180,12 +181,10 @@ static inline int ext4_ext_space_block(struct inode *inode, int check)
180 181
181 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) 182 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
182 / sizeof(struct ext4_extent); 183 / sizeof(struct ext4_extent);
183 if (!check) {
184#ifdef AGGRESSIVE_TEST 184#ifdef AGGRESSIVE_TEST
185 if (size > 6) 185 if (!check && size > 6)
186 size = 6; 186 size = 6;
187#endif 187#endif
188 }
189 return size; 188 return size;
190} 189}
191 190
@@ -195,12 +194,10 @@ static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
195 194
196 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) 195 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
197 / sizeof(struct ext4_extent_idx); 196 / sizeof(struct ext4_extent_idx);
198 if (!check) {
199#ifdef AGGRESSIVE_TEST 197#ifdef AGGRESSIVE_TEST
200 if (size > 5) 198 if (!check && size > 5)
201 size = 5; 199 size = 5;
202#endif 200#endif
203 }
204 return size; 201 return size;
205} 202}
206 203
@@ -211,12 +208,10 @@ static inline int ext4_ext_space_root(struct inode *inode, int check)
211 size = sizeof(EXT4_I(inode)->i_data); 208 size = sizeof(EXT4_I(inode)->i_data);
212 size -= sizeof(struct ext4_extent_header); 209 size -= sizeof(struct ext4_extent_header);
213 size /= sizeof(struct ext4_extent); 210 size /= sizeof(struct ext4_extent);
214 if (!check) {
215#ifdef AGGRESSIVE_TEST 211#ifdef AGGRESSIVE_TEST
216 if (size > 3) 212 if (!check && size > 3)
217 size = 3; 213 size = 3;
218#endif 214#endif
219 }
220 return size; 215 return size;
221} 216}
222 217
@@ -227,12 +222,10 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
227 size = sizeof(EXT4_I(inode)->i_data); 222 size = sizeof(EXT4_I(inode)->i_data);
228 size -= sizeof(struct ext4_extent_header); 223 size -= sizeof(struct ext4_extent_header);
229 size /= sizeof(struct ext4_extent_idx); 224 size /= sizeof(struct ext4_extent_idx);
230 if (!check) {
231#ifdef AGGRESSIVE_TEST 225#ifdef AGGRESSIVE_TEST
232 if (size > 4) 226 if (!check && size > 4)
233 size = 4; 227 size = 4;
234#endif 228#endif
235 }
236 return size; 229 return size;
237} 230}
238 231
@@ -244,7 +237,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
244int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) 237int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
245{ 238{
246 struct ext4_inode_info *ei = EXT4_I(inode); 239 struct ext4_inode_info *ei = EXT4_I(inode);
247 int idxs, num = 0; 240 int idxs;
248 241
249 idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) 242 idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
250 / sizeof(struct ext4_extent_idx)); 243 / sizeof(struct ext4_extent_idx));
@@ -259,6 +252,8 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
259 */ 252 */
260 if (ei->i_da_metadata_calc_len && 253 if (ei->i_da_metadata_calc_len &&
261 ei->i_da_metadata_calc_last_lblock+1 == lblock) { 254 ei->i_da_metadata_calc_last_lblock+1 == lblock) {
255 int num = 0;
256
262 if ((ei->i_da_metadata_calc_len % idxs) == 0) 257 if ((ei->i_da_metadata_calc_len % idxs) == 0)
263 num++; 258 num++;
264 if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) 259 if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
@@ -321,8 +316,6 @@ static int ext4_valid_extent_entries(struct inode *inode,
321 struct ext4_extent_header *eh, 316 struct ext4_extent_header *eh,
322 int depth) 317 int depth)
323{ 318{
324 struct ext4_extent *ext;
325 struct ext4_extent_idx *ext_idx;
326 unsigned short entries; 319 unsigned short entries;
327 if (eh->eh_entries == 0) 320 if (eh->eh_entries == 0)
328 return 1; 321 return 1;
@@ -331,7 +324,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
331 324
332 if (depth == 0) { 325 if (depth == 0) {
333 /* leaf entries */ 326 /* leaf entries */
334 ext = EXT_FIRST_EXTENT(eh); 327 struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
335 while (entries) { 328 while (entries) {
336 if (!ext4_valid_extent(inode, ext)) 329 if (!ext4_valid_extent(inode, ext))
337 return 0; 330 return 0;
@@ -339,7 +332,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
339 entries--; 332 entries--;
340 } 333 }
341 } else { 334 } else {
342 ext_idx = EXT_FIRST_INDEX(eh); 335 struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
343 while (entries) { 336 while (entries) {
344 if (!ext4_valid_extent_idx(inode, ext_idx)) 337 if (!ext4_valid_extent_idx(inode, ext_idx))
345 return 0; 338 return 0;
@@ -751,31 +744,30 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
751 return -EIO; 744 return -EIO;
752 } 745 }
753 746
754 len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
755 if (logical > le32_to_cpu(curp->p_idx->ei_block)) { 747 if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
756 /* insert after */ 748 /* insert after */
757 if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { 749 ext_debug("insert new index %d after: %llu\n", logical, ptr);
758 len = (len - 1) * sizeof(struct ext4_extent_idx);
759 len = len < 0 ? 0 : len;
760 ext_debug("insert new index %d after: %llu. "
761 "move %d from 0x%p to 0x%p\n",
762 logical, ptr, len,
763 (curp->p_idx + 1), (curp->p_idx + 2));
764 memmove(curp->p_idx + 2, curp->p_idx + 1, len);
765 }
766 ix = curp->p_idx + 1; 750 ix = curp->p_idx + 1;
767 } else { 751 } else {
768 /* insert before */ 752 /* insert before */
769 len = len * sizeof(struct ext4_extent_idx); 753 ext_debug("insert new index %d before: %llu\n", logical, ptr);
770 len = len < 0 ? 0 : len;
771 ext_debug("insert new index %d before: %llu. "
772 "move %d from 0x%p to 0x%p\n",
773 logical, ptr, len,
774 curp->p_idx, (curp->p_idx + 1));
775 memmove(curp->p_idx + 1, curp->p_idx, len);
776 ix = curp->p_idx; 754 ix = curp->p_idx;
777 } 755 }
778 756
757 len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
758 BUG_ON(len < 0);
759 if (len > 0) {
760 ext_debug("insert new index %d: "
761 "move %d indices from 0x%p to 0x%p\n",
762 logical, len, ix, ix + 1);
763 memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
764 }
765
766 if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
767 EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
768 return -EIO;
769 }
770
779 ix->ei_block = cpu_to_le32(logical); 771 ix->ei_block = cpu_to_le32(logical);
780 ext4_idx_store_pblock(ix, ptr); 772 ext4_idx_store_pblock(ix, ptr);
781 le16_add_cpu(&curp->p_hdr->eh_entries, 1); 773 le16_add_cpu(&curp->p_hdr->eh_entries, 1);
@@ -1042,16 +1034,14 @@ cleanup:
1042 */ 1034 */
1043static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, 1035static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1044 unsigned int flags, 1036 unsigned int flags,
1045 struct ext4_ext_path *path,
1046 struct ext4_extent *newext) 1037 struct ext4_extent *newext)
1047{ 1038{
1048 struct ext4_ext_path *curp = path;
1049 struct ext4_extent_header *neh; 1039 struct ext4_extent_header *neh;
1050 struct buffer_head *bh; 1040 struct buffer_head *bh;
1051 ext4_fsblk_t newblock; 1041 ext4_fsblk_t newblock;
1052 int err = 0; 1042 int err = 0;
1053 1043
1054 newblock = ext4_ext_new_meta_block(handle, inode, path, 1044 newblock = ext4_ext_new_meta_block(handle, inode, NULL,
1055 newext, &err, flags); 1045 newext, &err, flags);
1056 if (newblock == 0) 1046 if (newblock == 0)
1057 return err; 1047 return err;
@@ -1071,7 +1061,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1071 } 1061 }
1072 1062
1073 /* move top-level index/leaf into new block */ 1063 /* move top-level index/leaf into new block */
1074 memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data)); 1064 memmove(bh->b_data, EXT4_I(inode)->i_data,
1065 sizeof(EXT4_I(inode)->i_data));
1075 1066
1076 /* set size of new block */ 1067 /* set size of new block */
1077 neh = ext_block_hdr(bh); 1068 neh = ext_block_hdr(bh);
@@ -1089,32 +1080,23 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1089 if (err) 1080 if (err)
1090 goto out; 1081 goto out;
1091 1082
1092 /* create index in new top-level index: num,max,pointer */ 1083 /* Update top-level index: num,max,pointer */
1093 err = ext4_ext_get_access(handle, inode, curp);
1094 if (err)
1095 goto out;
1096
1097 curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
1098 curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1099 curp->p_hdr->eh_entries = cpu_to_le16(1);
1100 curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
1101
1102 if (path[0].p_hdr->eh_depth)
1103 curp->p_idx->ei_block =
1104 EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
1105 else
1106 curp->p_idx->ei_block =
1107 EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
1108 ext4_idx_store_pblock(curp->p_idx, newblock);
1109
1110 neh = ext_inode_hdr(inode); 1084 neh = ext_inode_hdr(inode);
1085 neh->eh_entries = cpu_to_le16(1);
1086 ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1087 if (neh->eh_depth == 0) {
1088 /* Root extent block becomes index block */
1089 neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1090 EXT_FIRST_INDEX(neh)->ei_block =
1091 EXT_FIRST_EXTENT(neh)->ee_block;
1092 }
1111 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", 1093 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1112 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), 1094 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1113 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), 1095 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1114 ext4_idx_pblock(EXT_FIRST_INDEX(neh))); 1096 ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1115 1097
1116 neh->eh_depth = cpu_to_le16(path->p_depth + 1); 1098 neh->eh_depth = cpu_to_le16(neh->eh_depth + 1);
1117 err = ext4_ext_dirty(handle, inode, curp); 1099 ext4_mark_inode_dirty(handle, inode);
1118out: 1100out:
1119 brelse(bh); 1101 brelse(bh);
1120 1102
@@ -1162,8 +1144,7 @@ repeat:
1162 err = PTR_ERR(path); 1144 err = PTR_ERR(path);
1163 } else { 1145 } else {
1164 /* tree is full, time to grow in depth */ 1146 /* tree is full, time to grow in depth */
1165 err = ext4_ext_grow_indepth(handle, inode, flags, 1147 err = ext4_ext_grow_indepth(handle, inode, flags, newext);
1166 path, newext);
1167 if (err) 1148 if (err)
1168 goto out; 1149 goto out;
1169 1150
@@ -1235,9 +1216,9 @@ static int ext4_ext_search_left(struct inode *inode,
1235 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { 1216 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1236 EXT4_ERROR_INODE(inode, 1217 EXT4_ERROR_INODE(inode,
1237 "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", 1218 "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1238 ix != NULL ? ix->ei_block : 0, 1219 ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1239 EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? 1220 EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1240 EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0, 1221 le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1241 depth); 1222 depth);
1242 return -EIO; 1223 return -EIO;
1243 } 1224 }
@@ -1260,13 +1241,14 @@ static int ext4_ext_search_left(struct inode *inode,
1260/* 1241/*
1261 * search the closest allocated block to the right for *logical 1242 * search the closest allocated block to the right for *logical
1262 * and returns it at @logical + it's physical address at @phys 1243 * and returns it at @logical + it's physical address at @phys
1263 * if *logical is the smallest allocated block, the function 1244 * if *logical is the largest allocated block, the function
1264 * returns 0 at @phys 1245 * returns 0 at @phys
1265 * return value contains 0 (success) or error code 1246 * return value contains 0 (success) or error code
1266 */ 1247 */
1267static int ext4_ext_search_right(struct inode *inode, 1248static int ext4_ext_search_right(struct inode *inode,
1268 struct ext4_ext_path *path, 1249 struct ext4_ext_path *path,
1269 ext4_lblk_t *logical, ext4_fsblk_t *phys) 1250 ext4_lblk_t *logical, ext4_fsblk_t *phys,
1251 struct ext4_extent **ret_ex)
1270{ 1252{
1271 struct buffer_head *bh = NULL; 1253 struct buffer_head *bh = NULL;
1272 struct ext4_extent_header *eh; 1254 struct ext4_extent_header *eh;
@@ -1308,9 +1290,7 @@ static int ext4_ext_search_right(struct inode *inode,
1308 return -EIO; 1290 return -EIO;
1309 } 1291 }
1310 } 1292 }
1311 *logical = le32_to_cpu(ex->ee_block); 1293 goto found_extent;
1312 *phys = ext4_ext_pblock(ex);
1313 return 0;
1314 } 1294 }
1315 1295
1316 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { 1296 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
@@ -1323,9 +1303,7 @@ static int ext4_ext_search_right(struct inode *inode,
1323 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { 1303 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1324 /* next allocated block in this leaf */ 1304 /* next allocated block in this leaf */
1325 ex++; 1305 ex++;
1326 *logical = le32_to_cpu(ex->ee_block); 1306 goto found_extent;
1327 *phys = ext4_ext_pblock(ex);
1328 return 0;
1329 } 1307 }
1330 1308
1331 /* go up and search for index to the right */ 1309 /* go up and search for index to the right */
@@ -1368,9 +1346,12 @@ got_index:
1368 return -EIO; 1346 return -EIO;
1369 } 1347 }
1370 ex = EXT_FIRST_EXTENT(eh); 1348 ex = EXT_FIRST_EXTENT(eh);
1349found_extent:
1371 *logical = le32_to_cpu(ex->ee_block); 1350 *logical = le32_to_cpu(ex->ee_block);
1372 *phys = ext4_ext_pblock(ex); 1351 *phys = ext4_ext_pblock(ex);
1373 put_bh(bh); 1352 *ret_ex = ex;
1353 if (bh)
1354 put_bh(bh);
1374 return 0; 1355 return 0;
1375} 1356}
1376 1357
@@ -1395,7 +1376,8 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1395 while (depth >= 0) { 1376 while (depth >= 0) {
1396 if (depth == path->p_depth) { 1377 if (depth == path->p_depth) {
1397 /* leaf */ 1378 /* leaf */
1398 if (path[depth].p_ext != 1379 if (path[depth].p_ext &&
1380 path[depth].p_ext !=
1399 EXT_LAST_EXTENT(path[depth].p_hdr)) 1381 EXT_LAST_EXTENT(path[depth].p_hdr))
1400 return le32_to_cpu(path[depth].p_ext[1].ee_block); 1382 return le32_to_cpu(path[depth].p_ext[1].ee_block);
1401 } else { 1383 } else {
@@ -1623,7 +1605,8 @@ static int ext4_ext_try_to_merge(struct inode *inode,
1623 * such that there will be no overlap, and then returns 1. 1605 * such that there will be no overlap, and then returns 1.
1624 * If there is no overlap found, it returns 0. 1606 * If there is no overlap found, it returns 0.
1625 */ 1607 */
1626static unsigned int ext4_ext_check_overlap(struct inode *inode, 1608static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1609 struct inode *inode,
1627 struct ext4_extent *newext, 1610 struct ext4_extent *newext,
1628 struct ext4_ext_path *path) 1611 struct ext4_ext_path *path)
1629{ 1612{
@@ -1637,6 +1620,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode,
1637 if (!path[depth].p_ext) 1620 if (!path[depth].p_ext)
1638 goto out; 1621 goto out;
1639 b2 = le32_to_cpu(path[depth].p_ext->ee_block); 1622 b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1623 b2 &= ~(sbi->s_cluster_ratio - 1);
1640 1624
1641 /* 1625 /*
1642 * get the next allocated block if the extent in the path 1626 * get the next allocated block if the extent in the path
@@ -1646,6 +1630,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode,
1646 b2 = ext4_ext_next_allocated_block(path); 1630 b2 = ext4_ext_next_allocated_block(path);
1647 if (b2 == EXT_MAX_BLOCKS) 1631 if (b2 == EXT_MAX_BLOCKS)
1648 goto out; 1632 goto out;
1633 b2 &= ~(sbi->s_cluster_ratio - 1);
1649 } 1634 }
1650 1635
1651 /* check for wrap through zero on extent logical start block*/ 1636 /* check for wrap through zero on extent logical start block*/
@@ -1697,7 +1682,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1697 /* try to insert block into found extent and return */ 1682 /* try to insert block into found extent and return */
1698 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) 1683 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1699 && ext4_can_extents_be_merged(inode, ex, newext)) { 1684 && ext4_can_extents_be_merged(inode, ex, newext)) {
1700 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", 1685 ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n",
1701 ext4_ext_is_uninitialized(newext), 1686 ext4_ext_is_uninitialized(newext),
1702 ext4_ext_get_actual_len(newext), 1687 ext4_ext_get_actual_len(newext),
1703 le32_to_cpu(ex->ee_block), 1688 le32_to_cpu(ex->ee_block),
@@ -1735,7 +1720,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1735 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) 1720 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
1736 next = ext4_ext_next_leaf_block(path); 1721 next = ext4_ext_next_leaf_block(path);
1737 if (next != EXT_MAX_BLOCKS) { 1722 if (next != EXT_MAX_BLOCKS) {
1738 ext_debug("next leaf block - %d\n", next); 1723 ext_debug("next leaf block - %u\n", next);
1739 BUG_ON(npath != NULL); 1724 BUG_ON(npath != NULL);
1740 npath = ext4_ext_find_extent(inode, next, NULL); 1725 npath = ext4_ext_find_extent(inode, next, NULL);
1741 if (IS_ERR(npath)) 1726 if (IS_ERR(npath))
@@ -1773,46 +1758,51 @@ has_space:
1773 1758
1774 if (!nearex) { 1759 if (!nearex) {
1775 /* there is no extent in this leaf, create first one */ 1760 /* there is no extent in this leaf, create first one */
1776 ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n", 1761 ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
1777 le32_to_cpu(newext->ee_block), 1762 le32_to_cpu(newext->ee_block),
1778 ext4_ext_pblock(newext), 1763 ext4_ext_pblock(newext),
1779 ext4_ext_is_uninitialized(newext), 1764 ext4_ext_is_uninitialized(newext),
1780 ext4_ext_get_actual_len(newext)); 1765 ext4_ext_get_actual_len(newext));
1781 path[depth].p_ext = EXT_FIRST_EXTENT(eh); 1766 nearex = EXT_FIRST_EXTENT(eh);
1782 } else if (le32_to_cpu(newext->ee_block) 1767 } else {
1768 if (le32_to_cpu(newext->ee_block)
1783 > le32_to_cpu(nearex->ee_block)) { 1769 > le32_to_cpu(nearex->ee_block)) {
1784/* BUG_ON(newext->ee_block == nearex->ee_block); */ 1770 /* Insert after */
1785 if (nearex != EXT_LAST_EXTENT(eh)) { 1771 ext_debug("insert %u:%llu:[%d]%d before: "
1786 len = EXT_MAX_EXTENT(eh) - nearex; 1772 "nearest %p\n",
1787 len = (len - 1) * sizeof(struct ext4_extent);
1788 len = len < 0 ? 0 : len;
1789 ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
1790 "move %d from 0x%p to 0x%p\n",
1791 le32_to_cpu(newext->ee_block), 1773 le32_to_cpu(newext->ee_block),
1792 ext4_ext_pblock(newext), 1774 ext4_ext_pblock(newext),
1793 ext4_ext_is_uninitialized(newext), 1775 ext4_ext_is_uninitialized(newext),
1794 ext4_ext_get_actual_len(newext), 1776 ext4_ext_get_actual_len(newext),
1795 nearex, len, nearex + 1, nearex + 2); 1777 nearex);
1796 memmove(nearex + 2, nearex + 1, len); 1778 nearex++;
1779 } else {
1780 /* Insert before */
1781 BUG_ON(newext->ee_block == nearex->ee_block);
1782 ext_debug("insert %u:%llu:[%d]%d after: "
1783 "nearest %p\n",
1784 le32_to_cpu(newext->ee_block),
1785 ext4_ext_pblock(newext),
1786 ext4_ext_is_uninitialized(newext),
1787 ext4_ext_get_actual_len(newext),
1788 nearex);
1789 }
1790 len = EXT_LAST_EXTENT(eh) - nearex + 1;
1791 if (len > 0) {
1792 ext_debug("insert %u:%llu:[%d]%d: "
1793 "move %d extents from 0x%p to 0x%p\n",
1794 le32_to_cpu(newext->ee_block),
1795 ext4_ext_pblock(newext),
1796 ext4_ext_is_uninitialized(newext),
1797 ext4_ext_get_actual_len(newext),
1798 len, nearex, nearex + 1);
1799 memmove(nearex + 1, nearex,
1800 len * sizeof(struct ext4_extent));
1797 } 1801 }
1798 path[depth].p_ext = nearex + 1;
1799 } else {
1800 BUG_ON(newext->ee_block == nearex->ee_block);
1801 len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1802 len = len < 0 ? 0 : len;
1803 ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
1804 "move %d from 0x%p to 0x%p\n",
1805 le32_to_cpu(newext->ee_block),
1806 ext4_ext_pblock(newext),
1807 ext4_ext_is_uninitialized(newext),
1808 ext4_ext_get_actual_len(newext),
1809 nearex, len, nearex, nearex + 1);
1810 memmove(nearex + 1, nearex, len);
1811 path[depth].p_ext = nearex;
1812 } 1802 }
1813 1803
1814 le16_add_cpu(&eh->eh_entries, 1); 1804 le16_add_cpu(&eh->eh_entries, 1);
1815 nearex = path[depth].p_ext; 1805 path[depth].p_ext = nearex;
1816 nearex->ee_block = newext->ee_block; 1806 nearex->ee_block = newext->ee_block;
1817 ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); 1807 ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
1818 nearex->ee_len = newext->ee_len; 1808 nearex->ee_len = newext->ee_len;
@@ -1962,6 +1952,7 @@ ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1962 struct ext4_ext_cache *cex; 1952 struct ext4_ext_cache *cex;
1963 BUG_ON(len == 0); 1953 BUG_ON(len == 0);
1964 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1954 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1955 trace_ext4_ext_put_in_cache(inode, block, len, start);
1965 cex = &EXT4_I(inode)->i_cached_extent; 1956 cex = &EXT4_I(inode)->i_cached_extent;
1966 cex->ec_block = block; 1957 cex->ec_block = block;
1967 cex->ec_len = len; 1958 cex->ec_len = len;
@@ -2063,6 +2054,7 @@ errout:
2063 sbi->extent_cache_misses++; 2054 sbi->extent_cache_misses++;
2064 else 2055 else
2065 sbi->extent_cache_hits++; 2056 sbi->extent_cache_hits++;
2057 trace_ext4_ext_in_cache(inode, block, ret);
2066 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 2058 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2067 return ret; 2059 return ret;
2068} 2060}
@@ -2130,6 +2122,8 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2130 if (err) 2122 if (err)
2131 return err; 2123 return err;
2132 ext_debug("index is empty, remove it, free block %llu\n", leaf); 2124 ext_debug("index is empty, remove it, free block %llu\n", leaf);
2125 trace_ext4_ext_rm_idx(inode, leaf);
2126
2133 ext4_free_blocks(handle, inode, NULL, leaf, 1, 2127 ext4_free_blocks(handle, inode, NULL, leaf, 1,
2134 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 2128 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2135 return err; 2129 return err;
@@ -2158,7 +2152,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2158 * need to account for leaf block credit 2152 * need to account for leaf block credit
2159 * 2153 *
2160 * bitmaps and block group descriptor blocks 2154 * bitmaps and block group descriptor blocks
2161 * and other metadat blocks still need to be 2155 * and other metadata blocks still need to be
2162 * accounted. 2156 * accounted.
2163 */ 2157 */
2164 /* 1 bitmap, 1 block group descriptor */ 2158 /* 1 bitmap, 1 block group descriptor */
@@ -2195,14 +2189,40 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
2195} 2189}
2196 2190
2197static int ext4_remove_blocks(handle_t *handle, struct inode *inode, 2191static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2198 struct ext4_extent *ex, 2192 struct ext4_extent *ex,
2199 ext4_lblk_t from, ext4_lblk_t to) 2193 ext4_fsblk_t *partial_cluster,
2194 ext4_lblk_t from, ext4_lblk_t to)
2200{ 2195{
2196 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2201 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2197 unsigned short ee_len = ext4_ext_get_actual_len(ex);
2198 ext4_fsblk_t pblk;
2202 int flags = EXT4_FREE_BLOCKS_FORGET; 2199 int flags = EXT4_FREE_BLOCKS_FORGET;
2203 2200
2204 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 2201 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2205 flags |= EXT4_FREE_BLOCKS_METADATA; 2202 flags |= EXT4_FREE_BLOCKS_METADATA;
2203 /*
2204 * For bigalloc file systems, we never free a partial cluster
2205 * at the beginning of the extent. Instead, we make a note
2206 * that we tried freeing the cluster, and check to see if we
2207 * need to free it on a subsequent call to ext4_remove_blocks,
2208 * or at the end of the ext4_truncate() operation.
2209 */
2210 flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2211
2212 trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
2213 /*
2214 * If we have a partial cluster, and it's different from the
2215 * cluster of the last block, we need to explicitly free the
2216 * partial cluster here.
2217 */
2218 pblk = ext4_ext_pblock(ex) + ee_len - 1;
2219 if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
2220 ext4_free_blocks(handle, inode, NULL,
2221 EXT4_C2B(sbi, *partial_cluster),
2222 sbi->s_cluster_ratio, flags);
2223 *partial_cluster = 0;
2224 }
2225
2206#ifdef EXTENTS_STATS 2226#ifdef EXTENTS_STATS
2207 { 2227 {
2208 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2228 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2222,12 +2242,24 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2222 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { 2242 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2223 /* tail removal */ 2243 /* tail removal */
2224 ext4_lblk_t num; 2244 ext4_lblk_t num;
2225 ext4_fsblk_t start;
2226 2245
2227 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2246 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2228 start = ext4_ext_pblock(ex) + ee_len - num; 2247 pblk = ext4_ext_pblock(ex) + ee_len - num;
2229 ext_debug("free last %u blocks starting %llu\n", num, start); 2248 ext_debug("free last %u blocks starting %llu\n", num, pblk);
2230 ext4_free_blocks(handle, inode, NULL, start, num, flags); 2249 ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2250 /*
2251 * If the block range to be freed didn't start at the
2252 * beginning of a cluster, and we removed the entire
2253 * extent, save the partial cluster here, since we
2254 * might need to delete if we determine that the
2255 * truncate operation has removed all of the blocks in
2256 * the cluster.
2257 */
2258 if (pblk & (sbi->s_cluster_ratio - 1) &&
2259 (ee_len == num))
2260 *partial_cluster = EXT4_B2C(sbi, pblk);
2261 else
2262 *partial_cluster = 0;
2231 } else if (from == le32_to_cpu(ex->ee_block) 2263 } else if (from == le32_to_cpu(ex->ee_block)
2232 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 2264 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2233 /* head removal */ 2265 /* head removal */
@@ -2238,7 +2270,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2238 start = ext4_ext_pblock(ex); 2270 start = ext4_ext_pblock(ex);
2239 2271
2240 ext_debug("free first %u blocks starting %llu\n", num, start); 2272 ext_debug("free first %u blocks starting %llu\n", num, start);
2241 ext4_free_blocks(handle, inode, 0, start, num, flags); 2273 ext4_free_blocks(handle, inode, NULL, start, num, flags);
2242 2274
2243 } else { 2275 } else {
2244 printk(KERN_INFO "strange request: removal(2) " 2276 printk(KERN_INFO "strange request: removal(2) "
@@ -2262,19 +2294,19 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2262 */ 2294 */
2263static int 2295static int
2264ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 2296ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2265 struct ext4_ext_path *path, ext4_lblk_t start, 2297 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
2266 ext4_lblk_t end) 2298 ext4_lblk_t start, ext4_lblk_t end)
2267{ 2299{
2300 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2268 int err = 0, correct_index = 0; 2301 int err = 0, correct_index = 0;
2269 int depth = ext_depth(inode), credits; 2302 int depth = ext_depth(inode), credits;
2270 struct ext4_extent_header *eh; 2303 struct ext4_extent_header *eh;
2271 ext4_lblk_t a, b, block; 2304 ext4_lblk_t a, b;
2272 unsigned num; 2305 unsigned num;
2273 ext4_lblk_t ex_ee_block; 2306 ext4_lblk_t ex_ee_block;
2274 unsigned short ex_ee_len; 2307 unsigned short ex_ee_len;
2275 unsigned uninitialized = 0; 2308 unsigned uninitialized = 0;
2276 struct ext4_extent *ex; 2309 struct ext4_extent *ex;
2277 struct ext4_map_blocks map;
2278 2310
2279 /* the header must be checked already in ext4_ext_remove_space() */ 2311 /* the header must be checked already in ext4_ext_remove_space() */
2280 ext_debug("truncate since %u in leaf\n", start); 2312 ext_debug("truncate since %u in leaf\n", start);
@@ -2291,6 +2323,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2291 ex_ee_block = le32_to_cpu(ex->ee_block); 2323 ex_ee_block = le32_to_cpu(ex->ee_block);
2292 ex_ee_len = ext4_ext_get_actual_len(ex); 2324 ex_ee_len = ext4_ext_get_actual_len(ex);
2293 2325
2326 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2327
2294 while (ex >= EXT_FIRST_EXTENT(eh) && 2328 while (ex >= EXT_FIRST_EXTENT(eh) &&
2295 ex_ee_block + ex_ee_len > start) { 2329 ex_ee_block + ex_ee_len > start) {
2296 2330
@@ -2315,86 +2349,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2315 ex_ee_block = le32_to_cpu(ex->ee_block); 2349 ex_ee_block = le32_to_cpu(ex->ee_block);
2316 ex_ee_len = ext4_ext_get_actual_len(ex); 2350 ex_ee_len = ext4_ext_get_actual_len(ex);
2317 continue; 2351 continue;
2318 } else if (a != ex_ee_block && 2352 } else if (b != ex_ee_block + ex_ee_len - 1) {
2319 b != ex_ee_block + ex_ee_len - 1) { 2353 EXT4_ERROR_INODE(inode," bad truncate %u:%u\n",
2320 /* 2354 start, end);
2321 * If this is a truncate, then this condition should 2355 err = -EIO;
2322 * never happen because at least one of the end points 2356 goto out;
2323 * needs to be on the edge of the extent.
2324 */
2325 if (end == EXT_MAX_BLOCKS - 1) {
2326 ext_debug(" bad truncate %u:%u\n",
2327 start, end);
2328 block = 0;
2329 num = 0;
2330 err = -EIO;
2331 goto out;
2332 }
2333 /*
2334 * else this is a hole punch, so the extent needs to
2335 * be split since neither edge of the hole is on the
2336 * extent edge
2337 */
2338 else{
2339 map.m_pblk = ext4_ext_pblock(ex);
2340 map.m_lblk = ex_ee_block;
2341 map.m_len = b - ex_ee_block;
2342
2343 err = ext4_split_extent(handle,
2344 inode, path, &map, 0,
2345 EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
2346 EXT4_GET_BLOCKS_PRE_IO);
2347
2348 if (err < 0)
2349 goto out;
2350
2351 ex_ee_len = ext4_ext_get_actual_len(ex);
2352
2353 b = ex_ee_block+ex_ee_len - 1 < end ?
2354 ex_ee_block+ex_ee_len - 1 : end;
2355
2356 /* Then remove tail of this extent */
2357 block = ex_ee_block;
2358 num = a - block;
2359 }
2360 } else if (a != ex_ee_block) { 2357 } else if (a != ex_ee_block) {
2361 /* remove tail of the extent */ 2358 /* remove tail of the extent */
2362 block = ex_ee_block; 2359 num = a - ex_ee_block;
2363 num = a - block;
2364 } else if (b != ex_ee_block + ex_ee_len - 1) {
2365 /* remove head of the extent */
2366 block = b;
2367 num = ex_ee_block + ex_ee_len - b;
2368
2369 /*
2370 * If this is a truncate, this condition
2371 * should never happen
2372 */
2373 if (end == EXT_MAX_BLOCKS - 1) {
2374 ext_debug(" bad truncate %u:%u\n",
2375 start, end);
2376 err = -EIO;
2377 goto out;
2378 }
2379 } else { 2360 } else {
2380 /* remove whole extent: excellent! */ 2361 /* remove whole extent: excellent! */
2381 block = ex_ee_block;
2382 num = 0; 2362 num = 0;
2383 if (a != ex_ee_block) {
2384 ext_debug(" bad truncate %u:%u\n",
2385 start, end);
2386 err = -EIO;
2387 goto out;
2388 }
2389
2390 if (b != ex_ee_block + ex_ee_len - 1) {
2391 ext_debug(" bad truncate %u:%u\n",
2392 start, end);
2393 err = -EIO;
2394 goto out;
2395 }
2396 } 2363 }
2397
2398 /* 2364 /*
2399 * 3 for leaf, sb, and inode plus 2 (bmap and group 2365 * 3 for leaf, sb, and inode plus 2 (bmap and group
2400 * descriptor) for each block group; assume two block 2366 * descriptor) for each block group; assume two block
@@ -2416,23 +2382,15 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2416 if (err) 2382 if (err)
2417 goto out; 2383 goto out;
2418 2384
2419 err = ext4_remove_blocks(handle, inode, ex, a, b); 2385 err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
2386 a, b);
2420 if (err) 2387 if (err)
2421 goto out; 2388 goto out;
2422 2389
2423 if (num == 0) { 2390 if (num == 0)
2424 /* this extent is removed; mark slot entirely unused */ 2391 /* this extent is removed; mark slot entirely unused */
2425 ext4_ext_store_pblock(ex, 0); 2392 ext4_ext_store_pblock(ex, 0);
2426 } else if (block != ex_ee_block) {
2427 /*
2428 * If this was a head removal, then we need to update
2429 * the physical block since it is now at a different
2430 * location
2431 */
2432 ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
2433 }
2434 2393
2435 ex->ee_block = cpu_to_le32(block);
2436 ex->ee_len = cpu_to_le16(num); 2394 ex->ee_len = cpu_to_le16(num);
2437 /* 2395 /*
2438 * Do not mark uninitialized if all the blocks in the 2396 * Do not mark uninitialized if all the blocks in the
@@ -2440,11 +2398,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2440 */ 2398 */
2441 if (uninitialized && num) 2399 if (uninitialized && num)
2442 ext4_ext_mark_uninitialized(ex); 2400 ext4_ext_mark_uninitialized(ex);
2443
2444 err = ext4_ext_dirty(handle, inode, path + depth);
2445 if (err)
2446 goto out;
2447
2448 /* 2401 /*
2449 * If the extent was completely released, 2402 * If the extent was completely released,
2450 * we need to remove it from the leaf 2403 * we need to remove it from the leaf
@@ -2464,9 +2417,14 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2464 sizeof(struct ext4_extent)); 2417 sizeof(struct ext4_extent));
2465 } 2418 }
2466 le16_add_cpu(&eh->eh_entries, -1); 2419 le16_add_cpu(&eh->eh_entries, -1);
2467 } 2420 } else
2421 *partial_cluster = 0;
2468 2422
2469 ext_debug("new extent: %u:%u:%llu\n", block, num, 2423 err = ext4_ext_dirty(handle, inode, path + depth);
2424 if (err)
2425 goto out;
2426
2427 ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
2470 ext4_ext_pblock(ex)); 2428 ext4_ext_pblock(ex));
2471 ex--; 2429 ex--;
2472 ex_ee_block = le32_to_cpu(ex->ee_block); 2430 ex_ee_block = le32_to_cpu(ex->ee_block);
@@ -2476,6 +2434,25 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2476 if (correct_index && eh->eh_entries) 2434 if (correct_index && eh->eh_entries)
2477 err = ext4_ext_correct_indexes(handle, inode, path); 2435 err = ext4_ext_correct_indexes(handle, inode, path);
2478 2436
2437 /*
2438 * If there is still a entry in the leaf node, check to see if
2439 * it references the partial cluster. This is the only place
2440 * where it could; if it doesn't, we can free the cluster.
2441 */
2442 if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
2443 (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
2444 *partial_cluster)) {
2445 int flags = EXT4_FREE_BLOCKS_FORGET;
2446
2447 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2448 flags |= EXT4_FREE_BLOCKS_METADATA;
2449
2450 ext4_free_blocks(handle, inode, NULL,
2451 EXT4_C2B(sbi, *partial_cluster),
2452 sbi->s_cluster_ratio, flags);
2453 *partial_cluster = 0;
2454 }
2455
2479 /* if this leaf is free, then we should 2456 /* if this leaf is free, then we should
2480 * remove it from index block above */ 2457 * remove it from index block above */
2481 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) 2458 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
@@ -2511,6 +2488,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2511 struct super_block *sb = inode->i_sb; 2488 struct super_block *sb = inode->i_sb;
2512 int depth = ext_depth(inode); 2489 int depth = ext_depth(inode);
2513 struct ext4_ext_path *path; 2490 struct ext4_ext_path *path;
2491 ext4_fsblk_t partial_cluster = 0;
2514 handle_t *handle; 2492 handle_t *handle;
2515 int i, err; 2493 int i, err;
2516 2494
@@ -2524,6 +2502,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2524again: 2502again:
2525 ext4_ext_invalidate_cache(inode); 2503 ext4_ext_invalidate_cache(inode);
2526 2504
2505 trace_ext4_ext_remove_space(inode, start, depth);
2506
2527 /* 2507 /*
2528 * We start scanning from right side, freeing all the blocks 2508 * We start scanning from right side, freeing all the blocks
2529 * after i_size and walking into the tree depth-wise. 2509 * after i_size and walking into the tree depth-wise.
@@ -2546,7 +2526,8 @@ again:
2546 if (i == depth) { 2526 if (i == depth) {
2547 /* this is leaf block */ 2527 /* this is leaf block */
2548 err = ext4_ext_rm_leaf(handle, inode, path, 2528 err = ext4_ext_rm_leaf(handle, inode, path,
2549 start, EXT_MAX_BLOCKS - 1); 2529 &partial_cluster, start,
2530 EXT_MAX_BLOCKS - 1);
2550 /* root level has p_bh == NULL, brelse() eats this */ 2531 /* root level has p_bh == NULL, brelse() eats this */
2551 brelse(path[i].p_bh); 2532 brelse(path[i].p_bh);
2552 path[i].p_bh = NULL; 2533 path[i].p_bh = NULL;
@@ -2618,6 +2599,24 @@ again:
2618 } 2599 }
2619 } 2600 }
2620 2601
2602 trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
2603 path->p_hdr->eh_entries);
2604
2605 /* If we still have something in the partial cluster and we have removed
2606 * even the first extent, then we should free the blocks in the partial
2607 * cluster as well. */
2608 if (partial_cluster && path->p_hdr->eh_entries == 0) {
2609 int flags = EXT4_FREE_BLOCKS_FORGET;
2610
2611 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2612 flags |= EXT4_FREE_BLOCKS_METADATA;
2613
2614 ext4_free_blocks(handle, inode, NULL,
2615 EXT4_C2B(EXT4_SB(sb), partial_cluster),
2616 EXT4_SB(sb)->s_cluster_ratio, flags);
2617 partial_cluster = 0;
2618 }
2619
2621 /* TODO: flexible tree reduction should be here */ 2620 /* TODO: flexible tree reduction should be here */
2622 if (path->p_hdr->eh_entries == 0) { 2621 if (path->p_hdr->eh_entries == 0) {
2623 /* 2622 /*
@@ -2909,17 +2908,29 @@ out:
2909 * a> There is no split required: Entire extent should be initialized 2908 * a> There is no split required: Entire extent should be initialized
2910 * b> Splits in two extents: Write is happening at either end of the extent 2909 * b> Splits in two extents: Write is happening at either end of the extent
2911 * c> Splits in three extents: Somone is writing in middle of the extent 2910 * c> Splits in three extents: Somone is writing in middle of the extent
2911 *
2912 * Pre-conditions:
2913 * - The extent pointed to by 'path' is uninitialized.
2914 * - The extent pointed to by 'path' contains a superset
2915 * of the logical span [map->m_lblk, map->m_lblk + map->m_len).
2916 *
2917 * Post-conditions on success:
2918 * - the returned value is the number of blocks beyond map->l_lblk
2919 * that are allocated and initialized.
2920 * It is guaranteed to be >= map->m_len.
2912 */ 2921 */
2913static int ext4_ext_convert_to_initialized(handle_t *handle, 2922static int ext4_ext_convert_to_initialized(handle_t *handle,
2914 struct inode *inode, 2923 struct inode *inode,
2915 struct ext4_map_blocks *map, 2924 struct ext4_map_blocks *map,
2916 struct ext4_ext_path *path) 2925 struct ext4_ext_path *path)
2917{ 2926{
2927 struct ext4_extent_header *eh;
2918 struct ext4_map_blocks split_map; 2928 struct ext4_map_blocks split_map;
2919 struct ext4_extent zero_ex; 2929 struct ext4_extent zero_ex;
2920 struct ext4_extent *ex; 2930 struct ext4_extent *ex;
2921 ext4_lblk_t ee_block, eof_block; 2931 ext4_lblk_t ee_block, eof_block;
2922 unsigned int allocated, ee_len, depth; 2932 unsigned int ee_len, depth;
2933 int allocated;
2923 int err = 0; 2934 int err = 0;
2924 int split_flag = 0; 2935 int split_flag = 0;
2925 2936
@@ -2933,11 +2944,93 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2933 eof_block = map->m_lblk + map->m_len; 2944 eof_block = map->m_lblk + map->m_len;
2934 2945
2935 depth = ext_depth(inode); 2946 depth = ext_depth(inode);
2947 eh = path[depth].p_hdr;
2936 ex = path[depth].p_ext; 2948 ex = path[depth].p_ext;
2937 ee_block = le32_to_cpu(ex->ee_block); 2949 ee_block = le32_to_cpu(ex->ee_block);
2938 ee_len = ext4_ext_get_actual_len(ex); 2950 ee_len = ext4_ext_get_actual_len(ex);
2939 allocated = ee_len - (map->m_lblk - ee_block); 2951 allocated = ee_len - (map->m_lblk - ee_block);
2940 2952
2953 trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
2954
2955 /* Pre-conditions */
2956 BUG_ON(!ext4_ext_is_uninitialized(ex));
2957 BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
2958 BUG_ON(map->m_lblk + map->m_len > ee_block + ee_len);
2959
2960 /*
2961 * Attempt to transfer newly initialized blocks from the currently
2962 * uninitialized extent to its left neighbor. This is much cheaper
2963 * than an insertion followed by a merge as those involve costly
2964 * memmove() calls. This is the common case in steady state for
2965 * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
2966 * writes.
2967 *
2968 * Limitations of the current logic:
2969 * - L1: we only deal with writes at the start of the extent.
2970 * The approach could be extended to writes at the end
2971 * of the extent but this scenario was deemed less common.
2972 * - L2: we do not deal with writes covering the whole extent.
2973 * This would require removing the extent if the transfer
2974 * is possible.
2975 * - L3: we only attempt to merge with an extent stored in the
2976 * same extent tree node.
2977 */
2978 if ((map->m_lblk == ee_block) && /*L1*/
2979 (map->m_len < ee_len) && /*L2*/
2980 (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/
2981 struct ext4_extent *prev_ex;
2982 ext4_lblk_t prev_lblk;
2983 ext4_fsblk_t prev_pblk, ee_pblk;
2984 unsigned int prev_len, write_len;
2985
2986 prev_ex = ex - 1;
2987 prev_lblk = le32_to_cpu(prev_ex->ee_block);
2988 prev_len = ext4_ext_get_actual_len(prev_ex);
2989 prev_pblk = ext4_ext_pblock(prev_ex);
2990 ee_pblk = ext4_ext_pblock(ex);
2991 write_len = map->m_len;
2992
2993 /*
2994 * A transfer of blocks from 'ex' to 'prev_ex' is allowed
2995 * upon those conditions:
2996 * - C1: prev_ex is initialized,
2997 * - C2: prev_ex is logically abutting ex,
2998 * - C3: prev_ex is physically abutting ex,
2999 * - C4: prev_ex can receive the additional blocks without
3000 * overflowing the (initialized) length limit.
3001 */
3002 if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/
3003 ((prev_lblk + prev_len) == ee_block) && /*C2*/
3004 ((prev_pblk + prev_len) == ee_pblk) && /*C3*/
3005 (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/
3006 err = ext4_ext_get_access(handle, inode, path + depth);
3007 if (err)
3008 goto out;
3009
3010 trace_ext4_ext_convert_to_initialized_fastpath(inode,
3011 map, ex, prev_ex);
3012
3013 /* Shift the start of ex by 'write_len' blocks */
3014 ex->ee_block = cpu_to_le32(ee_block + write_len);
3015 ext4_ext_store_pblock(ex, ee_pblk + write_len);
3016 ex->ee_len = cpu_to_le16(ee_len - write_len);
3017 ext4_ext_mark_uninitialized(ex); /* Restore the flag */
3018
3019 /* Extend prev_ex by 'write_len' blocks */
3020 prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
3021
3022 /* Mark the block containing both extents as dirty */
3023 ext4_ext_dirty(handle, inode, path + depth);
3024
3025 /* Update path to point to the right extent */
3026 path[depth].p_ext = prev_ex;
3027
3028 /* Result: number of initialized blocks past m_lblk */
3029 allocated = write_len;
3030 goto out;
3031 }
3032 }
3033
2941 WARN_ON(map->m_lblk < ee_block); 3034 WARN_ON(map->m_lblk < ee_block);
2942 /* 3035 /*
2943 * It is safe to convert extent to initialized via explicit 3036 * It is safe to convert extent to initialized via explicit
@@ -3165,6 +3258,192 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3165 return ext4_mark_inode_dirty(handle, inode); 3258 return ext4_mark_inode_dirty(handle, inode);
3166} 3259}
3167 3260
3261/**
3262 * ext4_find_delalloc_range: find delayed allocated block in the given range.
3263 *
3264 * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns
3265 * whether there are any buffers marked for delayed allocation. It returns '1'
3266 * on the first delalloc'ed buffer head found. If no buffer head in the given
3267 * range is marked for delalloc, it returns 0.
3268 * lblk_start should always be <= lblk_end.
3269 * search_hint_reverse is to indicate that searching in reverse from lblk_end to
3270 * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed
3271 * block sooner). This is useful when blocks are truncated sequentially from
3272 * lblk_start towards lblk_end.
3273 */
3274static int ext4_find_delalloc_range(struct inode *inode,
3275 ext4_lblk_t lblk_start,
3276 ext4_lblk_t lblk_end,
3277 int search_hint_reverse)
3278{
3279 struct address_space *mapping = inode->i_mapping;
3280 struct buffer_head *head, *bh = NULL;
3281 struct page *page;
3282 ext4_lblk_t i, pg_lblk;
3283 pgoff_t index;
3284
3285 /* reverse search wont work if fs block size is less than page size */
3286 if (inode->i_blkbits < PAGE_CACHE_SHIFT)
3287 search_hint_reverse = 0;
3288
3289 if (search_hint_reverse)
3290 i = lblk_end;
3291 else
3292 i = lblk_start;
3293
3294 index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
3295
3296 while ((i >= lblk_start) && (i <= lblk_end)) {
3297 page = find_get_page(mapping, index);
3298 if (!page)
3299 goto nextpage;
3300
3301 if (!page_has_buffers(page))
3302 goto nextpage;
3303
3304 head = page_buffers(page);
3305 if (!head)
3306 goto nextpage;
3307
3308 bh = head;
3309 pg_lblk = index << (PAGE_CACHE_SHIFT -
3310 inode->i_blkbits);
3311 do {
3312 if (unlikely(pg_lblk < lblk_start)) {
3313 /*
3314 * This is possible when fs block size is less
3315 * than page size and our cluster starts/ends in
3316 * middle of the page. So we need to skip the
3317 * initial few blocks till we reach the 'lblk'
3318 */
3319 pg_lblk++;
3320 continue;
3321 }
3322
3323 /* Check if the buffer is delayed allocated and that it
3324 * is not yet mapped. (when da-buffers are mapped during
3325 * their writeout, their da_mapped bit is set.)
3326 */
3327 if (buffer_delay(bh) && !buffer_da_mapped(bh)) {
3328 page_cache_release(page);
3329 trace_ext4_find_delalloc_range(inode,
3330 lblk_start, lblk_end,
3331 search_hint_reverse,
3332 1, i);
3333 return 1;
3334 }
3335 if (search_hint_reverse)
3336 i--;
3337 else
3338 i++;
3339 } while ((i >= lblk_start) && (i <= lblk_end) &&
3340 ((bh = bh->b_this_page) != head));
3341nextpage:
3342 if (page)
3343 page_cache_release(page);
3344 /*
3345 * Move to next page. 'i' will be the first lblk in the next
3346 * page.
3347 */
3348 if (search_hint_reverse)
3349 index--;
3350 else
3351 index++;
3352 i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
3353 }
3354
3355 trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end,
3356 search_hint_reverse, 0, 0);
3357 return 0;
3358}
3359
3360int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
3361 int search_hint_reverse)
3362{
3363 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3364 ext4_lblk_t lblk_start, lblk_end;
3365 lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
3366 lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
3367
3368 return ext4_find_delalloc_range(inode, lblk_start, lblk_end,
3369 search_hint_reverse);
3370}
3371
3372/**
3373 * Determines how many complete clusters (out of those specified by the 'map')
3374 * are under delalloc and were reserved quota for.
3375 * This function is called when we are writing out the blocks that were
3376 * originally written with their allocation delayed, but then the space was
3377 * allocated using fallocate() before the delayed allocation could be resolved.
3378 * The cases to look for are:
3379 * ('=' indicated delayed allocated blocks
3380 * '-' indicates non-delayed allocated blocks)
3381 * (a) partial clusters towards beginning and/or end outside of allocated range
3382 * are not delalloc'ed.
3383 * Ex:
3384 * |----c---=|====c====|====c====|===-c----|
3385 * |++++++ allocated ++++++|
3386 * ==> 4 complete clusters in above example
3387 *
3388 * (b) partial cluster (outside of allocated range) towards either end is
3389 * marked for delayed allocation. In this case, we will exclude that
3390 * cluster.
3391 * Ex:
3392 * |----====c========|========c========|
3393 * |++++++ allocated ++++++|
3394 * ==> 1 complete clusters in above example
3395 *
3396 * Ex:
3397 * |================c================|
3398 * |++++++ allocated ++++++|
3399 * ==> 0 complete clusters in above example
3400 *
3401 * The ext4_da_update_reserve_space will be called only if we
3402 * determine here that there were some "entire" clusters that span
3403 * this 'allocated' range.
3404 * In the non-bigalloc case, this function will just end up returning num_blks
3405 * without ever calling ext4_find_delalloc_range.
3406 */
3407static unsigned int
3408get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3409 unsigned int num_blks)
3410{
3411 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3412 ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
3413 ext4_lblk_t lblk_from, lblk_to, c_offset;
3414 unsigned int allocated_clusters = 0;
3415
3416 alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
3417 alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
3418
3419 /* max possible clusters for this allocation */
3420 allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
3421
3422 trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
3423
3424 /* Check towards left side */
3425 c_offset = lblk_start & (sbi->s_cluster_ratio - 1);
3426 if (c_offset) {
3427 lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
3428 lblk_to = lblk_from + c_offset - 1;
3429
3430 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
3431 allocated_clusters--;
3432 }
3433
3434 /* Now check towards right. */
3435 c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1);
3436 if (allocated_clusters && c_offset) {
3437 lblk_from = lblk_start + num_blks;
3438 lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
3439
3440 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
3441 allocated_clusters--;
3442 }
3443
3444 return allocated_clusters;
3445}
3446
3168static int 3447static int
3169ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3448ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3170 struct ext4_map_blocks *map, 3449 struct ext4_map_blocks *map,
@@ -3181,6 +3460,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3181 flags, allocated); 3460 flags, allocated);
3182 ext4_ext_show_leaf(inode, path); 3461 ext4_ext_show_leaf(inode, path);
3183 3462
3463 trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated,
3464 newblock);
3465
3184 /* get_block() before submit the IO, split the extent */ 3466 /* get_block() before submit the IO, split the extent */
3185 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3467 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3186 ret = ext4_split_unwritten_extents(handle, inode, map, 3468 ret = ext4_split_unwritten_extents(handle, inode, map,
@@ -3190,10 +3472,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3190 * that this IO needs to conversion to written when IO is 3472 * that this IO needs to conversion to written when IO is
3191 * completed 3473 * completed
3192 */ 3474 */
3193 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { 3475 if (io)
3194 io->flag = EXT4_IO_END_UNWRITTEN; 3476 ext4_set_io_unwritten_flag(inode, io);
3195 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); 3477 else
3196 } else
3197 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3478 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3198 if (ext4_should_dioread_nolock(inode)) 3479 if (ext4_should_dioread_nolock(inode))
3199 map->m_flags |= EXT4_MAP_UNINIT; 3480 map->m_flags |= EXT4_MAP_UNINIT;
@@ -3234,14 +3515,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3234 3515
3235 /* buffered write, writepage time, convert*/ 3516 /* buffered write, writepage time, convert*/
3236 ret = ext4_ext_convert_to_initialized(handle, inode, map, path); 3517 ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3237 if (ret >= 0) { 3518 if (ret >= 0)
3238 ext4_update_inode_fsync_trans(handle, inode, 1); 3519 ext4_update_inode_fsync_trans(handle, inode, 1);
3239 err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
3240 map->m_len);
3241 if (err < 0)
3242 goto out2;
3243 }
3244
3245out: 3520out:
3246 if (ret <= 0) { 3521 if (ret <= 0) {
3247 err = ret; 3522 err = ret;
@@ -3270,11 +3545,24 @@ out:
3270 * But fallocate would have already updated quota and block 3545 * But fallocate would have already updated quota and block
3271 * count for this offset. So cancel these reservation 3546 * count for this offset. So cancel these reservation
3272 */ 3547 */
3273 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 3548 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
3274 ext4_da_update_reserve_space(inode, allocated, 0); 3549 unsigned int reserved_clusters;
3550 reserved_clusters = get_reserved_cluster_alloc(inode,
3551 map->m_lblk, map->m_len);
3552 if (reserved_clusters)
3553 ext4_da_update_reserve_space(inode,
3554 reserved_clusters,
3555 0);
3556 }
3275 3557
3276map_out: 3558map_out:
3277 map->m_flags |= EXT4_MAP_MAPPED; 3559 map->m_flags |= EXT4_MAP_MAPPED;
3560 if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
3561 err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
3562 map->m_len);
3563 if (err < 0)
3564 goto out2;
3565 }
3278out1: 3566out1:
3279 if (allocated > map->m_len) 3567 if (allocated > map->m_len)
3280 allocated = map->m_len; 3568 allocated = map->m_len;
@@ -3290,6 +3578,111 @@ out2:
3290} 3578}
3291 3579
3292/* 3580/*
3581 * get_implied_cluster_alloc - check to see if the requested
3582 * allocation (in the map structure) overlaps with a cluster already
3583 * allocated in an extent.
3584 * @sb The filesystem superblock structure
3585 * @map The requested lblk->pblk mapping
3586 * @ex The extent structure which might contain an implied
3587 * cluster allocation
3588 *
3589 * This function is called by ext4_ext_map_blocks() after we failed to
3590 * find blocks that were already in the inode's extent tree. Hence,
3591 * we know that the beginning of the requested region cannot overlap
3592 * the extent from the inode's extent tree. There are three cases we
3593 * want to catch. The first is this case:
3594 *
3595 * |--- cluster # N--|
3596 * |--- extent ---| |---- requested region ---|
3597 * |==========|
3598 *
3599 * The second case that we need to test for is this one:
3600 *
3601 * |--------- cluster # N ----------------|
3602 * |--- requested region --| |------- extent ----|
3603 * |=======================|
3604 *
3605 * The third case is when the requested region lies between two extents
3606 * within the same cluster:
3607 * |------------- cluster # N-------------|
3608 * |----- ex -----| |---- ex_right ----|
3609 * |------ requested region ------|
3610 * |================|
3611 *
3612 * In each of the above cases, we need to set the map->m_pblk and
3613 * map->m_len so it corresponds to the return the extent labelled as
3614 * "|====|" from cluster #N, since it is already in use for data in
3615 * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to
3616 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
3617 * as a new "allocated" block region. Otherwise, we will return 0 and
3618 * ext4_ext_map_blocks() will then allocate one or more new clusters
3619 * by calling ext4_mb_new_blocks().
3620 */
3621static int get_implied_cluster_alloc(struct super_block *sb,
3622 struct ext4_map_blocks *map,
3623 struct ext4_extent *ex,
3624 struct ext4_ext_path *path)
3625{
3626 struct ext4_sb_info *sbi = EXT4_SB(sb);
3627 ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
3628 ext4_lblk_t ex_cluster_start, ex_cluster_end;
3629 ext4_lblk_t rr_cluster_start, rr_cluster_end;
3630 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3631 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3632 unsigned short ee_len = ext4_ext_get_actual_len(ex);
3633
3634 /* The extent passed in that we are trying to match */
3635 ex_cluster_start = EXT4_B2C(sbi, ee_block);
3636 ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
3637
3638 /* The requested region passed into ext4_map_blocks() */
3639 rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
3640 rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1);
3641
3642 if ((rr_cluster_start == ex_cluster_end) ||
3643 (rr_cluster_start == ex_cluster_start)) {
3644 if (rr_cluster_start == ex_cluster_end)
3645 ee_start += ee_len - 1;
3646 map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) +
3647 c_offset;
3648 map->m_len = min(map->m_len,
3649 (unsigned) sbi->s_cluster_ratio - c_offset);
3650 /*
3651 * Check for and handle this case:
3652 *
3653 * |--------- cluster # N-------------|
3654 * |------- extent ----|
3655 * |--- requested region ---|
3656 * |===========|
3657 */
3658
3659 if (map->m_lblk < ee_block)
3660 map->m_len = min(map->m_len, ee_block - map->m_lblk);
3661
3662 /*
3663 * Check for the case where there is already another allocated
3664 * block to the right of 'ex' but before the end of the cluster.
3665 *
3666 * |------------- cluster # N-------------|
3667 * |----- ex -----| |---- ex_right ----|
3668 * |------ requested region ------|
3669 * |================|
3670 */
3671 if (map->m_lblk > ee_block) {
3672 ext4_lblk_t next = ext4_ext_next_allocated_block(path);
3673 map->m_len = min(map->m_len, next - map->m_lblk);
3674 }
3675
3676 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
3677 return 1;
3678 }
3679
3680 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
3681 return 0;
3682}
3683
3684
3685/*
3293 * Block allocation/map/preallocation routine for extents based files 3686 * Block allocation/map/preallocation routine for extents based files
3294 * 3687 *
3295 * 3688 *
@@ -3311,15 +3704,17 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3311 struct ext4_map_blocks *map, int flags) 3704 struct ext4_map_blocks *map, int flags)
3312{ 3705{
3313 struct ext4_ext_path *path = NULL; 3706 struct ext4_ext_path *path = NULL;
3314 struct ext4_extent newex, *ex; 3707 struct ext4_extent newex, *ex, *ex2;
3708 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3315 ext4_fsblk_t newblock = 0; 3709 ext4_fsblk_t newblock = 0;
3316 int err = 0, depth, ret; 3710 int free_on_err = 0, err = 0, depth, ret;
3317 unsigned int allocated = 0; 3711 unsigned int allocated = 0, offset = 0;
3712 unsigned int allocated_clusters = 0;
3318 unsigned int punched_out = 0; 3713 unsigned int punched_out = 0;
3319 unsigned int result = 0; 3714 unsigned int result = 0;
3320 struct ext4_allocation_request ar; 3715 struct ext4_allocation_request ar;
3321 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3716 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3322 struct ext4_map_blocks punch_map; 3717 ext4_lblk_t cluster_offset;
3323 3718
3324 ext_debug("blocks %u/%u requested for inode %lu\n", 3719 ext_debug("blocks %u/%u requested for inode %lu\n",
3325 map->m_lblk, map->m_len, inode->i_ino); 3720 map->m_lblk, map->m_len, inode->i_ino);
@@ -3329,6 +3724,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3329 if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) && 3724 if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) &&
3330 ext4_ext_in_cache(inode, map->m_lblk, &newex)) { 3725 ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3331 if (!newex.ee_start_lo && !newex.ee_start_hi) { 3726 if (!newex.ee_start_lo && !newex.ee_start_hi) {
3727 if ((sbi->s_cluster_ratio > 1) &&
3728 ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
3729 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3730
3332 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3731 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3333 /* 3732 /*
3334 * block isn't allocated yet and 3733 * block isn't allocated yet and
@@ -3339,6 +3738,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3339 /* we should allocate requested block */ 3738 /* we should allocate requested block */
3340 } else { 3739 } else {
3341 /* block is already allocated */ 3740 /* block is already allocated */
3741 if (sbi->s_cluster_ratio > 1)
3742 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3342 newblock = map->m_lblk 3743 newblock = map->m_lblk
3343 - le32_to_cpu(newex.ee_block) 3744 - le32_to_cpu(newex.ee_block)
3344 + ext4_ext_pblock(&newex); 3745 + ext4_ext_pblock(&newex);
@@ -3384,8 +3785,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3384 * we split out initialized portions during a write. 3785 * we split out initialized portions during a write.
3385 */ 3786 */
3386 ee_len = ext4_ext_get_actual_len(ex); 3787 ee_len = ext4_ext_get_actual_len(ex);
3788
3789 trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
3790
3387 /* if found extent covers block, simply return it */ 3791 /* if found extent covers block, simply return it */
3388 if (in_range(map->m_lblk, ee_block, ee_len)) { 3792 if (in_range(map->m_lblk, ee_block, ee_len)) {
3793 struct ext4_map_blocks punch_map;
3794 ext4_fsblk_t partial_cluster = 0;
3795
3389 newblock = map->m_lblk - ee_block + ee_start; 3796 newblock = map->m_lblk - ee_block + ee_start;
3390 /* number of remaining blocks in the extent */ 3797 /* number of remaining blocks in the extent */
3391 allocated = ee_len - (map->m_lblk - ee_block); 3798 allocated = ee_len - (map->m_lblk - ee_block);
@@ -3469,7 +3876,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3469 ext4_ext_invalidate_cache(inode); 3876 ext4_ext_invalidate_cache(inode);
3470 3877
3471 err = ext4_ext_rm_leaf(handle, inode, path, 3878 err = ext4_ext_rm_leaf(handle, inode, path,
3472 map->m_lblk, map->m_lblk + punched_out); 3879 &partial_cluster, map->m_lblk,
3880 map->m_lblk + punched_out);
3473 3881
3474 if (!err && path->p_hdr->eh_entries == 0) { 3882 if (!err && path->p_hdr->eh_entries == 0) {
3475 /* 3883 /*
@@ -3492,6 +3900,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3492 } 3900 }
3493 } 3901 }
3494 3902
3903 if ((sbi->s_cluster_ratio > 1) &&
3904 ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
3905 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3906
3495 /* 3907 /*
3496 * requested block isn't allocated yet; 3908 * requested block isn't allocated yet;
3497 * we couldn't try to create block if create flag is zero 3909 * we couldn't try to create block if create flag is zero
@@ -3504,9 +3916,25 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3504 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); 3916 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3505 goto out2; 3917 goto out2;
3506 } 3918 }
3919
3507 /* 3920 /*
3508 * Okay, we need to do block allocation. 3921 * Okay, we need to do block allocation.
3509 */ 3922 */
3923 map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
3924 newex.ee_block = cpu_to_le32(map->m_lblk);
3925 cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
3926
3927 /*
3928 * If we are doing bigalloc, check to see if the extent returned
3929 * by ext4_ext_find_extent() implies a cluster we can use.
3930 */
3931 if (cluster_offset && ex &&
3932 get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
3933 ar.len = allocated = map->m_len;
3934 newblock = map->m_pblk;
3935 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3936 goto got_allocated_blocks;
3937 }
3510 3938
3511 /* find neighbour allocated blocks */ 3939 /* find neighbour allocated blocks */
3512 ar.lleft = map->m_lblk; 3940 ar.lleft = map->m_lblk;
@@ -3514,10 +3942,21 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3514 if (err) 3942 if (err)
3515 goto out2; 3943 goto out2;
3516 ar.lright = map->m_lblk; 3944 ar.lright = map->m_lblk;
3517 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); 3945 ex2 = NULL;
3946 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
3518 if (err) 3947 if (err)
3519 goto out2; 3948 goto out2;
3520 3949
3950 /* Check if the extent after searching to the right implies a
3951 * cluster we can use. */
3952 if ((sbi->s_cluster_ratio > 1) && ex2 &&
3953 get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
3954 ar.len = allocated = map->m_len;
3955 newblock = map->m_pblk;
3956 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3957 goto got_allocated_blocks;
3958 }
3959
3521 /* 3960 /*
3522 * See if request is beyond maximum number of blocks we can have in 3961 * See if request is beyond maximum number of blocks we can have in
3523 * a single extent. For an initialized extent this limit is 3962 * a single extent. For an initialized extent this limit is
@@ -3532,9 +3971,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3532 map->m_len = EXT_UNINIT_MAX_LEN; 3971 map->m_len = EXT_UNINIT_MAX_LEN;
3533 3972
3534 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ 3973 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
3535 newex.ee_block = cpu_to_le32(map->m_lblk);
3536 newex.ee_len = cpu_to_le16(map->m_len); 3974 newex.ee_len = cpu_to_le16(map->m_len);
3537 err = ext4_ext_check_overlap(inode, &newex, path); 3975 err = ext4_ext_check_overlap(sbi, inode, &newex, path);
3538 if (err) 3976 if (err)
3539 allocated = ext4_ext_get_actual_len(&newex); 3977 allocated = ext4_ext_get_actual_len(&newex);
3540 else 3978 else
@@ -3544,7 +3982,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3544 ar.inode = inode; 3982 ar.inode = inode;
3545 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); 3983 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
3546 ar.logical = map->m_lblk; 3984 ar.logical = map->m_lblk;
3547 ar.len = allocated; 3985 /*
3986 * We calculate the offset from the beginning of the cluster
3987 * for the logical block number, since when we allocate a
3988 * physical cluster, the physical block should start at the
3989 * same offset from the beginning of the cluster. This is
3990 * needed so that future calls to get_implied_cluster_alloc()
3991 * work correctly.
3992 */
3993 offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
3994 ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
3995 ar.goal -= offset;
3996 ar.logical -= offset;
3548 if (S_ISREG(inode->i_mode)) 3997 if (S_ISREG(inode->i_mode))
3549 ar.flags = EXT4_MB_HINT_DATA; 3998 ar.flags = EXT4_MB_HINT_DATA;
3550 else 3999 else
@@ -3557,9 +4006,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3557 goto out2; 4006 goto out2;
3558 ext_debug("allocate new block: goal %llu, found %llu/%u\n", 4007 ext_debug("allocate new block: goal %llu, found %llu/%u\n",
3559 ar.goal, newblock, allocated); 4008 ar.goal, newblock, allocated);
4009 free_on_err = 1;
4010 allocated_clusters = ar.len;
4011 ar.len = EXT4_C2B(sbi, ar.len) - offset;
4012 if (ar.len > allocated)
4013 ar.len = allocated;
3560 4014
4015got_allocated_blocks:
3561 /* try to insert new extent into found leaf and return */ 4016 /* try to insert new extent into found leaf and return */
3562 ext4_ext_store_pblock(&newex, newblock); 4017 ext4_ext_store_pblock(&newex, newblock + offset);
3563 newex.ee_len = cpu_to_le16(ar.len); 4018 newex.ee_len = cpu_to_le16(ar.len);
3564 /* Mark uninitialized */ 4019 /* Mark uninitialized */
3565 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ 4020 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
@@ -3572,10 +4027,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3572 * that we need to perform conversion when IO is done. 4027 * that we need to perform conversion when IO is done.
3573 */ 4028 */
3574 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 4029 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3575 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { 4030 if (io)
3576 io->flag = EXT4_IO_END_UNWRITTEN; 4031 ext4_set_io_unwritten_flag(inode, io);
3577 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); 4032 else
3578 } else
3579 ext4_set_inode_state(inode, 4033 ext4_set_inode_state(inode,
3580 EXT4_STATE_DIO_UNWRITTEN); 4034 EXT4_STATE_DIO_UNWRITTEN);
3581 } 4035 }
@@ -3583,11 +4037,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3583 map->m_flags |= EXT4_MAP_UNINIT; 4037 map->m_flags |= EXT4_MAP_UNINIT;
3584 } 4038 }
3585 4039
3586 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); 4040 err = 0;
4041 if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
4042 err = check_eofblocks_fl(handle, inode, map->m_lblk,
4043 path, ar.len);
3587 if (!err) 4044 if (!err)
3588 err = ext4_ext_insert_extent(handle, inode, path, 4045 err = ext4_ext_insert_extent(handle, inode, path,
3589 &newex, flags); 4046 &newex, flags);
3590 if (err) { 4047 if (err && free_on_err) {
3591 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? 4048 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
3592 EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; 4049 EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
3593 /* free data blocks we just allocated */ 4050 /* free data blocks we just allocated */
@@ -3610,8 +4067,82 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3610 * Update reserved blocks/metadata blocks after successful 4067 * Update reserved blocks/metadata blocks after successful
3611 * block allocation which had been deferred till now. 4068 * block allocation which had been deferred till now.
3612 */ 4069 */
3613 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 4070 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
3614 ext4_da_update_reserve_space(inode, allocated, 1); 4071 unsigned int reserved_clusters;
4072 /*
4073 * Check how many clusters we had reserved this allocated range
4074 */
4075 reserved_clusters = get_reserved_cluster_alloc(inode,
4076 map->m_lblk, allocated);
4077 if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
4078 if (reserved_clusters) {
4079 /*
4080 * We have clusters reserved for this range.
4081 * But since we are not doing actual allocation
4082 * and are simply using blocks from previously
4083 * allocated cluster, we should release the
4084 * reservation and not claim quota.
4085 */
4086 ext4_da_update_reserve_space(inode,
4087 reserved_clusters, 0);
4088 }
4089 } else {
4090 BUG_ON(allocated_clusters < reserved_clusters);
4091 /* We will claim quota for all newly allocated blocks.*/
4092 ext4_da_update_reserve_space(inode, allocated_clusters,
4093 1);
4094 if (reserved_clusters < allocated_clusters) {
4095 struct ext4_inode_info *ei = EXT4_I(inode);
4096 int reservation = allocated_clusters -
4097 reserved_clusters;
4098 /*
4099 * It seems we claimed few clusters outside of
4100 * the range of this allocation. We should give
4101 * it back to the reservation pool. This can
4102 * happen in the following case:
4103 *
4104 * * Suppose s_cluster_ratio is 4 (i.e., each
4105 * cluster has 4 blocks. Thus, the clusters
4106 * are [0-3],[4-7],[8-11]...
4107 * * First comes delayed allocation write for
4108 * logical blocks 10 & 11. Since there were no
4109 * previous delayed allocated blocks in the
4110 * range [8-11], we would reserve 1 cluster
4111 * for this write.
4112 * * Next comes write for logical blocks 3 to 8.
4113 * In this case, we will reserve 2 clusters
4114 * (for [0-3] and [4-7]; and not for [8-11] as
4115 * that range has a delayed allocated blocks.
4116 * Thus total reserved clusters now becomes 3.
4117 * * Now, during the delayed allocation writeout
4118 * time, we will first write blocks [3-8] and
4119 * allocate 3 clusters for writing these
4120 * blocks. Also, we would claim all these
4121 * three clusters above.
4122 * * Now when we come here to writeout the
4123 * blocks [10-11], we would expect to claim
4124 * the reservation of 1 cluster we had made
4125 * (and we would claim it since there are no
4126 * more delayed allocated blocks in the range
4127 * [8-11]. But our reserved cluster count had
4128 * already gone to 0.
4129 *
4130 * Thus, at the step 4 above when we determine
4131 * that there are still some unwritten delayed
4132 * allocated blocks outside of our current
4133 * block range, we should increment the
4134 * reserved clusters count so that when the
4135 * remaining blocks finally gets written, we
4136 * could claim them.
4137 */
4138 dquot_reserve_block(inode,
4139 EXT4_C2B(sbi, reservation));
4140 spin_lock(&ei->i_block_reservation_lock);
4141 ei->i_reserved_data_blocks += reservation;
4142 spin_unlock(&ei->i_block_reservation_lock);
4143 }
4144 }
4145 }
3615 4146
3616 /* 4147 /*
3617 * Cache the extent and update transaction to commit on fdatasync only 4148 * Cache the extent and update transaction to commit on fdatasync only
@@ -3634,12 +4165,12 @@ out2:
3634 ext4_ext_drop_refs(path); 4165 ext4_ext_drop_refs(path);
3635 kfree(path); 4166 kfree(path);
3636 } 4167 }
3637 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
3638 newblock, map->m_len, err ? err : allocated);
3639
3640 result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ? 4168 result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
3641 punched_out : allocated; 4169 punched_out : allocated;
3642 4170
4171 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
4172 newblock, map->m_len, err ? err : result);
4173
3643 return err ? err : result; 4174 return err ? err : result;
3644} 4175}
3645 4176
@@ -3649,6 +4180,7 @@ void ext4_ext_truncate(struct inode *inode)
3649 struct super_block *sb = inode->i_sb; 4180 struct super_block *sb = inode->i_sb;
3650 ext4_lblk_t last_block; 4181 ext4_lblk_t last_block;
3651 handle_t *handle; 4182 handle_t *handle;
4183 loff_t page_len;
3652 int err = 0; 4184 int err = 0;
3653 4185
3654 /* 4186 /*
@@ -3665,8 +4197,16 @@ void ext4_ext_truncate(struct inode *inode)
3665 if (IS_ERR(handle)) 4197 if (IS_ERR(handle))
3666 return; 4198 return;
3667 4199
3668 if (inode->i_size & (sb->s_blocksize - 1)) 4200 if (inode->i_size % PAGE_CACHE_SIZE != 0) {
3669 ext4_block_truncate_page(handle, mapping, inode->i_size); 4201 page_len = PAGE_CACHE_SIZE -
4202 (inode->i_size & (PAGE_CACHE_SIZE - 1));
4203
4204 err = ext4_discard_partial_page_buffers(handle,
4205 mapping, inode->i_size, page_len, 0);
4206
4207 if (err)
4208 goto out_stop;
4209 }
3670 4210
3671 if (ext4_orphan_add(handle, inode)) 4211 if (ext4_orphan_add(handle, inode))
3672 goto out_stop; 4212 goto out_stop;
@@ -3760,6 +4300,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3760 int ret = 0; 4300 int ret = 0;
3761 int ret2 = 0; 4301 int ret2 = 0;
3762 int retries = 0; 4302 int retries = 0;
4303 int flags;
3763 struct ext4_map_blocks map; 4304 struct ext4_map_blocks map;
3764 unsigned int credits, blkbits = inode->i_blkbits; 4305 unsigned int credits, blkbits = inode->i_blkbits;
3765 4306
@@ -3796,6 +4337,16 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3796 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); 4337 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
3797 return ret; 4338 return ret;
3798 } 4339 }
4340 flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
4341 if (mode & FALLOC_FL_KEEP_SIZE)
4342 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4343 /*
4344 * Don't normalize the request if it can fit in one extent so
4345 * that it doesn't get unnecessarily split into multiple
4346 * extents.
4347 */
4348 if (len <= EXT_UNINIT_MAX_LEN << blkbits)
4349 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
3799retry: 4350retry:
3800 while (ret >= 0 && ret < max_blocks) { 4351 while (ret >= 0 && ret < max_blocks) {
3801 map.m_lblk = map.m_lblk + ret; 4352 map.m_lblk = map.m_lblk + ret;
@@ -3805,9 +4356,7 @@ retry:
3805 ret = PTR_ERR(handle); 4356 ret = PTR_ERR(handle);
3806 break; 4357 break;
3807 } 4358 }
3808 ret = ext4_map_blocks(handle, inode, &map, 4359 ret = ext4_map_blocks(handle, inode, &map, flags);
3809 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
3810 EXT4_GET_BLOCKS_NO_NORMALIZE);
3811 if (ret <= 0) { 4360 if (ret <= 0) {
3812#ifdef EXT4FS_DEBUG 4361#ifdef EXT4FS_DEBUG
3813 WARN_ON(ret <= 0); 4362 WARN_ON(ret <= 0);
@@ -4102,7 +4651,6 @@ found_delayed_extent:
4102 return EXT_BREAK; 4651 return EXT_BREAK;
4103 return EXT_CONTINUE; 4652 return EXT_CONTINUE;
4104} 4653}
4105
4106/* fiemap flags we can handle specified here */ 4654/* fiemap flags we can handle specified here */
4107#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 4655#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
4108 4656
@@ -4162,17 +4710,28 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4162 struct address_space *mapping = inode->i_mapping; 4710 struct address_space *mapping = inode->i_mapping;
4163 struct ext4_map_blocks map; 4711 struct ext4_map_blocks map;
4164 handle_t *handle; 4712 handle_t *handle;
4165 loff_t first_block_offset, last_block_offset, block_len; 4713 loff_t first_page, last_page, page_len;
4166 loff_t first_page, last_page, first_page_offset, last_page_offset; 4714 loff_t first_page_offset, last_page_offset;
4167 int ret, credits, blocks_released, err = 0; 4715 int ret, credits, blocks_released, err = 0;
4168 4716
4717 /* No need to punch hole beyond i_size */
4718 if (offset >= inode->i_size)
4719 return 0;
4720
4721 /*
4722 * If the hole extends beyond i_size, set the hole
4723 * to end after the page that contains i_size
4724 */
4725 if (offset + length > inode->i_size) {
4726 length = inode->i_size +
4727 PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
4728 offset;
4729 }
4730
4169 first_block = (offset + sb->s_blocksize - 1) >> 4731 first_block = (offset + sb->s_blocksize - 1) >>
4170 EXT4_BLOCK_SIZE_BITS(sb); 4732 EXT4_BLOCK_SIZE_BITS(sb);
4171 last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); 4733 last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
4172 4734
4173 first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb);
4174 last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb);
4175
4176 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 4735 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
4177 last_page = (offset + length) >> PAGE_CACHE_SHIFT; 4736 last_page = (offset + length) >> PAGE_CACHE_SHIFT;
4178 4737
@@ -4185,11 +4744,10 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4185 */ 4744 */
4186 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 4745 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4187 err = filemap_write_and_wait_range(mapping, 4746 err = filemap_write_and_wait_range(mapping,
4188 first_page_offset == 0 ? 0 : first_page_offset-1, 4747 offset, offset + length - 1);
4189 last_page_offset);
4190 4748
4191 if (err) 4749 if (err)
4192 return err; 4750 return err;
4193 } 4751 }
4194 4752
4195 /* Now release the pages */ 4753 /* Now release the pages */
@@ -4211,24 +4769,64 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4211 goto out; 4769 goto out;
4212 4770
4213 /* 4771 /*
4214 * Now we need to zero out the un block aligned data. 4772 * Now we need to zero out the non-page-aligned data in the
4215 * If the file is smaller than a block, just 4773 * pages at the start and tail of the hole, and unmap the buffer
4216 * zero out the middle 4774 * heads for the block aligned regions of the page that were
4775 * completely zeroed.
4217 */ 4776 */
4218 if (first_block > last_block) 4777 if (first_page > last_page) {
4219 ext4_block_zero_page_range(handle, mapping, offset, length); 4778 /*
4220 else { 4779 * If the file space being truncated is contained within a page
4221 /* zero out the head of the hole before the first block */ 4780 * just zero out and unmap the middle of that page
4222 block_len = first_block_offset - offset; 4781 */
4223 if (block_len > 0) 4782 err = ext4_discard_partial_page_buffers(handle,
4224 ext4_block_zero_page_range(handle, mapping, 4783 mapping, offset, length, 0);
4225 offset, block_len); 4784
4226 4785 if (err)
4227 /* zero out the tail of the hole after the last block */ 4786 goto out;
4228 block_len = offset + length - last_block_offset; 4787 } else {
4229 if (block_len > 0) { 4788 /*
4230 ext4_block_zero_page_range(handle, mapping, 4789 * zero out and unmap the partial page that contains
4231 last_block_offset, block_len); 4790 * the start of the hole
4791 */
4792 page_len = first_page_offset - offset;
4793 if (page_len > 0) {
4794 err = ext4_discard_partial_page_buffers(handle, mapping,
4795 offset, page_len, 0);
4796 if (err)
4797 goto out;
4798 }
4799
4800 /*
4801 * zero out and unmap the partial page that contains
4802 * the end of the hole
4803 */
4804 page_len = offset + length - last_page_offset;
4805 if (page_len > 0) {
4806 err = ext4_discard_partial_page_buffers(handle, mapping,
4807 last_page_offset, page_len, 0);
4808 if (err)
4809 goto out;
4810 }
4811 }
4812
4813
4814 /*
4815 * If i_size is contained in the last page, we need to
4816 * unmap and zero the partial page after i_size
4817 */
4818 if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
4819 inode->i_size % PAGE_CACHE_SIZE != 0) {
4820
4821 page_len = PAGE_CACHE_SIZE -
4822 (inode->i_size & (PAGE_CACHE_SIZE - 1));
4823
4824 if (page_len > 0) {
4825 err = ext4_discard_partial_page_buffers(handle,
4826 mapping, inode->i_size, page_len, 0);
4827
4828 if (err)
4829 goto out;
4232 } 4830 }
4233 } 4831 }
4234 4832
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index e4095e988eb..cb70f1812a7 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -181,8 +181,8 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
181 path.dentry = mnt->mnt_root; 181 path.dentry = mnt->mnt_root;
182 cp = d_path(&path, buf, sizeof(buf)); 182 cp = d_path(&path, buf, sizeof(buf));
183 if (!IS_ERR(cp)) { 183 if (!IS_ERR(cp)) {
184 memcpy(sbi->s_es->s_last_mounted, cp, 184 strlcpy(sbi->s_es->s_last_mounted, cp,
185 sizeof(sbi->s_es->s_last_mounted)); 185 sizeof(sbi->s_es->s_last_mounted));
186 ext4_mark_super_dirty(sb); 186 ext4_mark_super_dirty(sb);
187 } 187 }
188 } 188 }
@@ -224,53 +224,8 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
224 maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; 224 maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
225 else 225 else
226 maxbytes = inode->i_sb->s_maxbytes; 226 maxbytes = inode->i_sb->s_maxbytes;
227 mutex_lock(&inode->i_mutex);
228 switch (origin) {
229 case SEEK_END:
230 offset += inode->i_size;
231 break;
232 case SEEK_CUR:
233 if (offset == 0) {
234 mutex_unlock(&inode->i_mutex);
235 return file->f_pos;
236 }
237 offset += file->f_pos;
238 break;
239 case SEEK_DATA:
240 /*
241 * In the generic case the entire file is data, so as long as
242 * offset isn't at the end of the file then the offset is data.
243 */
244 if (offset >= inode->i_size) {
245 mutex_unlock(&inode->i_mutex);
246 return -ENXIO;
247 }
248 break;
249 case SEEK_HOLE:
250 /*
251 * There is a virtual hole at the end of the file, so as long as
252 * offset isn't i_size or larger, return i_size.
253 */
254 if (offset >= inode->i_size) {
255 mutex_unlock(&inode->i_mutex);
256 return -ENXIO;
257 }
258 offset = inode->i_size;
259 break;
260 }
261
262 if (offset < 0 || offset > maxbytes) {
263 mutex_unlock(&inode->i_mutex);
264 return -EINVAL;
265 }
266
267 if (offset != file->f_pos) {
268 file->f_pos = offset;
269 file->f_version = 0;
270 }
271 mutex_unlock(&inode->i_mutex);
272 227
273 return offset; 228 return generic_file_llseek_size(file, offset, origin, maxbytes);
274} 229}
275 230
276const struct file_operations ext4_file_operations = { 231const struct file_operations ext4_file_operations = {
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 036f78f7a1e..00a2cb753ef 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -75,7 +75,7 @@ static void dump_completed_IO(struct inode * inode)
75 * to written. 75 * to written.
76 * The function return the number of pending IOs on success. 76 * The function return the number of pending IOs on success.
77 */ 77 */
78extern int ext4_flush_completed_IO(struct inode *inode) 78int ext4_flush_completed_IO(struct inode *inode)
79{ 79{
80 ext4_io_end_t *io; 80 ext4_io_end_t *io;
81 struct ext4_inode_info *ei = EXT4_I(inode); 81 struct ext4_inode_info *ei = EXT4_I(inode);
@@ -83,14 +83,12 @@ extern int ext4_flush_completed_IO(struct inode *inode)
83 int ret = 0; 83 int ret = 0;
84 int ret2 = 0; 84 int ret2 = 0;
85 85
86 if (list_empty(&ei->i_completed_io_list))
87 return ret;
88
89 dump_completed_IO(inode); 86 dump_completed_IO(inode);
90 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 87 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
91 while (!list_empty(&ei->i_completed_io_list)){ 88 while (!list_empty(&ei->i_completed_io_list)){
92 io = list_entry(ei->i_completed_io_list.next, 89 io = list_entry(ei->i_completed_io_list.next,
93 ext4_io_end_t, list); 90 ext4_io_end_t, list);
91 list_del_init(&io->list);
94 /* 92 /*
95 * Calling ext4_end_io_nolock() to convert completed 93 * Calling ext4_end_io_nolock() to convert completed
96 * IO to written. 94 * IO to written.
@@ -107,11 +105,9 @@ extern int ext4_flush_completed_IO(struct inode *inode)
107 */ 105 */
108 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 106 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
109 ret = ext4_end_io_nolock(io); 107 ret = ext4_end_io_nolock(io);
110 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
111 if (ret < 0) 108 if (ret < 0)
112 ret2 = ret; 109 ret2 = ret;
113 else 110 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
114 list_del_init(&io->list);
115 } 111 }
116 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 112 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
117 return (ret2 < 0) ? ret2 : 0; 113 return (ret2 < 0) ? ret2 : 0;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 9c63f273b55..00beb4f9cc4 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -78,7 +78,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
78 * allocation, essentially implementing a per-group read-only flag. */ 78 * allocation, essentially implementing a per-group read-only flag. */
79 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 79 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
80 ext4_error(sb, "Checksum bad for group %u", block_group); 80 ext4_error(sb, "Checksum bad for group %u", block_group);
81 ext4_free_blks_set(sb, gdp, 0); 81 ext4_free_group_clusters_set(sb, gdp, 0);
82 ext4_free_inodes_set(sb, gdp, 0); 82 ext4_free_inodes_set(sb, gdp, 0);
83 ext4_itable_unused_set(sb, gdp, 0); 83 ext4_itable_unused_set(sb, gdp, 0);
84 memset(bh->b_data, 0xff, sb->s_blocksize); 84 memset(bh->b_data, 0xff, sb->s_blocksize);
@@ -293,121 +293,9 @@ error_return:
293 ext4_std_error(sb, fatal); 293 ext4_std_error(sb, fatal);
294} 294}
295 295
296/*
297 * There are two policies for allocating an inode. If the new inode is
298 * a directory, then a forward search is made for a block group with both
299 * free space and a low directory-to-inode ratio; if that fails, then of
300 * the groups with above-average free space, that group with the fewest
301 * directories already is chosen.
302 *
303 * For other inodes, search forward from the parent directory\'s block
304 * group to find a free inode.
305 */
306static int find_group_dir(struct super_block *sb, struct inode *parent,
307 ext4_group_t *best_group)
308{
309 ext4_group_t ngroups = ext4_get_groups_count(sb);
310 unsigned int freei, avefreei;
311 struct ext4_group_desc *desc, *best_desc = NULL;
312 ext4_group_t group;
313 int ret = -1;
314
315 freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
316 avefreei = freei / ngroups;
317
318 for (group = 0; group < ngroups; group++) {
319 desc = ext4_get_group_desc(sb, group, NULL);
320 if (!desc || !ext4_free_inodes_count(sb, desc))
321 continue;
322 if (ext4_free_inodes_count(sb, desc) < avefreei)
323 continue;
324 if (!best_desc ||
325 (ext4_free_blks_count(sb, desc) >
326 ext4_free_blks_count(sb, best_desc))) {
327 *best_group = group;
328 best_desc = desc;
329 ret = 0;
330 }
331 }
332 return ret;
333}
334
335#define free_block_ratio 10
336
337static int find_group_flex(struct super_block *sb, struct inode *parent,
338 ext4_group_t *best_group)
339{
340 struct ext4_sb_info *sbi = EXT4_SB(sb);
341 struct ext4_group_desc *desc;
342 struct flex_groups *flex_group = sbi->s_flex_groups;
343 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
344 ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
345 ext4_group_t ngroups = ext4_get_groups_count(sb);
346 int flex_size = ext4_flex_bg_size(sbi);
347 ext4_group_t best_flex = parent_fbg_group;
348 int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
349 int flexbg_free_blocks;
350 int flex_freeb_ratio;
351 ext4_group_t n_fbg_groups;
352 ext4_group_t i;
353
354 n_fbg_groups = (ngroups + flex_size - 1) >>
355 sbi->s_log_groups_per_flex;
356
357find_close_to_parent:
358 flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
359 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
360 if (atomic_read(&flex_group[best_flex].free_inodes) &&
361 flex_freeb_ratio > free_block_ratio)
362 goto found_flexbg;
363
364 if (best_flex && best_flex == parent_fbg_group) {
365 best_flex--;
366 goto find_close_to_parent;
367 }
368
369 for (i = 0; i < n_fbg_groups; i++) {
370 if (i == parent_fbg_group || i == parent_fbg_group - 1)
371 continue;
372
373 flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
374 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
375
376 if (flex_freeb_ratio > free_block_ratio &&
377 (atomic_read(&flex_group[i].free_inodes))) {
378 best_flex = i;
379 goto found_flexbg;
380 }
381
382 if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) ||
383 ((atomic_read(&flex_group[i].free_blocks) >
384 atomic_read(&flex_group[best_flex].free_blocks)) &&
385 atomic_read(&flex_group[i].free_inodes)))
386 best_flex = i;
387 }
388
389 if (!atomic_read(&flex_group[best_flex].free_inodes) ||
390 !atomic_read(&flex_group[best_flex].free_blocks))
391 return -1;
392
393found_flexbg:
394 for (i = best_flex * flex_size; i < ngroups &&
395 i < (best_flex + 1) * flex_size; i++) {
396 desc = ext4_get_group_desc(sb, i, NULL);
397 if (ext4_free_inodes_count(sb, desc)) {
398 *best_group = i;
399 goto out;
400 }
401 }
402
403 return -1;
404out:
405 return 0;
406}
407
408struct orlov_stats { 296struct orlov_stats {
409 __u32 free_inodes; 297 __u32 free_inodes;
410 __u32 free_blocks; 298 __u32 free_clusters;
411 __u32 used_dirs; 299 __u32 used_dirs;
412}; 300};
413 301
@@ -424,7 +312,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
424 312
425 if (flex_size > 1) { 313 if (flex_size > 1) {
426 stats->free_inodes = atomic_read(&flex_group[g].free_inodes); 314 stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
427 stats->free_blocks = atomic_read(&flex_group[g].free_blocks); 315 stats->free_clusters = atomic_read(&flex_group[g].free_clusters);
428 stats->used_dirs = atomic_read(&flex_group[g].used_dirs); 316 stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
429 return; 317 return;
430 } 318 }
@@ -432,11 +320,11 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
432 desc = ext4_get_group_desc(sb, g, NULL); 320 desc = ext4_get_group_desc(sb, g, NULL);
433 if (desc) { 321 if (desc) {
434 stats->free_inodes = ext4_free_inodes_count(sb, desc); 322 stats->free_inodes = ext4_free_inodes_count(sb, desc);
435 stats->free_blocks = ext4_free_blks_count(sb, desc); 323 stats->free_clusters = ext4_free_group_clusters(sb, desc);
436 stats->used_dirs = ext4_used_dirs_count(sb, desc); 324 stats->used_dirs = ext4_used_dirs_count(sb, desc);
437 } else { 325 } else {
438 stats->free_inodes = 0; 326 stats->free_inodes = 0;
439 stats->free_blocks = 0; 327 stats->free_clusters = 0;
440 stats->used_dirs = 0; 328 stats->used_dirs = 0;
441 } 329 }
442} 330}
@@ -471,10 +359,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
471 ext4_group_t real_ngroups = ext4_get_groups_count(sb); 359 ext4_group_t real_ngroups = ext4_get_groups_count(sb);
472 int inodes_per_group = EXT4_INODES_PER_GROUP(sb); 360 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
473 unsigned int freei, avefreei; 361 unsigned int freei, avefreei;
474 ext4_fsblk_t freeb, avefreeb; 362 ext4_fsblk_t freeb, avefreec;
475 unsigned int ndirs; 363 unsigned int ndirs;
476 int max_dirs, min_inodes; 364 int max_dirs, min_inodes;
477 ext4_grpblk_t min_blocks; 365 ext4_grpblk_t min_clusters;
478 ext4_group_t i, grp, g, ngroups; 366 ext4_group_t i, grp, g, ngroups;
479 struct ext4_group_desc *desc; 367 struct ext4_group_desc *desc;
480 struct orlov_stats stats; 368 struct orlov_stats stats;
@@ -490,9 +378,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
490 378
491 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); 379 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
492 avefreei = freei / ngroups; 380 avefreei = freei / ngroups;
493 freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 381 freeb = EXT4_C2B(sbi,
494 avefreeb = freeb; 382 percpu_counter_read_positive(&sbi->s_freeclusters_counter));
495 do_div(avefreeb, ngroups); 383 avefreec = freeb;
384 do_div(avefreec, ngroups);
496 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); 385 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
497 386
498 if (S_ISDIR(mode) && 387 if (S_ISDIR(mode) &&
@@ -518,7 +407,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
518 continue; 407 continue;
519 if (stats.free_inodes < avefreei) 408 if (stats.free_inodes < avefreei)
520 continue; 409 continue;
521 if (stats.free_blocks < avefreeb) 410 if (stats.free_clusters < avefreec)
522 continue; 411 continue;
523 grp = g; 412 grp = g;
524 ret = 0; 413 ret = 0;
@@ -556,7 +445,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
556 min_inodes = avefreei - inodes_per_group*flex_size / 4; 445 min_inodes = avefreei - inodes_per_group*flex_size / 4;
557 if (min_inodes < 1) 446 if (min_inodes < 1)
558 min_inodes = 1; 447 min_inodes = 1;
559 min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4; 448 min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;
560 449
561 /* 450 /*
562 * Start looking in the flex group where we last allocated an 451 * Start looking in the flex group where we last allocated an
@@ -575,7 +464,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
575 continue; 464 continue;
576 if (stats.free_inodes < min_inodes) 465 if (stats.free_inodes < min_inodes)
577 continue; 466 continue;
578 if (stats.free_blocks < min_blocks) 467 if (stats.free_clusters < min_clusters)
579 continue; 468 continue;
580 goto found_flex_bg; 469 goto found_flex_bg;
581 } 470 }
@@ -659,7 +548,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
659 *group = parent_group; 548 *group = parent_group;
660 desc = ext4_get_group_desc(sb, *group, NULL); 549 desc = ext4_get_group_desc(sb, *group, NULL);
661 if (desc && ext4_free_inodes_count(sb, desc) && 550 if (desc && ext4_free_inodes_count(sb, desc) &&
662 ext4_free_blks_count(sb, desc)) 551 ext4_free_group_clusters(sb, desc))
663 return 0; 552 return 0;
664 553
665 /* 554 /*
@@ -683,7 +572,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
683 *group -= ngroups; 572 *group -= ngroups;
684 desc = ext4_get_group_desc(sb, *group, NULL); 573 desc = ext4_get_group_desc(sb, *group, NULL);
685 if (desc && ext4_free_inodes_count(sb, desc) && 574 if (desc && ext4_free_inodes_count(sb, desc) &&
686 ext4_free_blks_count(sb, desc)) 575 ext4_free_group_clusters(sb, desc))
687 return 0; 576 return 0;
688 } 577 }
689 578
@@ -802,7 +691,7 @@ err_ret:
802 * group to find a free inode. 691 * group to find a free inode.
803 */ 692 */
804struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, 693struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
805 const struct qstr *qstr, __u32 goal) 694 const struct qstr *qstr, __u32 goal, uid_t *owner)
806{ 695{
807 struct super_block *sb; 696 struct super_block *sb;
808 struct buffer_head *inode_bitmap_bh = NULL; 697 struct buffer_head *inode_bitmap_bh = NULL;
@@ -816,8 +705,6 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
816 int ret2, err = 0; 705 int ret2, err = 0;
817 struct inode *ret; 706 struct inode *ret;
818 ext4_group_t i; 707 ext4_group_t i;
819 int free = 0;
820 static int once = 1;
821 ext4_group_t flex_group; 708 ext4_group_t flex_group;
822 709
823 /* Cannot create files in a deleted directory */ 710 /* Cannot create files in a deleted directory */
@@ -843,26 +730,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
843 goto got_group; 730 goto got_group;
844 } 731 }
845 732
846 if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { 733 if (S_ISDIR(mode))
847 ret2 = find_group_flex(sb, dir, &group); 734 ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
848 if (ret2 == -1) { 735 else
849 ret2 = find_group_other(sb, dir, &group, mode);
850 if (ret2 == 0 && once) {
851 once = 0;
852 printk(KERN_NOTICE "ext4: find_group_flex "
853 "failed, fallback succeeded dir %lu\n",
854 dir->i_ino);
855 }
856 }
857 goto got_group;
858 }
859
860 if (S_ISDIR(mode)) {
861 if (test_opt(sb, OLDALLOC))
862 ret2 = find_group_dir(sb, dir, &group);
863 else
864 ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
865 } else
866 ret2 = find_group_other(sb, dir, &group, mode); 736 ret2 = find_group_other(sb, dir, &group, mode);
867 737
868got_group: 738got_group:
@@ -950,26 +820,21 @@ got:
950 goto fail; 820 goto fail;
951 } 821 }
952 822
953 free = 0; 823 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
954 ext4_lock_group(sb, group); 824 err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
825 brelse(block_bitmap_bh);
826
955 /* recheck and clear flag under lock if we still need to */ 827 /* recheck and clear flag under lock if we still need to */
828 ext4_lock_group(sb, group);
956 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 829 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
957 free = ext4_free_blocks_after_init(sb, group, gdp);
958 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 830 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
959 ext4_free_blks_set(sb, gdp, free); 831 ext4_free_group_clusters_set(sb, gdp,
832 ext4_free_clusters_after_init(sb, group, gdp));
960 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, 833 gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
961 gdp); 834 gdp);
962 } 835 }
963 ext4_unlock_group(sb, group); 836 ext4_unlock_group(sb, group);
964 837
965 /* Don't need to dirty bitmap block if we didn't change it */
966 if (free) {
967 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
968 err = ext4_handle_dirty_metadata(handle,
969 NULL, block_bitmap_bh);
970 }
971
972 brelse(block_bitmap_bh);
973 if (err) 838 if (err)
974 goto fail; 839 goto fail;
975 } 840 }
@@ -987,8 +852,11 @@ got:
987 flex_group = ext4_flex_group(sbi, group); 852 flex_group = ext4_flex_group(sbi, group);
988 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); 853 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
989 } 854 }
990 855 if (owner) {
991 if (test_opt(sb, GRPID)) { 856 inode->i_mode = mode;
857 inode->i_uid = owner[0];
858 inode->i_gid = owner[1];
859 } else if (test_opt(sb, GRPID)) {
992 inode->i_mode = mode; 860 inode->i_mode = mode;
993 inode->i_uid = current_fsuid(); 861 inode->i_uid = current_fsuid();
994 inode->i_gid = dir->i_gid; 862 inode->i_gid = dir->i_gid;
@@ -1005,11 +873,7 @@ got:
1005 ei->i_dir_start_lookup = 0; 873 ei->i_dir_start_lookup = 0;
1006 ei->i_disksize = 0; 874 ei->i_disksize = 0;
1007 875
1008 /* 876 /* Don't inherit extent flag from directory, amongst others. */
1009 * Don't inherit extent flag from directory, amongst others. We set
1010 * extent flag on newly created directory and file only if -o extent
1011 * mount option is specified
1012 */
1013 ei->i_flags = 877 ei->i_flags =
1014 ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); 878 ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
1015 ei->i_file_acl = 0; 879 ei->i_file_acl = 0;
@@ -1084,7 +948,7 @@ fail_free_drop:
1084fail_drop: 948fail_drop:
1085 dquot_drop(inode); 949 dquot_drop(inode);
1086 inode->i_flags |= S_NOQUOTA; 950 inode->i_flags |= S_NOQUOTA;
1087 inode->i_nlink = 0; 951 clear_nlink(inode);
1088 unlock_new_inode(inode); 952 unlock_new_inode(inode);
1089 iput(inode); 953 iput(inode);
1090 brelse(inode_bitmap_bh); 954 brelse(inode_bitmap_bh);
@@ -1235,7 +1099,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
1235 * inode allocation from the current group, so we take alloc_sem lock, to 1099 * inode allocation from the current group, so we take alloc_sem lock, to
1236 * block ext4_claim_inode until we are finished. 1100 * block ext4_claim_inode until we are finished.
1237 */ 1101 */
1238extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, 1102int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
1239 int barrier) 1103 int barrier)
1240{ 1104{
1241 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 1105 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 0962642119c..3cfc73fbca8 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -699,6 +699,13 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
699 /* 699 /*
700 * Okay, we need to do block allocation. 700 * Okay, we need to do block allocation.
701 */ 701 */
702 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
703 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
704 EXT4_ERROR_INODE(inode, "Can't allocate blocks for "
705 "non-extent mapped inodes with bigalloc");
706 return -ENOSPC;
707 }
708
702 goal = ext4_find_goal(inode, map->m_lblk, partial); 709 goal = ext4_find_goal(inode, map->m_lblk, partial);
703 710
704 /* the number of blocks need to allocate for [d,t]indirect blocks */ 711 /* the number of blocks need to allocate for [d,t]indirect blocks */
@@ -1343,7 +1350,9 @@ void ext4_ind_truncate(struct inode *inode)
1343 __le32 nr = 0; 1350 __le32 nr = 0;
1344 int n = 0; 1351 int n = 0;
1345 ext4_lblk_t last_block, max_block; 1352 ext4_lblk_t last_block, max_block;
1353 loff_t page_len;
1346 unsigned blocksize = inode->i_sb->s_blocksize; 1354 unsigned blocksize = inode->i_sb->s_blocksize;
1355 int err;
1347 1356
1348 handle = start_transaction(inode); 1357 handle = start_transaction(inode);
1349 if (IS_ERR(handle)) 1358 if (IS_ERR(handle))
@@ -1354,9 +1363,16 @@ void ext4_ind_truncate(struct inode *inode)
1354 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) 1363 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
1355 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 1364 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
1356 1365
1357 if (inode->i_size & (blocksize - 1)) 1366 if (inode->i_size % PAGE_CACHE_SIZE != 0) {
1358 if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 1367 page_len = PAGE_CACHE_SIZE -
1368 (inode->i_size & (PAGE_CACHE_SIZE - 1));
1369
1370 err = ext4_discard_partial_page_buffers(handle,
1371 mapping, inode->i_size, page_len, 0);
1372
1373 if (err)
1359 goto out_stop; 1374 goto out_stop;
1375 }
1360 1376
1361 if (last_block != max_block) { 1377 if (last_block != max_block) {
1362 n = ext4_block_to_path(inode, last_block, offsets, NULL); 1378 n = ext4_block_to_path(inode, last_block, offsets, NULL);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 986e2388f03..240f6e2dc7e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -42,7 +42,6 @@
42#include "ext4_jbd2.h" 42#include "ext4_jbd2.h"
43#include "xattr.h" 43#include "xattr.h"
44#include "acl.h" 44#include "acl.h"
45#include "ext4_extents.h"
46#include "truncate.h" 45#include "truncate.h"
47 46
48#include <trace/events/ext4.h> 47#include <trace/events/ext4.h>
@@ -268,7 +267,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
268 struct ext4_inode_info *ei = EXT4_I(inode); 267 struct ext4_inode_info *ei = EXT4_I(inode);
269 268
270 spin_lock(&ei->i_block_reservation_lock); 269 spin_lock(&ei->i_block_reservation_lock);
271 trace_ext4_da_update_reserve_space(inode, used); 270 trace_ext4_da_update_reserve_space(inode, used, quota_claim);
272 if (unlikely(used > ei->i_reserved_data_blocks)) { 271 if (unlikely(used > ei->i_reserved_data_blocks)) {
273 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " 272 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
274 "with only %d reserved data blocks\n", 273 "with only %d reserved data blocks\n",
@@ -281,7 +280,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
281 /* Update per-inode reservations */ 280 /* Update per-inode reservations */
282 ei->i_reserved_data_blocks -= used; 281 ei->i_reserved_data_blocks -= used;
283 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; 282 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
284 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 283 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
285 used + ei->i_allocated_meta_blocks); 284 used + ei->i_allocated_meta_blocks);
286 ei->i_allocated_meta_blocks = 0; 285 ei->i_allocated_meta_blocks = 0;
287 286
@@ -291,7 +290,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
291 * only when we have written all of the delayed 290 * only when we have written all of the delayed
292 * allocation blocks. 291 * allocation blocks.
293 */ 292 */
294 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 293 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
295 ei->i_reserved_meta_blocks); 294 ei->i_reserved_meta_blocks);
296 ei->i_reserved_meta_blocks = 0; 295 ei->i_reserved_meta_blocks = 0;
297 ei->i_da_metadata_calc_len = 0; 296 ei->i_da_metadata_calc_len = 0;
@@ -300,14 +299,14 @@ void ext4_da_update_reserve_space(struct inode *inode,
300 299
301 /* Update quota subsystem for data blocks */ 300 /* Update quota subsystem for data blocks */
302 if (quota_claim) 301 if (quota_claim)
303 dquot_claim_block(inode, used); 302 dquot_claim_block(inode, EXT4_C2B(sbi, used));
304 else { 303 else {
305 /* 304 /*
306 * We did fallocate with an offset that is already delayed 305 * We did fallocate with an offset that is already delayed
307 * allocated. So on delayed allocated writeback we should 306 * allocated. So on delayed allocated writeback we should
308 * not re-claim the quota for fallocated blocks. 307 * not re-claim the quota for fallocated blocks.
309 */ 308 */
310 dquot_release_reservation_block(inode, used); 309 dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
311 } 310 }
312 311
313 /* 312 /*
@@ -399,6 +398,49 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
399} 398}
400 399
401/* 400/*
401 * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map.
402 */
403static void set_buffers_da_mapped(struct inode *inode,
404 struct ext4_map_blocks *map)
405{
406 struct address_space *mapping = inode->i_mapping;
407 struct pagevec pvec;
408 int i, nr_pages;
409 pgoff_t index, end;
410
411 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
412 end = (map->m_lblk + map->m_len - 1) >>
413 (PAGE_CACHE_SHIFT - inode->i_blkbits);
414
415 pagevec_init(&pvec, 0);
416 while (index <= end) {
417 nr_pages = pagevec_lookup(&pvec, mapping, index,
418 min(end - index + 1,
419 (pgoff_t)PAGEVEC_SIZE));
420 if (nr_pages == 0)
421 break;
422 for (i = 0; i < nr_pages; i++) {
423 struct page *page = pvec.pages[i];
424 struct buffer_head *bh, *head;
425
426 if (unlikely(page->mapping != mapping) ||
427 !PageDirty(page))
428 break;
429
430 if (page_has_buffers(page)) {
431 bh = head = page_buffers(page);
432 do {
433 set_buffer_da_mapped(bh);
434 bh = bh->b_this_page;
435 } while (bh != head);
436 }
437 index++;
438 }
439 pagevec_release(&pvec);
440 }
441}
442
443/*
402 * The ext4_map_blocks() function tries to look up the requested blocks, 444 * The ext4_map_blocks() function tries to look up the requested blocks,
403 * and returns if the blocks are already mapped. 445 * and returns if the blocks are already mapped.
404 * 446 *
@@ -416,7 +458,7 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
416 * the buffer head is mapped. 458 * the buffer head is mapped.
417 * 459 *
418 * It returns 0 if plain look up failed (blocks have not been allocated), in 460 * It returns 0 if plain look up failed (blocks have not been allocated), in
419 * that casem, buffer head is unmapped 461 * that case, buffer head is unmapped
420 * 462 *
421 * It returns the error in case of allocation failure. 463 * It returns the error in case of allocation failure.
422 */ 464 */
@@ -435,9 +477,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
435 */ 477 */
436 down_read((&EXT4_I(inode)->i_data_sem)); 478 down_read((&EXT4_I(inode)->i_data_sem));
437 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 479 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
438 retval = ext4_ext_map_blocks(handle, inode, map, 0); 480 retval = ext4_ext_map_blocks(handle, inode, map, flags &
481 EXT4_GET_BLOCKS_KEEP_SIZE);
439 } else { 482 } else {
440 retval = ext4_ind_map_blocks(handle, inode, map, 0); 483 retval = ext4_ind_map_blocks(handle, inode, map, flags &
484 EXT4_GET_BLOCKS_KEEP_SIZE);
441 } 485 }
442 up_read((&EXT4_I(inode)->i_data_sem)); 486 up_read((&EXT4_I(inode)->i_data_sem));
443 487
@@ -455,7 +499,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
455 * Returns if the blocks have already allocated 499 * Returns if the blocks have already allocated
456 * 500 *
457 * Note that if blocks have been preallocated 501 * Note that if blocks have been preallocated
458 * ext4_ext_get_block() returns th create = 0 502 * ext4_ext_get_block() returns the create = 0
459 * with buffer head unmapped. 503 * with buffer head unmapped.
460 */ 504 */
461 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) 505 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
@@ -517,9 +561,17 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
517 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) 561 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
518 ext4_da_update_reserve_space(inode, retval, 1); 562 ext4_da_update_reserve_space(inode, retval, 1);
519 } 563 }
520 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 564 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
521 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 565 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
522 566
567 /* If we have successfully mapped the delayed allocated blocks,
568 * set the BH_Da_Mapped bit on them. Its important to do this
569 * under the protection of i_data_sem.
570 */
571 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
572 set_buffers_da_mapped(inode, map);
573 }
574
523 up_write((&EXT4_I(inode)->i_data_sem)); 575 up_write((&EXT4_I(inode)->i_data_sem));
524 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 576 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
525 int ret = check_block_validity(inode, map); 577 int ret = check_block_validity(inode, map);
@@ -909,7 +961,11 @@ static int ext4_ordered_write_end(struct file *file,
909 ext4_orphan_add(handle, inode); 961 ext4_orphan_add(handle, inode);
910 if (ret2 < 0) 962 if (ret2 < 0)
911 ret = ret2; 963 ret = ret2;
964 } else {
965 unlock_page(page);
966 page_cache_release(page);
912 } 967 }
968
913 ret2 = ext4_journal_stop(handle); 969 ret2 = ext4_journal_stop(handle);
914 if (!ret) 970 if (!ret)
915 ret = ret2; 971 ret = ret2;
@@ -1037,14 +1093,14 @@ static int ext4_journalled_write_end(struct file *file,
1037} 1093}
1038 1094
1039/* 1095/*
1040 * Reserve a single block located at lblock 1096 * Reserve a single cluster located at lblock
1041 */ 1097 */
1042static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) 1098static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1043{ 1099{
1044 int retries = 0; 1100 int retries = 0;
1045 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1101 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1046 struct ext4_inode_info *ei = EXT4_I(inode); 1102 struct ext4_inode_info *ei = EXT4_I(inode);
1047 unsigned long md_needed; 1103 unsigned int md_needed;
1048 int ret; 1104 int ret;
1049 1105
1050 /* 1106 /*
@@ -1054,7 +1110,8 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1054 */ 1110 */
1055repeat: 1111repeat:
1056 spin_lock(&ei->i_block_reservation_lock); 1112 spin_lock(&ei->i_block_reservation_lock);
1057 md_needed = ext4_calc_metadata_amount(inode, lblock); 1113 md_needed = EXT4_NUM_B2C(sbi,
1114 ext4_calc_metadata_amount(inode, lblock));
1058 trace_ext4_da_reserve_space(inode, md_needed); 1115 trace_ext4_da_reserve_space(inode, md_needed);
1059 spin_unlock(&ei->i_block_reservation_lock); 1116 spin_unlock(&ei->i_block_reservation_lock);
1060 1117
@@ -1063,15 +1120,15 @@ repeat:
1063 * us from metadata over-estimation, though we may go over by 1120 * us from metadata over-estimation, though we may go over by
1064 * a small amount in the end. Here we just reserve for data. 1121 * a small amount in the end. Here we just reserve for data.
1065 */ 1122 */
1066 ret = dquot_reserve_block(inode, 1); 1123 ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
1067 if (ret) 1124 if (ret)
1068 return ret; 1125 return ret;
1069 /* 1126 /*
1070 * We do still charge estimated metadata to the sb though; 1127 * We do still charge estimated metadata to the sb though;
1071 * we cannot afford to run out of free blocks. 1128 * we cannot afford to run out of free blocks.
1072 */ 1129 */
1073 if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) { 1130 if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) {
1074 dquot_release_reservation_block(inode, 1); 1131 dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
1075 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1132 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1076 yield(); 1133 yield();
1077 goto repeat; 1134 goto repeat;
@@ -1118,19 +1175,21 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1118 * We can release all of the reserved metadata blocks 1175 * We can release all of the reserved metadata blocks
1119 * only when we have written all of the delayed 1176 * only when we have written all of the delayed
1120 * allocation blocks. 1177 * allocation blocks.
1178 * Note that in case of bigalloc, i_reserved_meta_blocks,
1179 * i_reserved_data_blocks, etc. refer to number of clusters.
1121 */ 1180 */
1122 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 1181 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
1123 ei->i_reserved_meta_blocks); 1182 ei->i_reserved_meta_blocks);
1124 ei->i_reserved_meta_blocks = 0; 1183 ei->i_reserved_meta_blocks = 0;
1125 ei->i_da_metadata_calc_len = 0; 1184 ei->i_da_metadata_calc_len = 0;
1126 } 1185 }
1127 1186
1128 /* update fs dirty data blocks counter */ 1187 /* update fs dirty data blocks counter */
1129 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); 1188 percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
1130 1189
1131 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1190 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1132 1191
1133 dquot_release_reservation_block(inode, to_free); 1192 dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
1134} 1193}
1135 1194
1136static void ext4_da_page_release_reservation(struct page *page, 1195static void ext4_da_page_release_reservation(struct page *page,
@@ -1139,6 +1198,9 @@ static void ext4_da_page_release_reservation(struct page *page,
1139 int to_release = 0; 1198 int to_release = 0;
1140 struct buffer_head *head, *bh; 1199 struct buffer_head *head, *bh;
1141 unsigned int curr_off = 0; 1200 unsigned int curr_off = 0;
1201 struct inode *inode = page->mapping->host;
1202 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1203 int num_clusters;
1142 1204
1143 head = page_buffers(page); 1205 head = page_buffers(page);
1144 bh = head; 1206 bh = head;
@@ -1148,10 +1210,24 @@ static void ext4_da_page_release_reservation(struct page *page,
1148 if ((offset <= curr_off) && (buffer_delay(bh))) { 1210 if ((offset <= curr_off) && (buffer_delay(bh))) {
1149 to_release++; 1211 to_release++;
1150 clear_buffer_delay(bh); 1212 clear_buffer_delay(bh);
1213 clear_buffer_da_mapped(bh);
1151 } 1214 }
1152 curr_off = next_off; 1215 curr_off = next_off;
1153 } while ((bh = bh->b_this_page) != head); 1216 } while ((bh = bh->b_this_page) != head);
1154 ext4_da_release_space(page->mapping->host, to_release); 1217
1218 /* If we have released all the blocks belonging to a cluster, then we
1219 * need to release the reserved space for that cluster. */
1220 num_clusters = EXT4_NUM_B2C(sbi, to_release);
1221 while (num_clusters > 0) {
1222 ext4_fsblk_t lblk;
1223 lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
1224 ((num_clusters - 1) << sbi->s_cluster_bits);
1225 if (sbi->s_cluster_ratio == 1 ||
1226 !ext4_find_delalloc_cluster(inode, lblk, 1))
1227 ext4_da_release_space(inode, 1);
1228
1229 num_clusters--;
1230 }
1155} 1231}
1156 1232
1157/* 1233/*
@@ -1253,6 +1329,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
1253 clear_buffer_delay(bh); 1329 clear_buffer_delay(bh);
1254 bh->b_blocknr = pblock; 1330 bh->b_blocknr = pblock;
1255 } 1331 }
1332 if (buffer_da_mapped(bh))
1333 clear_buffer_da_mapped(bh);
1256 if (buffer_unwritten(bh) || 1334 if (buffer_unwritten(bh) ||
1257 buffer_mapped(bh)) 1335 buffer_mapped(bh))
1258 BUG_ON(bh->b_blocknr != pblock); 1336 BUG_ON(bh->b_blocknr != pblock);
@@ -1346,12 +1424,15 @@ static void ext4_print_free_blocks(struct inode *inode)
1346{ 1424{
1347 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1425 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1348 printk(KERN_CRIT "Total free blocks count %lld\n", 1426 printk(KERN_CRIT "Total free blocks count %lld\n",
1349 ext4_count_free_blocks(inode->i_sb)); 1427 EXT4_C2B(EXT4_SB(inode->i_sb),
1428 ext4_count_free_clusters(inode->i_sb)));
1350 printk(KERN_CRIT "Free/Dirty block details\n"); 1429 printk(KERN_CRIT "Free/Dirty block details\n");
1351 printk(KERN_CRIT "free_blocks=%lld\n", 1430 printk(KERN_CRIT "free_blocks=%lld\n",
1352 (long long) percpu_counter_sum(&sbi->s_freeblocks_counter)); 1431 (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
1432 percpu_counter_sum(&sbi->s_freeclusters_counter)));
1353 printk(KERN_CRIT "dirty_blocks=%lld\n", 1433 printk(KERN_CRIT "dirty_blocks=%lld\n",
1354 (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 1434 (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
1435 percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
1355 printk(KERN_CRIT "Block reservation details\n"); 1436 printk(KERN_CRIT "Block reservation details\n");
1356 printk(KERN_CRIT "i_reserved_data_blocks=%u\n", 1437 printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
1357 EXT4_I(inode)->i_reserved_data_blocks); 1438 EXT4_I(inode)->i_reserved_data_blocks);
@@ -1430,8 +1511,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
1430 if (err == -EAGAIN) 1511 if (err == -EAGAIN)
1431 goto submit_io; 1512 goto submit_io;
1432 1513
1433 if (err == -ENOSPC && 1514 if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
1434 ext4_count_free_blocks(sb)) {
1435 mpd->retval = err; 1515 mpd->retval = err;
1436 goto submit_io; 1516 goto submit_io;
1437 } 1517 }
@@ -1471,13 +1551,15 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
1471 1551
1472 for (i = 0; i < map.m_len; i++) 1552 for (i = 0; i < map.m_len; i++)
1473 unmap_underlying_metadata(bdev, map.m_pblk + i); 1553 unmap_underlying_metadata(bdev, map.m_pblk + i);
1474 }
1475 1554
1476 if (ext4_should_order_data(mpd->inode)) { 1555 if (ext4_should_order_data(mpd->inode)) {
1477 err = ext4_jbd2_file_inode(handle, mpd->inode); 1556 err = ext4_jbd2_file_inode(handle, mpd->inode);
1478 if (err) 1557 if (err) {
1479 /* This only happens if the journal is aborted */ 1558 /* Only if the journal is aborted */
1480 return; 1559 mpd->retval = err;
1560 goto submit_io;
1561 }
1562 }
1481 } 1563 }
1482 1564
1483 /* 1565 /*
@@ -1584,6 +1666,66 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
1584} 1666}
1585 1667
1586/* 1668/*
1669 * This function is grabs code from the very beginning of
1670 * ext4_map_blocks, but assumes that the caller is from delayed write
1671 * time. This function looks up the requested blocks and sets the
1672 * buffer delay bit under the protection of i_data_sem.
1673 */
1674static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1675 struct ext4_map_blocks *map,
1676 struct buffer_head *bh)
1677{
1678 int retval;
1679 sector_t invalid_block = ~((sector_t) 0xffff);
1680
1681 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
1682 invalid_block = ~0;
1683
1684 map->m_flags = 0;
1685 ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
1686 "logical block %lu\n", inode->i_ino, map->m_len,
1687 (unsigned long) map->m_lblk);
1688 /*
1689 * Try to see if we can get the block without requesting a new
1690 * file system block.
1691 */
1692 down_read((&EXT4_I(inode)->i_data_sem));
1693 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1694 retval = ext4_ext_map_blocks(NULL, inode, map, 0);
1695 else
1696 retval = ext4_ind_map_blocks(NULL, inode, map, 0);
1697
1698 if (retval == 0) {
1699 /*
1700 * XXX: __block_prepare_write() unmaps passed block,
1701 * is it OK?
1702 */
1703 /* If the block was allocated from previously allocated cluster,
1704 * then we dont need to reserve it again. */
1705 if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
1706 retval = ext4_da_reserve_space(inode, iblock);
1707 if (retval)
1708 /* not enough space to reserve */
1709 goto out_unlock;
1710 }
1711
1712 /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
1713 * and it should not appear on the bh->b_state.
1714 */
1715 map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
1716
1717 map_bh(bh, inode->i_sb, invalid_block);
1718 set_buffer_new(bh);
1719 set_buffer_delay(bh);
1720 }
1721
1722out_unlock:
1723 up_read((&EXT4_I(inode)->i_data_sem));
1724
1725 return retval;
1726}
1727
1728/*
1587 * This is a special get_blocks_t callback which is used by 1729 * This is a special get_blocks_t callback which is used by
1588 * ext4_da_write_begin(). It will either return mapped block or 1730 * ext4_da_write_begin(). It will either return mapped block or
1589 * reserve space for a single block. 1731 * reserve space for a single block.
@@ -1600,10 +1742,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1600{ 1742{
1601 struct ext4_map_blocks map; 1743 struct ext4_map_blocks map;
1602 int ret = 0; 1744 int ret = 0;
1603 sector_t invalid_block = ~((sector_t) 0xffff);
1604
1605 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
1606 invalid_block = ~0;
1607 1745
1608 BUG_ON(create == 0); 1746 BUG_ON(create == 0);
1609 BUG_ON(bh->b_size != inode->i_sb->s_blocksize); 1747 BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
@@ -1616,25 +1754,9 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1616 * preallocated blocks are unmapped but should treated 1754 * preallocated blocks are unmapped but should treated
1617 * the same as allocated blocks. 1755 * the same as allocated blocks.
1618 */ 1756 */
1619 ret = ext4_map_blocks(NULL, inode, &map, 0); 1757 ret = ext4_da_map_blocks(inode, iblock, &map, bh);
1620 if (ret < 0) 1758 if (ret <= 0)
1621 return ret; 1759 return ret;
1622 if (ret == 0) {
1623 if (buffer_delay(bh))
1624 return 0; /* Not sure this could or should happen */
1625 /*
1626 * XXX: __block_write_begin() unmaps passed block, is it OK?
1627 */
1628 ret = ext4_da_reserve_space(inode, iblock);
1629 if (ret)
1630 /* not enough space to reserve */
1631 return ret;
1632
1633 map_bh(bh, inode->i_sb, invalid_block);
1634 set_buffer_new(bh);
1635 set_buffer_delay(bh);
1636 return 0;
1637 }
1638 1760
1639 map_bh(bh, inode->i_sb, map.m_pblk); 1761 map_bh(bh, inode->i_sb, map.m_pblk);
1640 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; 1762 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
@@ -1811,8 +1933,12 @@ static int ext4_writepage(struct page *page,
1811 * We don't want to do block allocation, so redirty 1933 * We don't want to do block allocation, so redirty
1812 * the page and return. We may reach here when we do 1934 * the page and return. We may reach here when we do
1813 * a journal commit via journal_submit_inode_data_buffers. 1935 * a journal commit via journal_submit_inode_data_buffers.
1814 * We can also reach here via shrink_page_list 1936 * We can also reach here via shrink_page_list but it
1937 * should never be for direct reclaim so warn if that
1938 * happens
1815 */ 1939 */
1940 WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
1941 PF_MEMALLOC);
1816 goto redirty_page; 1942 goto redirty_page;
1817 } 1943 }
1818 if (commit_write) 1944 if (commit_write)
@@ -2046,6 +2172,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2046 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2172 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2047 pgoff_t done_index = 0; 2173 pgoff_t done_index = 0;
2048 pgoff_t end; 2174 pgoff_t end;
2175 struct blk_plug plug;
2049 2176
2050 trace_ext4_da_writepages(inode, wbc); 2177 trace_ext4_da_writepages(inode, wbc);
2051 2178
@@ -2124,6 +2251,7 @@ retry:
2124 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2251 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2125 tag_pages_for_writeback(mapping, index, end); 2252 tag_pages_for_writeback(mapping, index, end);
2126 2253
2254 blk_start_plug(&plug);
2127 while (!ret && wbc->nr_to_write > 0) { 2255 while (!ret && wbc->nr_to_write > 0) {
2128 2256
2129 /* 2257 /*
@@ -2174,11 +2302,12 @@ retry:
2174 ret = 0; 2302 ret = 0;
2175 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 2303 } else if (ret == MPAGE_DA_EXTENT_TAIL) {
2176 /* 2304 /*
2177 * got one extent now try with 2305 * Got one extent now try with rest of the pages.
2178 * rest of the pages 2306 * If mpd.retval is set -EIO, journal is aborted.
2307 * So we don't need to write any more.
2179 */ 2308 */
2180 pages_written += mpd.pages_written; 2309 pages_written += mpd.pages_written;
2181 ret = 0; 2310 ret = mpd.retval;
2182 io_done = 1; 2311 io_done = 1;
2183 } else if (wbc->nr_to_write) 2312 } else if (wbc->nr_to_write)
2184 /* 2313 /*
@@ -2188,6 +2317,7 @@ retry:
2188 */ 2317 */
2189 break; 2318 break;
2190 } 2319 }
2320 blk_finish_plug(&plug);
2191 if (!io_done && !cycled) { 2321 if (!io_done && !cycled) {
2192 cycled = 1; 2322 cycled = 1;
2193 index = 0; 2323 index = 0;
@@ -2226,10 +2356,11 @@ static int ext4_nonda_switch(struct super_block *sb)
2226 * Delalloc need an accurate free block accounting. So switch 2356 * Delalloc need an accurate free block accounting. So switch
2227 * to non delalloc when we are near to error range. 2357 * to non delalloc when we are near to error range.
2228 */ 2358 */
2229 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 2359 free_blocks = EXT4_C2B(sbi,
2230 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); 2360 percpu_counter_read_positive(&sbi->s_freeclusters_counter));
2361 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
2231 if (2 * free_blocks < 3 * dirty_blocks || 2362 if (2 * free_blocks < 3 * dirty_blocks ||
2232 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { 2363 free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
2233 /* 2364 /*
2234 * free block count is less than 150% of dirty blocks 2365 * free block count is less than 150% of dirty blocks
2235 * or free blocks is less than watermark 2366 * or free blocks is less than watermark
@@ -2241,7 +2372,7 @@ static int ext4_nonda_switch(struct super_block *sb)
2241 * start pushing delalloc when 1/2 of free blocks are dirty. 2372 * start pushing delalloc when 1/2 of free blocks are dirty.
2242 */ 2373 */
2243 if (free_blocks < 2 * dirty_blocks) 2374 if (free_blocks < 2 * dirty_blocks)
2244 writeback_inodes_sb_if_idle(sb); 2375 writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
2245 2376
2246 return 0; 2377 return 0;
2247} 2378}
@@ -2255,6 +2386,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2255 pgoff_t index; 2386 pgoff_t index;
2256 struct inode *inode = mapping->host; 2387 struct inode *inode = mapping->host;
2257 handle_t *handle; 2388 handle_t *handle;
2389 loff_t page_len;
2258 2390
2259 index = pos >> PAGE_CACHE_SHIFT; 2391 index = pos >> PAGE_CACHE_SHIFT;
2260 2392
@@ -2301,6 +2433,13 @@ retry:
2301 */ 2433 */
2302 if (pos + len > inode->i_size) 2434 if (pos + len > inode->i_size)
2303 ext4_truncate_failed_write(inode); 2435 ext4_truncate_failed_write(inode);
2436 } else {
2437 page_len = pos & (PAGE_CACHE_SIZE - 1);
2438 if (page_len > 0) {
2439 ret = ext4_discard_partial_page_buffers_no_lock(handle,
2440 inode, page, pos - page_len, page_len,
2441 EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED);
2442 }
2304 } 2443 }
2305 2444
2306 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 2445 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -2343,6 +2482,7 @@ static int ext4_da_write_end(struct file *file,
2343 loff_t new_i_size; 2482 loff_t new_i_size;
2344 unsigned long start, end; 2483 unsigned long start, end;
2345 int write_mode = (int)(unsigned long)fsdata; 2484 int write_mode = (int)(unsigned long)fsdata;
2485 loff_t page_len;
2346 2486
2347 if (write_mode == FALL_BACK_TO_NONDELALLOC) { 2487 if (write_mode == FALL_BACK_TO_NONDELALLOC) {
2348 if (ext4_should_order_data(inode)) { 2488 if (ext4_should_order_data(inode)) {
@@ -2391,6 +2531,16 @@ static int ext4_da_write_end(struct file *file,
2391 } 2531 }
2392 ret2 = generic_write_end(file, mapping, pos, len, copied, 2532 ret2 = generic_write_end(file, mapping, pos, len, copied,
2393 page, fsdata); 2533 page, fsdata);
2534
2535 page_len = PAGE_CACHE_SIZE -
2536 ((pos + copied - 1) & (PAGE_CACHE_SIZE - 1));
2537
2538 if (page_len > 0) {
2539 ret = ext4_discard_partial_page_buffers_no_lock(handle,
2540 inode, page, pos + copied - 1, page_len,
2541 EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED);
2542 }
2543
2394 copied = ret2; 2544 copied = ret2;
2395 if (ret2 < 0) 2545 if (ret2 < 0)
2396 ret = ret2; 2546 ret = ret2;
@@ -2685,10 +2835,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
2685 * but being more careful is always safe for the future change. 2835 * but being more careful is always safe for the future change.
2686 */ 2836 */
2687 inode = io_end->inode; 2837 inode = io_end->inode;
2688 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 2838 ext4_set_io_unwritten_flag(inode, io_end);
2689 io_end->flag |= EXT4_IO_END_UNWRITTEN;
2690 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
2691 }
2692 2839
2693 /* Add the io_end to per-inode completed io list*/ 2840 /* Add the io_end to per-inode completed io list*/
2694 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); 2841 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
@@ -2854,6 +3001,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
2854 struct inode *inode = file->f_mapping->host; 3001 struct inode *inode = file->f_mapping->host;
2855 ssize_t ret; 3002 ssize_t ret;
2856 3003
3004 /*
3005 * If we are doing data journalling we don't support O_DIRECT
3006 */
3007 if (ext4_should_journal_data(inode))
3008 return 0;
3009
2857 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); 3010 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
2858 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3011 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
2859 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 3012 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -2923,6 +3076,7 @@ static const struct address_space_operations ext4_journalled_aops = {
2923 .bmap = ext4_bmap, 3076 .bmap = ext4_bmap,
2924 .invalidatepage = ext4_invalidatepage, 3077 .invalidatepage = ext4_invalidatepage,
2925 .releasepage = ext4_releasepage, 3078 .releasepage = ext4_releasepage,
3079 .direct_IO = ext4_direct_IO,
2926 .is_partially_uptodate = block_is_partially_uptodate, 3080 .is_partially_uptodate = block_is_partially_uptodate,
2927 .error_remove_page = generic_error_remove_page, 3081 .error_remove_page = generic_error_remove_page,
2928}; 3082};
@@ -2959,6 +3113,227 @@ void ext4_set_aops(struct inode *inode)
2959 inode->i_mapping->a_ops = &ext4_journalled_aops; 3113 inode->i_mapping->a_ops = &ext4_journalled_aops;
2960} 3114}
2961 3115
3116
3117/*
3118 * ext4_discard_partial_page_buffers()
3119 * Wrapper function for ext4_discard_partial_page_buffers_no_lock.
3120 * This function finds and locks the page containing the offset
3121 * "from" and passes it to ext4_discard_partial_page_buffers_no_lock.
3122 * Calling functions that already have the page locked should call
3123 * ext4_discard_partial_page_buffers_no_lock directly.
3124 */
3125int ext4_discard_partial_page_buffers(handle_t *handle,
3126 struct address_space *mapping, loff_t from,
3127 loff_t length, int flags)
3128{
3129 struct inode *inode = mapping->host;
3130 struct page *page;
3131 int err = 0;
3132
3133 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3134 mapping_gfp_mask(mapping) & ~__GFP_FS);
3135 if (!page)
3136 return -ENOMEM;
3137
3138 err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
3139 from, length, flags);
3140
3141 unlock_page(page);
3142 page_cache_release(page);
3143 return err;
3144}
3145
3146/*
3147 * ext4_discard_partial_page_buffers_no_lock()
3148 * Zeros a page range of length 'length' starting from offset 'from'.
3149 * Buffer heads that correspond to the block aligned regions of the
3150 * zeroed range will be unmapped. Unblock aligned regions
3151 * will have the corresponding buffer head mapped if needed so that
3152 * that region of the page can be updated with the partial zero out.
3153 *
3154 * This function assumes that the page has already been locked. The
3155 * The range to be discarded must be contained with in the given page.
3156 * If the specified range exceeds the end of the page it will be shortened
3157 * to the end of the page that corresponds to 'from'. This function is
3158 * appropriate for updating a page and it buffer heads to be unmapped and
3159 * zeroed for blocks that have been either released, or are going to be
3160 * released.
3161 *
3162 * handle: The journal handle
3163 * inode: The files inode
3164 * page: A locked page that contains the offset "from"
3165 * from: The starting byte offset (from the begining of the file)
3166 * to begin discarding
3167 * len: The length of bytes to discard
3168 * flags: Optional flags that may be used:
3169 *
3170 * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
3171 * Only zero the regions of the page whose buffer heads
3172 * have already been unmapped. This flag is appropriate
3173 * for updateing the contents of a page whose blocks may
3174 * have already been released, and we only want to zero
3175 * out the regions that correspond to those released blocks.
3176 *
3177 * Returns zero on sucess or negative on failure.
3178 */
3179int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
3180 struct inode *inode, struct page *page, loff_t from,
3181 loff_t length, int flags)
3182{
3183 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3184 unsigned int offset = from & (PAGE_CACHE_SIZE-1);
3185 unsigned int blocksize, max, pos;
3186 ext4_lblk_t iblock;
3187 struct buffer_head *bh;
3188 int err = 0;
3189
3190 blocksize = inode->i_sb->s_blocksize;
3191 max = PAGE_CACHE_SIZE - offset;
3192
3193 if (index != page->index)
3194 return -EINVAL;
3195
3196 /*
3197 * correct length if it does not fall between
3198 * 'from' and the end of the page
3199 */
3200 if (length > max || length < 0)
3201 length = max;
3202
3203 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3204
3205 if (!page_has_buffers(page)) {
3206 /*
3207 * If the range to be discarded covers a partial block
3208 * we need to get the page buffers. This is because
3209 * partial blocks cannot be released and the page needs
3210 * to be updated with the contents of the block before
3211 * we write the zeros on top of it.
3212 */
3213 if ((from & (blocksize - 1)) ||
3214 ((from + length) & (blocksize - 1))) {
3215 create_empty_buffers(page, blocksize, 0);
3216 } else {
3217 /*
3218 * If there are no partial blocks,
3219 * there is nothing to update,
3220 * so we can return now
3221 */
3222 return 0;
3223 }
3224 }
3225
3226 /* Find the buffer that contains "offset" */
3227 bh = page_buffers(page);
3228 pos = blocksize;
3229 while (offset >= pos) {
3230 bh = bh->b_this_page;
3231 iblock++;
3232 pos += blocksize;
3233 }
3234
3235 pos = offset;
3236 while (pos < offset + length) {
3237 unsigned int end_of_block, range_to_discard;
3238
3239 err = 0;
3240
3241 /* The length of space left to zero and unmap */
3242 range_to_discard = offset + length - pos;
3243
3244 /* The length of space until the end of the block */
3245 end_of_block = blocksize - (pos & (blocksize-1));
3246
3247 /*
3248 * Do not unmap or zero past end of block
3249 * for this buffer head
3250 */
3251 if (range_to_discard > end_of_block)
3252 range_to_discard = end_of_block;
3253
3254
3255 /*
3256 * Skip this buffer head if we are only zeroing unampped
3257 * regions of the page
3258 */
3259 if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
3260 buffer_mapped(bh))
3261 goto next;
3262
3263 /* If the range is block aligned, unmap */
3264 if (range_to_discard == blocksize) {
3265 clear_buffer_dirty(bh);
3266 bh->b_bdev = NULL;
3267 clear_buffer_mapped(bh);
3268 clear_buffer_req(bh);
3269 clear_buffer_new(bh);
3270 clear_buffer_delay(bh);
3271 clear_buffer_unwritten(bh);
3272 clear_buffer_uptodate(bh);
3273 zero_user(page, pos, range_to_discard);
3274 BUFFER_TRACE(bh, "Buffer discarded");
3275 goto next;
3276 }
3277
3278 /*
3279 * If this block is not completely contained in the range
3280 * to be discarded, then it is not going to be released. Because
3281 * we need to keep this block, we need to make sure this part
3282 * of the page is uptodate before we modify it by writeing
3283 * partial zeros on it.
3284 */
3285 if (!buffer_mapped(bh)) {
3286 /*
3287 * Buffer head must be mapped before we can read
3288 * from the block
3289 */
3290 BUFFER_TRACE(bh, "unmapped");
3291 ext4_get_block(inode, iblock, bh, 0);
3292 /* unmapped? It's a hole - nothing to do */
3293 if (!buffer_mapped(bh)) {
3294 BUFFER_TRACE(bh, "still unmapped");
3295 goto next;
3296 }
3297 }
3298
3299 /* Ok, it's mapped. Make sure it's up-to-date */
3300 if (PageUptodate(page))
3301 set_buffer_uptodate(bh);
3302
3303 if (!buffer_uptodate(bh)) {
3304 err = -EIO;
3305 ll_rw_block(READ, 1, &bh);
3306 wait_on_buffer(bh);
3307 /* Uhhuh. Read error. Complain and punt.*/
3308 if (!buffer_uptodate(bh))
3309 goto next;
3310 }
3311
3312 if (ext4_should_journal_data(inode)) {
3313 BUFFER_TRACE(bh, "get write access");
3314 err = ext4_journal_get_write_access(handle, bh);
3315 if (err)
3316 goto next;
3317 }
3318
3319 zero_user(page, pos, range_to_discard);
3320
3321 err = 0;
3322 if (ext4_should_journal_data(inode)) {
3323 err = ext4_handle_dirty_metadata(handle, inode, bh);
3324 } else
3325 mark_buffer_dirty(bh);
3326
3327 BUFFER_TRACE(bh, "Partial buffer zeroed");
3328next:
3329 bh = bh->b_this_page;
3330 iblock++;
3331 pos += range_to_discard;
3332 }
3333
3334 return err;
3335}
3336
2962/* 3337/*
2963 * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3338 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
2964 * up to the end of the block which corresponds to `from'. 3339 * up to the end of the block which corresponds to `from'.
@@ -3001,7 +3376,7 @@ int ext4_block_zero_page_range(handle_t *handle,
3001 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, 3376 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3002 mapping_gfp_mask(mapping) & ~__GFP_FS); 3377 mapping_gfp_mask(mapping) & ~__GFP_FS);
3003 if (!page) 3378 if (!page)
3004 return -EINVAL; 3379 return -ENOMEM;
3005 3380
3006 blocksize = inode->i_sb->s_blocksize; 3381 blocksize = inode->i_sb->s_blocksize;
3007 max = blocksize - (offset & (blocksize - 1)); 3382 max = blocksize - (offset & (blocksize - 1));
@@ -3070,11 +3445,8 @@ int ext4_block_zero_page_range(handle_t *handle,
3070 err = 0; 3445 err = 0;
3071 if (ext4_should_journal_data(inode)) { 3446 if (ext4_should_journal_data(inode)) {
3072 err = ext4_handle_dirty_metadata(handle, inode, bh); 3447 err = ext4_handle_dirty_metadata(handle, inode, bh);
3073 } else { 3448 } else
3074 if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
3075 err = ext4_jbd2_file_inode(handle, inode);
3076 mark_buffer_dirty(bh); 3449 mark_buffer_dirty(bh);
3077 }
3078 3450
3079unlock: 3451unlock:
3080 unlock_page(page); 3452 unlock_page(page);
@@ -3115,6 +3487,11 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3115 return -ENOTSUPP; 3487 return -ENOTSUPP;
3116 } 3488 }
3117 3489
3490 if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
3491 /* TODO: Add support for bigalloc file systems */
3492 return -ENOTSUPP;
3493 }
3494
3118 return ext4_ext_punch_hole(file, offset, length); 3495 return ext4_ext_punch_hole(file, offset, length);
3119} 3496}
3120 3497
@@ -3414,7 +3791,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3414 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 3791 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
3415 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 3792 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
3416 } 3793 }
3417 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 3794 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
3418 3795
3419 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ 3796 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
3420 ei->i_dir_start_lookup = 0; 3797 ei->i_dir_start_lookup = 0;
@@ -4416,6 +4793,7 @@ retry_alloc:
4416 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { 4793 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
4417 unlock_page(page); 4794 unlock_page(page);
4418 ret = VM_FAULT_SIGBUS; 4795 ret = VM_FAULT_SIGBUS;
4796 ext4_journal_stop(handle);
4419 goto out; 4797 goto out;
4420 } 4798 }
4421 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 4799 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index f18bfe37aff..a56796814d6 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -21,6 +21,7 @@
21long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 21long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
22{ 22{
23 struct inode *inode = filp->f_dentry->d_inode; 23 struct inode *inode = filp->f_dentry->d_inode;
24 struct super_block *sb = inode->i_sb;
24 struct ext4_inode_info *ei = EXT4_I(inode); 25 struct ext4_inode_info *ei = EXT4_I(inode);
25 unsigned int flags; 26 unsigned int flags;
26 27
@@ -173,33 +174,8 @@ setversion_out:
173 mnt_drop_write(filp->f_path.mnt); 174 mnt_drop_write(filp->f_path.mnt);
174 return err; 175 return err;
175 } 176 }
176#ifdef CONFIG_JBD2_DEBUG
177 case EXT4_IOC_WAIT_FOR_READONLY:
178 /*
179 * This is racy - by the time we're woken up and running,
180 * the superblock could be released. And the module could
181 * have been unloaded. So sue me.
182 *
183 * Returns 1 if it slept, else zero.
184 */
185 {
186 struct super_block *sb = inode->i_sb;
187 DECLARE_WAITQUEUE(wait, current);
188 int ret = 0;
189
190 set_current_state(TASK_INTERRUPTIBLE);
191 add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
192 if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) {
193 schedule();
194 ret = 1;
195 }
196 remove_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
197 return ret;
198 }
199#endif
200 case EXT4_IOC_GROUP_EXTEND: { 177 case EXT4_IOC_GROUP_EXTEND: {
201 ext4_fsblk_t n_blocks_count; 178 ext4_fsblk_t n_blocks_count;
202 struct super_block *sb = inode->i_sb;
203 int err, err2=0; 179 int err, err2=0;
204 180
205 err = ext4_resize_begin(sb); 181 err = ext4_resize_begin(sb);
@@ -209,6 +185,13 @@ setversion_out:
209 if (get_user(n_blocks_count, (__u32 __user *)arg)) 185 if (get_user(n_blocks_count, (__u32 __user *)arg))
210 return -EFAULT; 186 return -EFAULT;
211 187
188 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
189 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
190 ext4_msg(sb, KERN_ERR,
191 "Online resizing not supported with bigalloc");
192 return -EOPNOTSUPP;
193 }
194
212 err = mnt_want_write(filp->f_path.mnt); 195 err = mnt_want_write(filp->f_path.mnt);
213 if (err) 196 if (err)
214 return err; 197 return err;
@@ -250,6 +233,13 @@ setversion_out:
250 goto mext_out; 233 goto mext_out;
251 } 234 }
252 235
236 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
237 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
238 ext4_msg(sb, KERN_ERR,
239 "Online defrag not supported with bigalloc");
240 return -EOPNOTSUPP;
241 }
242
253 err = mnt_want_write(filp->f_path.mnt); 243 err = mnt_want_write(filp->f_path.mnt);
254 if (err) 244 if (err)
255 goto mext_out; 245 goto mext_out;
@@ -270,7 +260,6 @@ mext_out:
270 260
271 case EXT4_IOC_GROUP_ADD: { 261 case EXT4_IOC_GROUP_ADD: {
272 struct ext4_new_group_data input; 262 struct ext4_new_group_data input;
273 struct super_block *sb = inode->i_sb;
274 int err, err2=0; 263 int err, err2=0;
275 264
276 err = ext4_resize_begin(sb); 265 err = ext4_resize_begin(sb);
@@ -281,6 +270,13 @@ mext_out:
281 sizeof(input))) 270 sizeof(input)))
282 return -EFAULT; 271 return -EFAULT;
283 272
273 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
274 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
275 ext4_msg(sb, KERN_ERR,
276 "Online resizing not supported with bigalloc");
277 return -EOPNOTSUPP;
278 }
279
284 err = mnt_want_write(filp->f_path.mnt); 280 err = mnt_want_write(filp->f_path.mnt);
285 if (err) 281 if (err)
286 return err; 282 return err;
@@ -337,7 +333,6 @@ mext_out:
337 333
338 case FITRIM: 334 case FITRIM:
339 { 335 {
340 struct super_block *sb = inode->i_sb;
341 struct request_queue *q = bdev_get_queue(sb->s_bdev); 336 struct request_queue *q = bdev_get_queue(sb->s_bdev);
342 struct fstrim_range range; 337 struct fstrim_range range;
343 int ret = 0; 338 int ret = 0;
@@ -348,7 +343,14 @@ mext_out:
348 if (!blk_queue_discard(q)) 343 if (!blk_queue_discard(q))
349 return -EOPNOTSUPP; 344 return -EOPNOTSUPP;
350 345
351 if (copy_from_user(&range, (struct fstrim_range *)arg, 346 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
347 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
348 ext4_msg(sb, KERN_ERR,
349 "FITRIM not supported with bigalloc");
350 return -EOPNOTSUPP;
351 }
352
353 if (copy_from_user(&range, (struct fstrim_range __user *)arg,
352 sizeof(range))) 354 sizeof(range)))
353 return -EFAULT; 355 return -EFAULT;
354 356
@@ -358,7 +360,7 @@ mext_out:
358 if (ret < 0) 360 if (ret < 0)
359 return ret; 361 return ret;
360 362
361 if (copy_to_user((struct fstrim_range *)arg, &range, 363 if (copy_to_user((struct fstrim_range __user *)arg, &range,
362 sizeof(range))) 364 sizeof(range)))
363 return -EFAULT; 365 return -EFAULT;
364 366
@@ -396,11 +398,6 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
396 case EXT4_IOC32_SETVERSION_OLD: 398 case EXT4_IOC32_SETVERSION_OLD:
397 cmd = EXT4_IOC_SETVERSION_OLD; 399 cmd = EXT4_IOC_SETVERSION_OLD;
398 break; 400 break;
399#ifdef CONFIG_JBD2_DEBUG
400 case EXT4_IOC32_WAIT_FOR_READONLY:
401 cmd = EXT4_IOC_WAIT_FOR_READONLY;
402 break;
403#endif
404 case EXT4_IOC32_GETRSVSZ: 401 case EXT4_IOC32_GETRSVSZ:
405 cmd = EXT4_IOC_GETRSVSZ; 402 cmd = EXT4_IOC_GETRSVSZ;
406 break; 403 break;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 17a5a57c415..e2d8be8f28b 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -70,8 +70,8 @@
70 * 70 *
71 * pa_lstart -> the logical start block for this prealloc space 71 * pa_lstart -> the logical start block for this prealloc space
72 * pa_pstart -> the physical start block for this prealloc space 72 * pa_pstart -> the physical start block for this prealloc space
73 * pa_len -> length for this prealloc space 73 * pa_len -> length for this prealloc space (in clusters)
74 * pa_free -> free space available in this prealloc space 74 * pa_free -> free space available in this prealloc space (in clusters)
75 * 75 *
76 * The inode preallocation space is used looking at the _logical_ start 76 * The inode preallocation space is used looking at the _logical_ start
77 * block. If only the logical file block falls within the range of prealloc 77 * block. If only the logical file block falls within the range of prealloc
@@ -126,7 +126,8 @@
126 * list. In case of inode preallocation we follow a list of heuristics 126 * list. In case of inode preallocation we follow a list of heuristics
127 * based on file size. This can be found in ext4_mb_normalize_request. If 127 * based on file size. This can be found in ext4_mb_normalize_request. If
128 * we are doing a group prealloc we try to normalize the request to 128 * we are doing a group prealloc we try to normalize the request to
129 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is 129 * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is
130 * dependent on the cluster size; for non-bigalloc file systems, it is
130 * 512 blocks. This can be tuned via 131 * 512 blocks. This can be tuned via
131 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in 132 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
132 * terms of number of blocks. If we have mounted the file system with -O 133 * terms of number of blocks. If we have mounted the file system with -O
@@ -459,7 +460,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
459 ext4_fsblk_t blocknr; 460 ext4_fsblk_t blocknr;
460 461
461 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 462 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
462 blocknr += first + i; 463 blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
463 ext4_grp_locked_error(sb, e4b->bd_group, 464 ext4_grp_locked_error(sb, e4b->bd_group,
464 inode ? inode->i_ino : 0, 465 inode ? inode->i_ino : 0,
465 blocknr, 466 blocknr,
@@ -580,7 +581,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
580 continue; 581 continue;
581 } 582 }
582 583
583 /* both bits in buddy2 must be 0 */ 584 /* both bits in buddy2 must be 1 */
584 MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); 585 MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
585 MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); 586 MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
586 587
@@ -653,7 +654,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
653 ext4_grpblk_t chunk; 654 ext4_grpblk_t chunk;
654 unsigned short border; 655 unsigned short border;
655 656
656 BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); 657 BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
657 658
658 border = 2 << sb->s_blocksize_bits; 659 border = 2 << sb->s_blocksize_bits;
659 660
@@ -705,7 +706,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
705 void *buddy, void *bitmap, ext4_group_t group) 706 void *buddy, void *bitmap, ext4_group_t group)
706{ 707{
707 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 708 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
708 ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb); 709 ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
709 ext4_grpblk_t i = 0; 710 ext4_grpblk_t i = 0;
710 ext4_grpblk_t first; 711 ext4_grpblk_t first;
711 ext4_grpblk_t len; 712 ext4_grpblk_t len;
@@ -734,7 +735,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
734 735
735 if (free != grp->bb_free) { 736 if (free != grp->bb_free) {
736 ext4_grp_locked_error(sb, group, 0, 0, 737 ext4_grp_locked_error(sb, group, 0, 0,
737 "%u blocks in bitmap, %u in gd", 738 "%u clusters in bitmap, %u in gd",
738 free, grp->bb_free); 739 free, grp->bb_free);
739 /* 740 /*
740 * If we intent to continue, we consider group descritor 741 * If we intent to continue, we consider group descritor
@@ -1339,7 +1340,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1339 ext4_fsblk_t blocknr; 1340 ext4_fsblk_t blocknr;
1340 1341
1341 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 1342 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1342 blocknr += block; 1343 blocknr += EXT4_C2B(EXT4_SB(sb), block);
1343 ext4_grp_locked_error(sb, e4b->bd_group, 1344 ext4_grp_locked_error(sb, e4b->bd_group,
1344 inode ? inode->i_ino : 0, 1345 inode ? inode->i_ino : 0,
1345 blocknr, 1346 blocknr,
@@ -1390,7 +1391,6 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
1390{ 1391{
1391 int next = block; 1392 int next = block;
1392 int max; 1393 int max;
1393 int ord;
1394 void *buddy; 1394 void *buddy;
1395 1395
1396 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 1396 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
@@ -1432,9 +1432,8 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
1432 if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) 1432 if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
1433 break; 1433 break;
1434 1434
1435 ord = mb_find_order_for_block(e4b, next); 1435 order = mb_find_order_for_block(e4b, next);
1436 1436
1437 order = ord;
1438 block = next >> order; 1437 block = next >> order;
1439 ex->fe_len += 1 << order; 1438 ex->fe_len += 1 << order;
1440 } 1439 }
@@ -1624,8 +1623,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
1624 struct ext4_free_extent *gex = &ac->ac_g_ex; 1623 struct ext4_free_extent *gex = &ac->ac_g_ex;
1625 1624
1626 BUG_ON(ex->fe_len <= 0); 1625 BUG_ON(ex->fe_len <= 0);
1627 BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 1626 BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1628 BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 1627 BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1629 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); 1628 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
1630 1629
1631 ac->ac_found++; 1630 ac->ac_found++;
@@ -1823,15 +1822,15 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1823 1822
1824 while (free && ac->ac_status == AC_STATUS_CONTINUE) { 1823 while (free && ac->ac_status == AC_STATUS_CONTINUE) {
1825 i = mb_find_next_zero_bit(bitmap, 1824 i = mb_find_next_zero_bit(bitmap,
1826 EXT4_BLOCKS_PER_GROUP(sb), i); 1825 EXT4_CLUSTERS_PER_GROUP(sb), i);
1827 if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { 1826 if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
1828 /* 1827 /*
1829 * IF we have corrupt bitmap, we won't find any 1828 * IF we have corrupt bitmap, we won't find any
1830 * free blocks even though group info says we 1829 * free blocks even though group info says we
1831 * we have free blocks 1830 * we have free blocks
1832 */ 1831 */
1833 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, 1832 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1834 "%d free blocks as per " 1833 "%d free clusters as per "
1835 "group info. But bitmap says 0", 1834 "group info. But bitmap says 0",
1836 free); 1835 free);
1837 break; 1836 break;
@@ -1841,7 +1840,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1841 BUG_ON(ex.fe_len <= 0); 1840 BUG_ON(ex.fe_len <= 0);
1842 if (free < ex.fe_len) { 1841 if (free < ex.fe_len) {
1843 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, 1842 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1844 "%d free blocks as per " 1843 "%d free clusters as per "
1845 "group info. But got %d blocks", 1844 "group info. But got %d blocks",
1846 free, ex.fe_len); 1845 free, ex.fe_len);
1847 /* 1846 /*
@@ -1887,7 +1886,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1887 do_div(a, sbi->s_stripe); 1886 do_div(a, sbi->s_stripe);
1888 i = (a * sbi->s_stripe) - first_group_block; 1887 i = (a * sbi->s_stripe) - first_group_block;
1889 1888
1890 while (i < EXT4_BLOCKS_PER_GROUP(sb)) { 1889 while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
1891 if (!mb_test_bit(i, bitmap)) { 1890 if (!mb_test_bit(i, bitmap)) {
1892 max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); 1891 max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
1893 if (max >= sbi->s_stripe) { 1892 if (max >= sbi->s_stripe) {
@@ -2252,10 +2251,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2252 */ 2251 */
2253 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 2252 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2254 meta_group_info[i]->bb_free = 2253 meta_group_info[i]->bb_free =
2255 ext4_free_blocks_after_init(sb, group, desc); 2254 ext4_free_clusters_after_init(sb, group, desc);
2256 } else { 2255 } else {
2257 meta_group_info[i]->bb_free = 2256 meta_group_info[i]->bb_free =
2258 ext4_free_blks_count(sb, desc); 2257 ext4_free_group_clusters(sb, desc);
2259 } 2258 }
2260 2259
2261 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2260 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
@@ -2473,7 +2472,20 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2473 sbi->s_mb_stats = MB_DEFAULT_STATS; 2472 sbi->s_mb_stats = MB_DEFAULT_STATS;
2474 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 2473 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2475 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 2474 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2476 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2475 /*
2476 * The default group preallocation is 512, which for 4k block
2477 * sizes translates to 2 megabytes. However for bigalloc file
2478 * systems, this is probably too big (i.e, if the cluster size
2479 * is 1 megabyte, then group preallocation size becomes half a
2480 * gigabyte!). As a default, we will keep a two megabyte
2481 * group pralloc size for cluster sizes up to 64k, and after
2482 * that, we will force a minimum group preallocation size of
2483 * 32 clusters. This translates to 8 megs when the cluster
2484 * size is 256k, and 32 megs when the cluster size is 1 meg,
2485 * which seems reasonable as a default.
2486 */
2487 sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
2488 sbi->s_cluster_bits, 32);
2477 /* 2489 /*
2478 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc 2490 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
2479 * to the lowest multiple of s_stripe which is bigger than 2491 * to the lowest multiple of s_stripe which is bigger than
@@ -2490,7 +2502,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2490 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2502 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2491 if (sbi->s_locality_groups == NULL) { 2503 if (sbi->s_locality_groups == NULL) {
2492 ret = -ENOMEM; 2504 ret = -ENOMEM;
2493 goto out; 2505 goto out_free_groupinfo_slab;
2494 } 2506 }
2495 for_each_possible_cpu(i) { 2507 for_each_possible_cpu(i) {
2496 struct ext4_locality_group *lg; 2508 struct ext4_locality_group *lg;
@@ -2503,9 +2515,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2503 2515
2504 /* init file for buddy data */ 2516 /* init file for buddy data */
2505 ret = ext4_mb_init_backend(sb); 2517 ret = ext4_mb_init_backend(sb);
2506 if (ret != 0) { 2518 if (ret != 0)
2507 goto out; 2519 goto out_free_locality_groups;
2508 }
2509 2520
2510 if (sbi->s_proc) 2521 if (sbi->s_proc)
2511 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, 2522 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
@@ -2513,11 +2524,19 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2513 2524
2514 if (sbi->s_journal) 2525 if (sbi->s_journal)
2515 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2526 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2527
2528 return 0;
2529
2530out_free_locality_groups:
2531 free_percpu(sbi->s_locality_groups);
2532 sbi->s_locality_groups = NULL;
2533out_free_groupinfo_slab:
2534 ext4_groupinfo_destroy_slabs();
2516out: 2535out:
2517 if (ret) { 2536 kfree(sbi->s_mb_offsets);
2518 kfree(sbi->s_mb_offsets); 2537 sbi->s_mb_offsets = NULL;
2519 kfree(sbi->s_mb_maxs); 2538 kfree(sbi->s_mb_maxs);
2520 } 2539 sbi->s_mb_maxs = NULL;
2521 return ret; 2540 return ret;
2522} 2541}
2523 2542
@@ -2602,11 +2621,13 @@ int ext4_mb_release(struct super_block *sb)
2602} 2621}
2603 2622
2604static inline int ext4_issue_discard(struct super_block *sb, 2623static inline int ext4_issue_discard(struct super_block *sb,
2605 ext4_group_t block_group, ext4_grpblk_t block, int count) 2624 ext4_group_t block_group, ext4_grpblk_t cluster, int count)
2606{ 2625{
2607 ext4_fsblk_t discard_block; 2626 ext4_fsblk_t discard_block;
2608 2627
2609 discard_block = block + ext4_group_first_block_no(sb, block_group); 2628 discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
2629 ext4_group_first_block_no(sb, block_group));
2630 count = EXT4_C2B(EXT4_SB(sb), count);
2610 trace_ext4_discard_blocks(sb, 2631 trace_ext4_discard_blocks(sb,
2611 (unsigned long long) discard_block, count); 2632 (unsigned long long) discard_block, count);
2612 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); 2633 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
@@ -2633,7 +2654,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2633 2654
2634 if (test_opt(sb, DISCARD)) 2655 if (test_opt(sb, DISCARD))
2635 ext4_issue_discard(sb, entry->group, 2656 ext4_issue_discard(sb, entry->group,
2636 entry->start_blk, entry->count); 2657 entry->start_cluster, entry->count);
2637 2658
2638 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2659 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2639 /* we expect to find existing buddy because it's pinned */ 2660 /* we expect to find existing buddy because it's pinned */
@@ -2646,7 +2667,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2646 ext4_lock_group(sb, entry->group); 2667 ext4_lock_group(sb, entry->group);
2647 /* Take it out of per group rb tree */ 2668 /* Take it out of per group rb tree */
2648 rb_erase(&entry->node, &(db->bb_free_root)); 2669 rb_erase(&entry->node, &(db->bb_free_root));
2649 mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); 2670 mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count);
2650 2671
2651 /* 2672 /*
2652 * Clear the trimmed flag for the group so that the next 2673 * Clear the trimmed flag for the group so that the next
@@ -2752,7 +2773,7 @@ void ext4_exit_mballoc(void)
2752 */ 2773 */
2753static noinline_for_stack int 2774static noinline_for_stack int
2754ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 2775ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2755 handle_t *handle, unsigned int reserv_blks) 2776 handle_t *handle, unsigned int reserv_clstrs)
2756{ 2777{
2757 struct buffer_head *bitmap_bh = NULL; 2778 struct buffer_head *bitmap_bh = NULL;
2758 struct ext4_group_desc *gdp; 2779 struct ext4_group_desc *gdp;
@@ -2783,7 +2804,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2783 goto out_err; 2804 goto out_err;
2784 2805
2785 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, 2806 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
2786 ext4_free_blks_count(sb, gdp)); 2807 ext4_free_group_clusters(sb, gdp));
2787 2808
2788 err = ext4_journal_get_write_access(handle, gdp_bh); 2809 err = ext4_journal_get_write_access(handle, gdp_bh);
2789 if (err) 2810 if (err)
@@ -2791,7 +2812,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2791 2812
2792 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 2813 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
2793 2814
2794 len = ac->ac_b_ex.fe_len; 2815 len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
2795 if (!ext4_data_block_valid(sbi, block, len)) { 2816 if (!ext4_data_block_valid(sbi, block, len)) {
2796 ext4_error(sb, "Allocating blocks %llu-%llu which overlap " 2817 ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2797 "fs metadata\n", block, block+len); 2818 "fs metadata\n", block, block+len);
@@ -2823,28 +2844,29 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2823 ac->ac_b_ex.fe_len); 2844 ac->ac_b_ex.fe_len);
2824 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 2845 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2825 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 2846 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2826 ext4_free_blks_set(sb, gdp, 2847 ext4_free_group_clusters_set(sb, gdp,
2827 ext4_free_blocks_after_init(sb, 2848 ext4_free_clusters_after_init(sb,
2828 ac->ac_b_ex.fe_group, gdp)); 2849 ac->ac_b_ex.fe_group, gdp));
2829 } 2850 }
2830 len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; 2851 len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
2831 ext4_free_blks_set(sb, gdp, len); 2852 ext4_free_group_clusters_set(sb, gdp, len);
2832 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 2853 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
2833 2854
2834 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 2855 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2835 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); 2856 percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
2836 /* 2857 /*
2837 * Now reduce the dirty block count also. Should not go negative 2858 * Now reduce the dirty block count also. Should not go negative
2838 */ 2859 */
2839 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) 2860 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
2840 /* release all the reserved blocks if non delalloc */ 2861 /* release all the reserved blocks if non delalloc */
2841 percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); 2862 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
2863 reserv_clstrs);
2842 2864
2843 if (sbi->s_log_groups_per_flex) { 2865 if (sbi->s_log_groups_per_flex) {
2844 ext4_group_t flex_group = ext4_flex_group(sbi, 2866 ext4_group_t flex_group = ext4_flex_group(sbi,
2845 ac->ac_b_ex.fe_group); 2867 ac->ac_b_ex.fe_group);
2846 atomic_sub(ac->ac_b_ex.fe_len, 2868 atomic_sub(ac->ac_b_ex.fe_len,
2847 &sbi->s_flex_groups[flex_group].free_blocks); 2869 &sbi->s_flex_groups[flex_group].free_clusters);
2848 } 2870 }
2849 2871
2850 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 2872 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -2886,6 +2908,7 @@ static noinline_for_stack void
2886ext4_mb_normalize_request(struct ext4_allocation_context *ac, 2908ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2887 struct ext4_allocation_request *ar) 2909 struct ext4_allocation_request *ar)
2888{ 2910{
2911 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2889 int bsbits, max; 2912 int bsbits, max;
2890 ext4_lblk_t end; 2913 ext4_lblk_t end;
2891 loff_t size, orig_size, start_off; 2914 loff_t size, orig_size, start_off;
@@ -2916,7 +2939,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2916 2939
2917 /* first, let's learn actual file size 2940 /* first, let's learn actual file size
2918 * given current request is allocated */ 2941 * given current request is allocated */
2919 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 2942 size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
2920 size = size << bsbits; 2943 size = size << bsbits;
2921 if (size < i_size_read(ac->ac_inode)) 2944 if (size < i_size_read(ac->ac_inode))
2922 size = i_size_read(ac->ac_inode); 2945 size = i_size_read(ac->ac_inode);
@@ -2988,7 +3011,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2988 continue; 3011 continue;
2989 } 3012 }
2990 3013
2991 pa_end = pa->pa_lstart + pa->pa_len; 3014 pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3015 pa->pa_len);
2992 3016
2993 /* PA must not overlap original request */ 3017 /* PA must not overlap original request */
2994 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || 3018 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
@@ -3018,9 +3042,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3018 rcu_read_lock(); 3042 rcu_read_lock();
3019 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3043 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3020 ext4_lblk_t pa_end; 3044 ext4_lblk_t pa_end;
3045
3021 spin_lock(&pa->pa_lock); 3046 spin_lock(&pa->pa_lock);
3022 if (pa->pa_deleted == 0) { 3047 if (pa->pa_deleted == 0) {
3023 pa_end = pa->pa_lstart + pa->pa_len; 3048 pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3049 pa->pa_len);
3024 BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); 3050 BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
3025 } 3051 }
3026 spin_unlock(&pa->pa_lock); 3052 spin_unlock(&pa->pa_lock);
@@ -3036,14 +3062,14 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3036 } 3062 }
3037 BUG_ON(start + size <= ac->ac_o_ex.fe_logical && 3063 BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
3038 start > ac->ac_o_ex.fe_logical); 3064 start > ac->ac_o_ex.fe_logical);
3039 BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 3065 BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
3040 3066
3041 /* now prepare goal request */ 3067 /* now prepare goal request */
3042 3068
3043 /* XXX: is it better to align blocks WRT to logical 3069 /* XXX: is it better to align blocks WRT to logical
3044 * placement or satisfy big request as is */ 3070 * placement or satisfy big request as is */
3045 ac->ac_g_ex.fe_logical = start; 3071 ac->ac_g_ex.fe_logical = start;
3046 ac->ac_g_ex.fe_len = size; 3072 ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
3047 3073
3048 /* define goal start in order to merge */ 3074 /* define goal start in order to merge */
3049 if (ar->pright && (ar->lright == (start + size))) { 3075 if (ar->pright && (ar->lright == (start + size))) {
@@ -3112,14 +3138,16 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3112static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, 3138static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3113 struct ext4_prealloc_space *pa) 3139 struct ext4_prealloc_space *pa)
3114{ 3140{
3141 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3115 ext4_fsblk_t start; 3142 ext4_fsblk_t start;
3116 ext4_fsblk_t end; 3143 ext4_fsblk_t end;
3117 int len; 3144 int len;
3118 3145
3119 /* found preallocated blocks, use them */ 3146 /* found preallocated blocks, use them */
3120 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); 3147 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
3121 end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len); 3148 end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
3122 len = end - start; 3149 start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
3150 len = EXT4_NUM_B2C(sbi, end - start);
3123 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, 3151 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
3124 &ac->ac_b_ex.fe_start); 3152 &ac->ac_b_ex.fe_start);
3125 ac->ac_b_ex.fe_len = len; 3153 ac->ac_b_ex.fe_len = len;
@@ -3127,7 +3155,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3127 ac->ac_pa = pa; 3155 ac->ac_pa = pa;
3128 3156
3129 BUG_ON(start < pa->pa_pstart); 3157 BUG_ON(start < pa->pa_pstart);
3130 BUG_ON(start + len > pa->pa_pstart + pa->pa_len); 3158 BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
3131 BUG_ON(pa->pa_free < len); 3159 BUG_ON(pa->pa_free < len);
3132 pa->pa_free -= len; 3160 pa->pa_free -= len;
3133 3161
@@ -3193,6 +3221,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3193static noinline_for_stack int 3221static noinline_for_stack int
3194ext4_mb_use_preallocated(struct ext4_allocation_context *ac) 3222ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3195{ 3223{
3224 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3196 int order, i; 3225 int order, i;
3197 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 3226 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3198 struct ext4_locality_group *lg; 3227 struct ext4_locality_group *lg;
@@ -3210,12 +3239,14 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3210 /* all fields in this condition don't change, 3239 /* all fields in this condition don't change,
3211 * so we can skip locking for them */ 3240 * so we can skip locking for them */
3212 if (ac->ac_o_ex.fe_logical < pa->pa_lstart || 3241 if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
3213 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) 3242 ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
3243 EXT4_C2B(sbi, pa->pa_len)))
3214 continue; 3244 continue;
3215 3245
3216 /* non-extent files can't have physical blocks past 2^32 */ 3246 /* non-extent files can't have physical blocks past 2^32 */
3217 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && 3247 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
3218 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) 3248 (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
3249 EXT4_MAX_BLOCK_FILE_PHYS))
3219 continue; 3250 continue;
3220 3251
3221 /* found preallocated blocks, use them */ 3252 /* found preallocated blocks, use them */
@@ -3291,7 +3322,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3291 3322
3292 while (n) { 3323 while (n) {
3293 entry = rb_entry(n, struct ext4_free_data, node); 3324 entry = rb_entry(n, struct ext4_free_data, node);
3294 ext4_set_bits(bitmap, entry->start_blk, entry->count); 3325 ext4_set_bits(bitmap, entry->start_cluster, entry->count);
3295 n = rb_next(n); 3326 n = rb_next(n);
3296 } 3327 }
3297 return; 3328 return;
@@ -3312,7 +3343,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3312 ext4_group_t groupnr; 3343 ext4_group_t groupnr;
3313 ext4_grpblk_t start; 3344 ext4_grpblk_t start;
3314 int preallocated = 0; 3345 int preallocated = 0;
3315 int count = 0;
3316 int len; 3346 int len;
3317 3347
3318 /* all form of preallocation discards first load group, 3348 /* all form of preallocation discards first load group,
@@ -3335,7 +3365,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3335 BUG_ON(groupnr != group); 3365 BUG_ON(groupnr != group);
3336 ext4_set_bits(bitmap, start, len); 3366 ext4_set_bits(bitmap, start, len);
3337 preallocated += len; 3367 preallocated += len;
3338 count++;
3339 } 3368 }
3340 mb_debug(1, "prellocated %u for group %u\n", preallocated, group); 3369 mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
3341} 3370}
@@ -3412,6 +3441,7 @@ static noinline_for_stack int
3412ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) 3441ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3413{ 3442{
3414 struct super_block *sb = ac->ac_sb; 3443 struct super_block *sb = ac->ac_sb;
3444 struct ext4_sb_info *sbi = EXT4_SB(sb);
3415 struct ext4_prealloc_space *pa; 3445 struct ext4_prealloc_space *pa;
3416 struct ext4_group_info *grp; 3446 struct ext4_group_info *grp;
3417 struct ext4_inode_info *ei; 3447 struct ext4_inode_info *ei;
@@ -3443,16 +3473,18 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3443 winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; 3473 winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
3444 3474
3445 /* also, we should cover whole original request */ 3475 /* also, we should cover whole original request */
3446 wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len; 3476 wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
3447 3477
3448 /* the smallest one defines real window */ 3478 /* the smallest one defines real window */
3449 win = min(winl, wins); 3479 win = min(winl, wins);
3450 3480
3451 offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len; 3481 offs = ac->ac_o_ex.fe_logical %
3482 EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
3452 if (offs && offs < win) 3483 if (offs && offs < win)
3453 win = offs; 3484 win = offs;
3454 3485
3455 ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win; 3486 ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
3487 EXT4_B2C(sbi, win);
3456 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); 3488 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
3457 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); 3489 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
3458 } 3490 }
@@ -3477,7 +3509,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3477 trace_ext4_mb_new_inode_pa(ac, pa); 3509 trace_ext4_mb_new_inode_pa(ac, pa);
3478 3510
3479 ext4_mb_use_inode_pa(ac, pa); 3511 ext4_mb_use_inode_pa(ac, pa);
3480 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3512 atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
3481 3513
3482 ei = EXT4_I(ac->ac_inode); 3514 ei = EXT4_I(ac->ac_inode);
3483 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); 3515 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
@@ -3592,7 +3624,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3592 3624
3593 BUG_ON(pa->pa_deleted == 0); 3625 BUG_ON(pa->pa_deleted == 0);
3594 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3626 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3595 grp_blk_start = pa->pa_pstart - bit; 3627 grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
3596 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3628 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3597 end = bit + pa->pa_len; 3629 end = bit + pa->pa_len;
3598 3630
@@ -3607,7 +3639,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3607 free += next - bit; 3639 free += next - bit;
3608 3640
3609 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); 3641 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
3610 trace_ext4_mb_release_inode_pa(pa, grp_blk_start + bit, 3642 trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
3643 EXT4_C2B(sbi, bit)),
3611 next - bit); 3644 next - bit);
3612 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3645 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3613 bit = next + 1; 3646 bit = next + 1;
@@ -3690,7 +3723,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3690 } 3723 }
3691 3724
3692 if (needed == 0) 3725 if (needed == 0)
3693 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; 3726 needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
3694 3727
3695 INIT_LIST_HEAD(&list); 3728 INIT_LIST_HEAD(&list);
3696repeat: 3729repeat:
@@ -3958,7 +3991,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
3958 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 3991 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
3959 return; 3992 return;
3960 3993
3961 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 3994 size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
3962 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) 3995 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
3963 >> bsbits; 3996 >> bsbits;
3964 3997
@@ -3969,6 +4002,11 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
3969 return; 4002 return;
3970 } 4003 }
3971 4004
4005 if (sbi->s_mb_group_prealloc <= 0) {
4006 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4007 return;
4008 }
4009
3972 /* don't use group allocation for large files */ 4010 /* don't use group allocation for large files */
3973 size = max(size, isize); 4011 size = max(size, isize);
3974 if (size > sbi->s_mb_stream_request) { 4012 if (size > sbi->s_mb_stream_request) {
@@ -4007,8 +4045,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4007 len = ar->len; 4045 len = ar->len;
4008 4046
4009 /* just a dirty hack to filter too big requests */ 4047 /* just a dirty hack to filter too big requests */
4010 if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10) 4048 if (len >= EXT4_CLUSTERS_PER_GROUP(sb) - 10)
4011 len = EXT4_BLOCKS_PER_GROUP(sb) - 10; 4049 len = EXT4_CLUSTERS_PER_GROUP(sb) - 10;
4012 4050
4013 /* start searching from the goal */ 4051 /* start searching from the goal */
4014 goal = ar->goal; 4052 goal = ar->goal;
@@ -4019,18 +4057,15 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4019 4057
4020 /* set up allocation goals */ 4058 /* set up allocation goals */
4021 memset(ac, 0, sizeof(struct ext4_allocation_context)); 4059 memset(ac, 0, sizeof(struct ext4_allocation_context));
4022 ac->ac_b_ex.fe_logical = ar->logical; 4060 ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
4023 ac->ac_status = AC_STATUS_CONTINUE; 4061 ac->ac_status = AC_STATUS_CONTINUE;
4024 ac->ac_sb = sb; 4062 ac->ac_sb = sb;
4025 ac->ac_inode = ar->inode; 4063 ac->ac_inode = ar->inode;
4026 ac->ac_o_ex.fe_logical = ar->logical; 4064 ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
4027 ac->ac_o_ex.fe_group = group; 4065 ac->ac_o_ex.fe_group = group;
4028 ac->ac_o_ex.fe_start = block; 4066 ac->ac_o_ex.fe_start = block;
4029 ac->ac_o_ex.fe_len = len; 4067 ac->ac_o_ex.fe_len = len;
4030 ac->ac_g_ex.fe_logical = ar->logical; 4068 ac->ac_g_ex = ac->ac_o_ex;
4031 ac->ac_g_ex.fe_group = group;
4032 ac->ac_g_ex.fe_start = block;
4033 ac->ac_g_ex.fe_len = len;
4034 ac->ac_flags = ar->flags; 4069 ac->ac_flags = ar->flags;
4035 4070
4036 /* we have to define context: we'll we work with a file or 4071 /* we have to define context: we'll we work with a file or
@@ -4182,13 +4217,14 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
4182 */ 4217 */
4183static int ext4_mb_release_context(struct ext4_allocation_context *ac) 4218static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4184{ 4219{
4220 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4185 struct ext4_prealloc_space *pa = ac->ac_pa; 4221 struct ext4_prealloc_space *pa = ac->ac_pa;
4186 if (pa) { 4222 if (pa) {
4187 if (pa->pa_type == MB_GROUP_PA) { 4223 if (pa->pa_type == MB_GROUP_PA) {
4188 /* see comment in ext4_mb_use_group_pa() */ 4224 /* see comment in ext4_mb_use_group_pa() */
4189 spin_lock(&pa->pa_lock); 4225 spin_lock(&pa->pa_lock);
4190 pa->pa_pstart += ac->ac_b_ex.fe_len; 4226 pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4191 pa->pa_lstart += ac->ac_b_ex.fe_len; 4227 pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4192 pa->pa_free -= ac->ac_b_ex.fe_len; 4228 pa->pa_free -= ac->ac_b_ex.fe_len;
4193 pa->pa_len -= ac->ac_b_ex.fe_len; 4229 pa->pa_len -= ac->ac_b_ex.fe_len;
4194 spin_unlock(&pa->pa_lock); 4230 spin_unlock(&pa->pa_lock);
@@ -4249,13 +4285,17 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4249 struct super_block *sb; 4285 struct super_block *sb;
4250 ext4_fsblk_t block = 0; 4286 ext4_fsblk_t block = 0;
4251 unsigned int inquota = 0; 4287 unsigned int inquota = 0;
4252 unsigned int reserv_blks = 0; 4288 unsigned int reserv_clstrs = 0;
4253 4289
4254 sb = ar->inode->i_sb; 4290 sb = ar->inode->i_sb;
4255 sbi = EXT4_SB(sb); 4291 sbi = EXT4_SB(sb);
4256 4292
4257 trace_ext4_request_blocks(ar); 4293 trace_ext4_request_blocks(ar);
4258 4294
4295 /* Allow to use superuser reservation for quota file */
4296 if (IS_NOQUOTA(ar->inode))
4297 ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
4298
4259 /* 4299 /*
4260 * For delayed allocation, we could skip the ENOSPC and 4300 * For delayed allocation, we could skip the ENOSPC and
4261 * EDQUOT check, as blocks and quotas have been already 4301 * EDQUOT check, as blocks and quotas have been already
@@ -4269,7 +4309,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4269 * and verify allocation doesn't exceed the quota limits. 4309 * and verify allocation doesn't exceed the quota limits.
4270 */ 4310 */
4271 while (ar->len && 4311 while (ar->len &&
4272 ext4_claim_free_blocks(sbi, ar->len, ar->flags)) { 4312 ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
4273 4313
4274 /* let others to free the space */ 4314 /* let others to free the space */
4275 yield(); 4315 yield();
@@ -4279,12 +4319,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4279 *errp = -ENOSPC; 4319 *errp = -ENOSPC;
4280 return 0; 4320 return 0;
4281 } 4321 }
4282 reserv_blks = ar->len; 4322 reserv_clstrs = ar->len;
4283 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { 4323 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
4284 dquot_alloc_block_nofail(ar->inode, ar->len); 4324 dquot_alloc_block_nofail(ar->inode,
4325 EXT4_C2B(sbi, ar->len));
4285 } else { 4326 } else {
4286 while (ar->len && 4327 while (ar->len &&
4287 dquot_alloc_block(ar->inode, ar->len)) { 4328 dquot_alloc_block(ar->inode,
4329 EXT4_C2B(sbi, ar->len))) {
4288 4330
4289 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4331 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4290 ar->len--; 4332 ar->len--;
@@ -4328,7 +4370,7 @@ repeat:
4328 ext4_mb_new_preallocation(ac); 4370 ext4_mb_new_preallocation(ac);
4329 } 4371 }
4330 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4372 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4331 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); 4373 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
4332 if (*errp == -EAGAIN) { 4374 if (*errp == -EAGAIN) {
4333 /* 4375 /*
4334 * drop the reference that we took 4376 * drop the reference that we took
@@ -4364,13 +4406,13 @@ out:
4364 if (ac) 4406 if (ac)
4365 kmem_cache_free(ext4_ac_cachep, ac); 4407 kmem_cache_free(ext4_ac_cachep, ac);
4366 if (inquota && ar->len < inquota) 4408 if (inquota && ar->len < inquota)
4367 dquot_free_block(ar->inode, inquota - ar->len); 4409 dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
4368 if (!ar->len) { 4410 if (!ar->len) {
4369 if (!ext4_test_inode_state(ar->inode, 4411 if (!ext4_test_inode_state(ar->inode,
4370 EXT4_STATE_DELALLOC_RESERVED)) 4412 EXT4_STATE_DELALLOC_RESERVED))
4371 /* release all the reserved blocks if non delalloc */ 4413 /* release all the reserved blocks if non delalloc */
4372 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 4414 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
4373 reserv_blks); 4415 reserv_clstrs);
4374 } 4416 }
4375 4417
4376 trace_ext4_allocate_blocks(ar, (unsigned long long)block); 4418 trace_ext4_allocate_blocks(ar, (unsigned long long)block);
@@ -4388,7 +4430,7 @@ static int can_merge(struct ext4_free_data *entry1,
4388{ 4430{
4389 if ((entry1->t_tid == entry2->t_tid) && 4431 if ((entry1->t_tid == entry2->t_tid) &&
4390 (entry1->group == entry2->group) && 4432 (entry1->group == entry2->group) &&
4391 ((entry1->start_blk + entry1->count) == entry2->start_blk)) 4433 ((entry1->start_cluster + entry1->count) == entry2->start_cluster))
4392 return 1; 4434 return 1;
4393 return 0; 4435 return 0;
4394} 4436}
@@ -4398,7 +4440,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4398 struct ext4_free_data *new_entry) 4440 struct ext4_free_data *new_entry)
4399{ 4441{
4400 ext4_group_t group = e4b->bd_group; 4442 ext4_group_t group = e4b->bd_group;
4401 ext4_grpblk_t block; 4443 ext4_grpblk_t cluster;
4402 struct ext4_free_data *entry; 4444 struct ext4_free_data *entry;
4403 struct ext4_group_info *db = e4b->bd_info; 4445 struct ext4_group_info *db = e4b->bd_info;
4404 struct super_block *sb = e4b->bd_sb; 4446 struct super_block *sb = e4b->bd_sb;
@@ -4411,7 +4453,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4411 BUG_ON(e4b->bd_buddy_page == NULL); 4453 BUG_ON(e4b->bd_buddy_page == NULL);
4412 4454
4413 new_node = &new_entry->node; 4455 new_node = &new_entry->node;
4414 block = new_entry->start_blk; 4456 cluster = new_entry->start_cluster;
4415 4457
4416 if (!*n) { 4458 if (!*n) {
4417 /* first free block exent. We need to 4459 /* first free block exent. We need to
@@ -4425,13 +4467,14 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4425 while (*n) { 4467 while (*n) {
4426 parent = *n; 4468 parent = *n;
4427 entry = rb_entry(parent, struct ext4_free_data, node); 4469 entry = rb_entry(parent, struct ext4_free_data, node);
4428 if (block < entry->start_blk) 4470 if (cluster < entry->start_cluster)
4429 n = &(*n)->rb_left; 4471 n = &(*n)->rb_left;
4430 else if (block >= (entry->start_blk + entry->count)) 4472 else if (cluster >= (entry->start_cluster + entry->count))
4431 n = &(*n)->rb_right; 4473 n = &(*n)->rb_right;
4432 else { 4474 else {
4433 ext4_grp_locked_error(sb, group, 0, 4475 ext4_grp_locked_error(sb, group, 0,
4434 ext4_group_first_block_no(sb, group) + block, 4476 ext4_group_first_block_no(sb, group) +
4477 EXT4_C2B(sbi, cluster),
4435 "Block already on to-be-freed list"); 4478 "Block already on to-be-freed list");
4436 return 0; 4479 return 0;
4437 } 4480 }
@@ -4445,7 +4488,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4445 if (node) { 4488 if (node) {
4446 entry = rb_entry(node, struct ext4_free_data, node); 4489 entry = rb_entry(node, struct ext4_free_data, node);
4447 if (can_merge(entry, new_entry)) { 4490 if (can_merge(entry, new_entry)) {
4448 new_entry->start_blk = entry->start_blk; 4491 new_entry->start_cluster = entry->start_cluster;
4449 new_entry->count += entry->count; 4492 new_entry->count += entry->count;
4450 rb_erase(node, &(db->bb_free_root)); 4493 rb_erase(node, &(db->bb_free_root));
4451 spin_lock(&sbi->s_md_lock); 4494 spin_lock(&sbi->s_md_lock);
@@ -4496,6 +4539,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4496 ext4_group_t block_group; 4539 ext4_group_t block_group;
4497 struct ext4_sb_info *sbi; 4540 struct ext4_sb_info *sbi;
4498 struct ext4_buddy e4b; 4541 struct ext4_buddy e4b;
4542 unsigned int count_clusters;
4499 int err = 0; 4543 int err = 0;
4500 int ret; 4544 int ret;
4501 4545
@@ -4544,6 +4588,38 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4544 if (!ext4_should_writeback_data(inode)) 4588 if (!ext4_should_writeback_data(inode))
4545 flags |= EXT4_FREE_BLOCKS_METADATA; 4589 flags |= EXT4_FREE_BLOCKS_METADATA;
4546 4590
4591 /*
4592 * If the extent to be freed does not begin on a cluster
4593 * boundary, we need to deal with partial clusters at the
4594 * beginning and end of the extent. Normally we will free
4595 * blocks at the beginning or the end unless we are explicitly
4596 * requested to avoid doing so.
4597 */
4598 overflow = block & (sbi->s_cluster_ratio - 1);
4599 if (overflow) {
4600 if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
4601 overflow = sbi->s_cluster_ratio - overflow;
4602 block += overflow;
4603 if (count > overflow)
4604 count -= overflow;
4605 else
4606 return;
4607 } else {
4608 block -= overflow;
4609 count += overflow;
4610 }
4611 }
4612 overflow = count & (sbi->s_cluster_ratio - 1);
4613 if (overflow) {
4614 if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
4615 if (count > overflow)
4616 count -= overflow;
4617 else
4618 return;
4619 } else
4620 count += sbi->s_cluster_ratio - overflow;
4621 }
4622
4547do_more: 4623do_more:
4548 overflow = 0; 4624 overflow = 0;
4549 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 4625 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -4552,10 +4628,12 @@ do_more:
4552 * Check to see if we are freeing blocks across a group 4628 * Check to see if we are freeing blocks across a group
4553 * boundary. 4629 * boundary.
4554 */ 4630 */
4555 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { 4631 if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4556 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); 4632 overflow = EXT4_C2B(sbi, bit) + count -
4633 EXT4_BLOCKS_PER_GROUP(sb);
4557 count -= overflow; 4634 count -= overflow;
4558 } 4635 }
4636 count_clusters = EXT4_B2C(sbi, count);
4559 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 4637 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4560 if (!bitmap_bh) { 4638 if (!bitmap_bh) {
4561 err = -EIO; 4639 err = -EIO;
@@ -4570,9 +4648,9 @@ do_more:
4570 if (in_range(ext4_block_bitmap(sb, gdp), block, count) || 4648 if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
4571 in_range(ext4_inode_bitmap(sb, gdp), block, count) || 4649 in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
4572 in_range(block, ext4_inode_table(sb, gdp), 4650 in_range(block, ext4_inode_table(sb, gdp),
4573 EXT4_SB(sb)->s_itb_per_group) || 4651 EXT4_SB(sb)->s_itb_per_group) ||
4574 in_range(block + count - 1, ext4_inode_table(sb, gdp), 4652 in_range(block + count - 1, ext4_inode_table(sb, gdp),
4575 EXT4_SB(sb)->s_itb_per_group)) { 4653 EXT4_SB(sb)->s_itb_per_group)) {
4576 4654
4577 ext4_error(sb, "Freeing blocks in system zone - " 4655 ext4_error(sb, "Freeing blocks in system zone - "
4578 "Block = %llu, count = %lu", block, count); 4656 "Block = %llu, count = %lu", block, count);
@@ -4597,11 +4675,11 @@ do_more:
4597#ifdef AGGRESSIVE_CHECK 4675#ifdef AGGRESSIVE_CHECK
4598 { 4676 {
4599 int i; 4677 int i;
4600 for (i = 0; i < count; i++) 4678 for (i = 0; i < count_clusters; i++)
4601 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 4679 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4602 } 4680 }
4603#endif 4681#endif
4604 trace_ext4_mballoc_free(sb, inode, block_group, bit, count); 4682 trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
4605 4683
4606 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4684 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4607 if (err) 4685 if (err)
@@ -4618,13 +4696,13 @@ do_more:
4618 err = -ENOMEM; 4696 err = -ENOMEM;
4619 goto error_return; 4697 goto error_return;
4620 } 4698 }
4621 new_entry->start_blk = bit; 4699 new_entry->start_cluster = bit;
4622 new_entry->group = block_group; 4700 new_entry->group = block_group;
4623 new_entry->count = count; 4701 new_entry->count = count_clusters;
4624 new_entry->t_tid = handle->h_transaction->t_tid; 4702 new_entry->t_tid = handle->h_transaction->t_tid;
4625 4703
4626 ext4_lock_group(sb, block_group); 4704 ext4_lock_group(sb, block_group);
4627 mb_clear_bits(bitmap_bh->b_data, bit, count); 4705 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4628 ext4_mb_free_metadata(handle, &e4b, new_entry); 4706 ext4_mb_free_metadata(handle, &e4b, new_entry);
4629 } else { 4707 } else {
4630 /* need to update group_info->bb_free and bitmap 4708 /* need to update group_info->bb_free and bitmap
@@ -4632,25 +4710,29 @@ do_more:
4632 * them with group lock_held 4710 * them with group lock_held
4633 */ 4711 */
4634 ext4_lock_group(sb, block_group); 4712 ext4_lock_group(sb, block_group);
4635 mb_clear_bits(bitmap_bh->b_data, bit, count); 4713 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4636 mb_free_blocks(inode, &e4b, bit, count); 4714 mb_free_blocks(inode, &e4b, bit, count_clusters);
4637 } 4715 }
4638 4716
4639 ret = ext4_free_blks_count(sb, gdp) + count; 4717 ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
4640 ext4_free_blks_set(sb, gdp, ret); 4718 ext4_free_group_clusters_set(sb, gdp, ret);
4641 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); 4719 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
4642 ext4_unlock_group(sb, block_group); 4720 ext4_unlock_group(sb, block_group);
4643 percpu_counter_add(&sbi->s_freeblocks_counter, count); 4721 percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
4644 4722
4645 if (sbi->s_log_groups_per_flex) { 4723 if (sbi->s_log_groups_per_flex) {
4646 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 4724 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4647 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); 4725 atomic_add(count_clusters,
4726 &sbi->s_flex_groups[flex_group].free_clusters);
4648 } 4727 }
4649 4728
4650 ext4_mb_unload_buddy(&e4b); 4729 ext4_mb_unload_buddy(&e4b);
4651 4730
4652 freed += count; 4731 freed += count;
4653 4732
4733 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4734 dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
4735
4654 /* We dirtied the bitmap block */ 4736 /* We dirtied the bitmap block */
4655 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 4737 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4656 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 4738 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -4669,8 +4751,6 @@ do_more:
4669 } 4751 }
4670 ext4_mark_super_dirty(sb); 4752 ext4_mark_super_dirty(sb);
4671error_return: 4753error_return:
4672 if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4673 dquot_free_block(inode, freed);
4674 brelse(bitmap_bh); 4754 brelse(bitmap_bh);
4675 ext4_std_error(sb, err); 4755 ext4_std_error(sb, err);
4676 return; 4756 return;
@@ -4778,16 +4858,17 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
4778 ext4_lock_group(sb, block_group); 4858 ext4_lock_group(sb, block_group);
4779 mb_clear_bits(bitmap_bh->b_data, bit, count); 4859 mb_clear_bits(bitmap_bh->b_data, bit, count);
4780 mb_free_blocks(NULL, &e4b, bit, count); 4860 mb_free_blocks(NULL, &e4b, bit, count);
4781 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); 4861 blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
4782 ext4_free_blks_set(sb, desc, blk_free_count); 4862 ext4_free_group_clusters_set(sb, desc, blk_free_count);
4783 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); 4863 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
4784 ext4_unlock_group(sb, block_group); 4864 ext4_unlock_group(sb, block_group);
4785 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); 4865 percpu_counter_add(&sbi->s_freeclusters_counter,
4866 EXT4_B2C(sbi, blocks_freed));
4786 4867
4787 if (sbi->s_log_groups_per_flex) { 4868 if (sbi->s_log_groups_per_flex) {
4788 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 4869 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4789 atomic_add(blocks_freed, 4870 atomic_add(EXT4_B2C(sbi, blocks_freed),
4790 &sbi->s_flex_groups[flex_group].free_blocks); 4871 &sbi->s_flex_groups[flex_group].free_clusters);
4791 } 4872 }
4792 4873
4793 ext4_mb_unload_buddy(&e4b); 4874 ext4_mb_unload_buddy(&e4b);
@@ -4948,7 +5029,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4948 struct ext4_group_info *grp; 5029 struct ext4_group_info *grp;
4949 ext4_group_t first_group, last_group; 5030 ext4_group_t first_group, last_group;
4950 ext4_group_t group, ngroups = ext4_get_groups_count(sb); 5031 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
4951 ext4_grpblk_t cnt = 0, first_block, last_block; 5032 ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
4952 uint64_t start, len, minlen, trimmed = 0; 5033 uint64_t start, len, minlen, trimmed = 0;
4953 ext4_fsblk_t first_data_blk = 5034 ext4_fsblk_t first_data_blk =
4954 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 5035 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
@@ -4958,7 +5039,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4958 len = range->len >> sb->s_blocksize_bits; 5039 len = range->len >> sb->s_blocksize_bits;
4959 minlen = range->minlen >> sb->s_blocksize_bits; 5040 minlen = range->minlen >> sb->s_blocksize_bits;
4960 5041
4961 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) 5042 if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)))
4962 return -EINVAL; 5043 return -EINVAL;
4963 if (start + len <= first_data_blk) 5044 if (start + len <= first_data_blk)
4964 goto out; 5045 goto out;
@@ -4969,11 +5050,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4969 5050
4970 /* Determine first and last group to examine based on start and len */ 5051 /* Determine first and last group to examine based on start and len */
4971 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, 5052 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
4972 &first_group, &first_block); 5053 &first_group, &first_cluster);
4973 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), 5054 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
4974 &last_group, &last_block); 5055 &last_group, &last_cluster);
4975 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; 5056 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
4976 last_block = EXT4_BLOCKS_PER_GROUP(sb); 5057 last_cluster = EXT4_CLUSTERS_PER_GROUP(sb);
4977 5058
4978 if (first_group > last_group) 5059 if (first_group > last_group)
4979 return -EINVAL; 5060 return -EINVAL;
@@ -4993,20 +5074,20 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4993 * change it for the last group in which case start + 5074 * change it for the last group in which case start +
4994 * len < EXT4_BLOCKS_PER_GROUP(sb). 5075 * len < EXT4_BLOCKS_PER_GROUP(sb).
4995 */ 5076 */
4996 if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb)) 5077 if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb))
4997 last_block = first_block + len; 5078 last_cluster = first_cluster + len;
4998 len -= last_block - first_block; 5079 len -= last_cluster - first_cluster;
4999 5080
5000 if (grp->bb_free >= minlen) { 5081 if (grp->bb_free >= minlen) {
5001 cnt = ext4_trim_all_free(sb, group, first_block, 5082 cnt = ext4_trim_all_free(sb, group, first_cluster,
5002 last_block, minlen); 5083 last_cluster, minlen);
5003 if (cnt < 0) { 5084 if (cnt < 0) {
5004 ret = cnt; 5085 ret = cnt;
5005 break; 5086 break;
5006 } 5087 }
5007 } 5088 }
5008 trimmed += cnt; 5089 trimmed += cnt;
5009 first_block = 0; 5090 first_cluster = 0;
5010 } 5091 }
5011 range->len = trimmed * sb->s_blocksize; 5092 range->len = trimmed * sb->s_blocksize;
5012 5093
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 9d4a636b546..47705f3285e 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -106,7 +106,7 @@ struct ext4_free_data {
106 ext4_group_t group; 106 ext4_group_t group;
107 107
108 /* free block extent */ 108 /* free block extent */
109 ext4_grpblk_t start_blk; 109 ext4_grpblk_t start_cluster;
110 ext4_grpblk_t count; 110 ext4_grpblk_t count;
111 111
112 /* transaction which freed this extent */ 112 /* transaction which freed this extent */
@@ -139,9 +139,9 @@ enum {
139 139
140struct ext4_free_extent { 140struct ext4_free_extent {
141 ext4_lblk_t fe_logical; 141 ext4_lblk_t fe_logical;
142 ext4_grpblk_t fe_start; 142 ext4_grpblk_t fe_start; /* In cluster units */
143 ext4_group_t fe_group; 143 ext4_group_t fe_group;
144 ext4_grpblk_t fe_len; 144 ext4_grpblk_t fe_len; /* In cluster units */
145}; 145};
146 146
147/* 147/*
@@ -175,7 +175,7 @@ struct ext4_allocation_context {
175 /* the best found extent */ 175 /* the best found extent */
176 struct ext4_free_extent ac_b_ex; 176 struct ext4_free_extent ac_b_ex;
177 177
178 /* copy of the bext found extent taken before preallocation efforts */ 178 /* copy of the best found extent taken before preallocation efforts */
179 struct ext4_free_extent ac_f_ex; 179 struct ext4_free_extent ac_f_ex;
180 180
181 /* number of iterations done. we have to track to limit searching */ 181 /* number of iterations done. we have to track to limit searching */
@@ -216,6 +216,7 @@ struct ext4_buddy {
216static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 216static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
217 struct ext4_free_extent *fex) 217 struct ext4_free_extent *fex)
218{ 218{
219 return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start; 219 return ext4_group_first_block_no(sb, fex->fe_group) +
220 (fex->fe_start << EXT4_SB(sb)->s_cluster_bits);
220} 221}
221#endif 222#endif
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b57b98fb44d..16ac228dbec 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -15,19 +15,18 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include "ext4_jbd2.h" 17#include "ext4_jbd2.h"
18#include "ext4_extents.h"
19 18
20/* 19/*
21 * The contiguous blocks details which can be 20 * The contiguous blocks details which can be
22 * represented by a single extent 21 * represented by a single extent
23 */ 22 */
24struct list_blocks_struct { 23struct migrate_struct {
25 ext4_lblk_t first_block, last_block; 24 ext4_lblk_t first_block, last_block, curr_block;
26 ext4_fsblk_t first_pblock, last_pblock; 25 ext4_fsblk_t first_pblock, last_pblock;
27}; 26};
28 27
29static int finish_range(handle_t *handle, struct inode *inode, 28static int finish_range(handle_t *handle, struct inode *inode,
30 struct list_blocks_struct *lb) 29 struct migrate_struct *lb)
31 30
32{ 31{
33 int retval = 0, needed; 32 int retval = 0, needed;
@@ -87,8 +86,7 @@ err_out:
87} 86}
88 87
89static int update_extent_range(handle_t *handle, struct inode *inode, 88static int update_extent_range(handle_t *handle, struct inode *inode,
90 ext4_fsblk_t pblock, ext4_lblk_t blk_num, 89 ext4_fsblk_t pblock, struct migrate_struct *lb)
91 struct list_blocks_struct *lb)
92{ 90{
93 int retval; 91 int retval;
94 /* 92 /*
@@ -96,9 +94,10 @@ static int update_extent_range(handle_t *handle, struct inode *inode,
96 */ 94 */
97 if (lb->first_pblock && 95 if (lb->first_pblock &&
98 (lb->last_pblock+1 == pblock) && 96 (lb->last_pblock+1 == pblock) &&
99 (lb->last_block+1 == blk_num)) { 97 (lb->last_block+1 == lb->curr_block)) {
100 lb->last_pblock = pblock; 98 lb->last_pblock = pblock;
101 lb->last_block = blk_num; 99 lb->last_block = lb->curr_block;
100 lb->curr_block++;
102 return 0; 101 return 0;
103 } 102 }
104 /* 103 /*
@@ -106,64 +105,49 @@ static int update_extent_range(handle_t *handle, struct inode *inode,
106 */ 105 */
107 retval = finish_range(handle, inode, lb); 106 retval = finish_range(handle, inode, lb);
108 lb->first_pblock = lb->last_pblock = pblock; 107 lb->first_pblock = lb->last_pblock = pblock;
109 lb->first_block = lb->last_block = blk_num; 108 lb->first_block = lb->last_block = lb->curr_block;
110 109 lb->curr_block++;
111 return retval; 110 return retval;
112} 111}
113 112
114static int update_ind_extent_range(handle_t *handle, struct inode *inode, 113static int update_ind_extent_range(handle_t *handle, struct inode *inode,
115 ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, 114 ext4_fsblk_t pblock,
116 struct list_blocks_struct *lb) 115 struct migrate_struct *lb)
117{ 116{
118 struct buffer_head *bh; 117 struct buffer_head *bh;
119 __le32 *i_data; 118 __le32 *i_data;
120 int i, retval = 0; 119 int i, retval = 0;
121 ext4_lblk_t blk_count = *blk_nump;
122 unsigned long max_entries = inode->i_sb->s_blocksize >> 2; 120 unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
123 121
124 if (!pblock) {
125 /* Only update the file block number */
126 *blk_nump += max_entries;
127 return 0;
128 }
129
130 bh = sb_bread(inode->i_sb, pblock); 122 bh = sb_bread(inode->i_sb, pblock);
131 if (!bh) 123 if (!bh)
132 return -EIO; 124 return -EIO;
133 125
134 i_data = (__le32 *)bh->b_data; 126 i_data = (__le32 *)bh->b_data;
135 for (i = 0; i < max_entries; i++, blk_count++) { 127 for (i = 0; i < max_entries; i++) {
136 if (i_data[i]) { 128 if (i_data[i]) {
137 retval = update_extent_range(handle, inode, 129 retval = update_extent_range(handle, inode,
138 le32_to_cpu(i_data[i]), 130 le32_to_cpu(i_data[i]), lb);
139 blk_count, lb);
140 if (retval) 131 if (retval)
141 break; 132 break;
133 } else {
134 lb->curr_block++;
142 } 135 }
143 } 136 }
144
145 /* Update the file block number */
146 *blk_nump = blk_count;
147 put_bh(bh); 137 put_bh(bh);
148 return retval; 138 return retval;
149 139
150} 140}
151 141
152static int update_dind_extent_range(handle_t *handle, struct inode *inode, 142static int update_dind_extent_range(handle_t *handle, struct inode *inode,
153 ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, 143 ext4_fsblk_t pblock,
154 struct list_blocks_struct *lb) 144 struct migrate_struct *lb)
155{ 145{
156 struct buffer_head *bh; 146 struct buffer_head *bh;
157 __le32 *i_data; 147 __le32 *i_data;
158 int i, retval = 0; 148 int i, retval = 0;
159 ext4_lblk_t blk_count = *blk_nump;
160 unsigned long max_entries = inode->i_sb->s_blocksize >> 2; 149 unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
161 150
162 if (!pblock) {
163 /* Only update the file block number */
164 *blk_nump += max_entries * max_entries;
165 return 0;
166 }
167 bh = sb_bread(inode->i_sb, pblock); 151 bh = sb_bread(inode->i_sb, pblock);
168 if (!bh) 152 if (!bh)
169 return -EIO; 153 return -EIO;
@@ -172,38 +156,28 @@ static int update_dind_extent_range(handle_t *handle, struct inode *inode,
172 for (i = 0; i < max_entries; i++) { 156 for (i = 0; i < max_entries; i++) {
173 if (i_data[i]) { 157 if (i_data[i]) {
174 retval = update_ind_extent_range(handle, inode, 158 retval = update_ind_extent_range(handle, inode,
175 le32_to_cpu(i_data[i]), 159 le32_to_cpu(i_data[i]), lb);
176 &blk_count, lb);
177 if (retval) 160 if (retval)
178 break; 161 break;
179 } else { 162 } else {
180 /* Only update the file block number */ 163 /* Only update the file block number */
181 blk_count += max_entries; 164 lb->curr_block += max_entries;
182 } 165 }
183 } 166 }
184
185 /* Update the file block number */
186 *blk_nump = blk_count;
187 put_bh(bh); 167 put_bh(bh);
188 return retval; 168 return retval;
189 169
190} 170}
191 171
192static int update_tind_extent_range(handle_t *handle, struct inode *inode, 172static int update_tind_extent_range(handle_t *handle, struct inode *inode,
193 ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, 173 ext4_fsblk_t pblock,
194 struct list_blocks_struct *lb) 174 struct migrate_struct *lb)
195{ 175{
196 struct buffer_head *bh; 176 struct buffer_head *bh;
197 __le32 *i_data; 177 __le32 *i_data;
198 int i, retval = 0; 178 int i, retval = 0;
199 ext4_lblk_t blk_count = *blk_nump;
200 unsigned long max_entries = inode->i_sb->s_blocksize >> 2; 179 unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
201 180
202 if (!pblock) {
203 /* Only update the file block number */
204 *blk_nump += max_entries * max_entries * max_entries;
205 return 0;
206 }
207 bh = sb_bread(inode->i_sb, pblock); 181 bh = sb_bread(inode->i_sb, pblock);
208 if (!bh) 182 if (!bh)
209 return -EIO; 183 return -EIO;
@@ -212,16 +186,14 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode,
212 for (i = 0; i < max_entries; i++) { 186 for (i = 0; i < max_entries; i++) {
213 if (i_data[i]) { 187 if (i_data[i]) {
214 retval = update_dind_extent_range(handle, inode, 188 retval = update_dind_extent_range(handle, inode,
215 le32_to_cpu(i_data[i]), 189 le32_to_cpu(i_data[i]), lb);
216 &blk_count, lb);
217 if (retval) 190 if (retval)
218 break; 191 break;
219 } else 192 } else {
220 /* Only update the file block number */ 193 /* Only update the file block number */
221 blk_count += max_entries * max_entries; 194 lb->curr_block += max_entries * max_entries;
195 }
222 } 196 }
223 /* Update the file block number */
224 *blk_nump = blk_count;
225 put_bh(bh); 197 put_bh(bh);
226 return retval; 198 return retval;
227 199
@@ -462,12 +434,12 @@ int ext4_ext_migrate(struct inode *inode)
462 handle_t *handle; 434 handle_t *handle;
463 int retval = 0, i; 435 int retval = 0, i;
464 __le32 *i_data; 436 __le32 *i_data;
465 ext4_lblk_t blk_count = 0;
466 struct ext4_inode_info *ei; 437 struct ext4_inode_info *ei;
467 struct inode *tmp_inode = NULL; 438 struct inode *tmp_inode = NULL;
468 struct list_blocks_struct lb; 439 struct migrate_struct lb;
469 unsigned long max_entries; 440 unsigned long max_entries;
470 __u32 goal; 441 __u32 goal;
442 uid_t owner[2];
471 443
472 /* 444 /*
473 * If the filesystem does not support extents, or the inode 445 * If the filesystem does not support extents, or the inode
@@ -495,10 +467,12 @@ int ext4_ext_migrate(struct inode *inode)
495 } 467 }
496 goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * 468 goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
497 EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; 469 EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
470 owner[0] = inode->i_uid;
471 owner[1] = inode->i_gid;
498 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, 472 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
499 S_IFREG, NULL, goal); 473 S_IFREG, NULL, goal, owner);
500 if (IS_ERR(tmp_inode)) { 474 if (IS_ERR(tmp_inode)) {
501 retval = -ENOMEM; 475 retval = PTR_ERR(inode);
502 ext4_journal_stop(handle); 476 ext4_journal_stop(handle);
503 return retval; 477 return retval;
504 } 478 }
@@ -507,7 +481,7 @@ int ext4_ext_migrate(struct inode *inode)
507 * Set the i_nlink to zero so it will be deleted later 481 * Set the i_nlink to zero so it will be deleted later
508 * when we drop inode reference. 482 * when we drop inode reference.
509 */ 483 */
510 tmp_inode->i_nlink = 0; 484 clear_nlink(tmp_inode);
511 485
512 ext4_ext_tree_init(handle, tmp_inode); 486 ext4_ext_tree_init(handle, tmp_inode);
513 ext4_orphan_add(handle, tmp_inode); 487 ext4_orphan_add(handle, tmp_inode);
@@ -551,35 +525,32 @@ int ext4_ext_migrate(struct inode *inode)
551 525
552 /* 32 bit block address 4 bytes */ 526 /* 32 bit block address 4 bytes */
553 max_entries = inode->i_sb->s_blocksize >> 2; 527 max_entries = inode->i_sb->s_blocksize >> 2;
554 for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) { 528 for (i = 0; i < EXT4_NDIR_BLOCKS; i++) {
555 if (i_data[i]) { 529 if (i_data[i]) {
556 retval = update_extent_range(handle, tmp_inode, 530 retval = update_extent_range(handle, tmp_inode,
557 le32_to_cpu(i_data[i]), 531 le32_to_cpu(i_data[i]), &lb);
558 blk_count, &lb);
559 if (retval) 532 if (retval)
560 goto err_out; 533 goto err_out;
561 } 534 } else
535 lb.curr_block++;
562 } 536 }
563 if (i_data[EXT4_IND_BLOCK]) { 537 if (i_data[EXT4_IND_BLOCK]) {
564 retval = update_ind_extent_range(handle, tmp_inode, 538 retval = update_ind_extent_range(handle, tmp_inode,
565 le32_to_cpu(i_data[EXT4_IND_BLOCK]), 539 le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb);
566 &blk_count, &lb);
567 if (retval) 540 if (retval)
568 goto err_out; 541 goto err_out;
569 } else 542 } else
570 blk_count += max_entries; 543 lb.curr_block += max_entries;
571 if (i_data[EXT4_DIND_BLOCK]) { 544 if (i_data[EXT4_DIND_BLOCK]) {
572 retval = update_dind_extent_range(handle, tmp_inode, 545 retval = update_dind_extent_range(handle, tmp_inode,
573 le32_to_cpu(i_data[EXT4_DIND_BLOCK]), 546 le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb);
574 &blk_count, &lb);
575 if (retval) 547 if (retval)
576 goto err_out; 548 goto err_out;
577 } else 549 } else
578 blk_count += max_entries * max_entries; 550 lb.curr_block += max_entries * max_entries;
579 if (i_data[EXT4_TIND_BLOCK]) { 551 if (i_data[EXT4_TIND_BLOCK]) {
580 retval = update_tind_extent_range(handle, tmp_inode, 552 retval = update_tind_extent_range(handle, tmp_inode,
581 le32_to_cpu(i_data[EXT4_TIND_BLOCK]), 553 le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb);
582 &blk_count, &lb);
583 if (retval) 554 if (retval)
584 goto err_out; 555 goto err_out;
585 } 556 }
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 9bdef3f537c..7ea4ba4eff2 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -109,7 +109,7 @@ static int kmmpd(void *data)
109 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); 109 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
110 bdevname(bh->b_bdev, mmp->mmp_bdevname); 110 bdevname(bh->b_bdev, mmp->mmp_bdevname);
111 111
112 memcpy(mmp->mmp_nodename, init_utsname()->sysname, 112 memcpy(mmp->mmp_nodename, init_utsname()->nodename,
113 sizeof(mmp->mmp_nodename)); 113 sizeof(mmp->mmp_nodename));
114 114
115 while (!kthread_should_stop()) { 115 while (!kthread_should_stop()) {
@@ -125,8 +125,9 @@ static int kmmpd(void *data)
125 * Don't spew too many error messages. Print one every 125 * Don't spew too many error messages. Print one every
126 * (s_mmp_update_interval * 60) seconds. 126 * (s_mmp_update_interval * 60) seconds.
127 */ 127 */
128 if (retval && (failed_writes % 60) == 0) { 128 if (retval) {
129 ext4_error(sb, "Error writing to MMP block"); 129 if ((failed_writes % 60) == 0)
130 ext4_error(sb, "Error writing to MMP block");
130 failed_writes++; 131 failed_writes++;
131 } 132 }
132 133
@@ -295,7 +296,8 @@ skip:
295 /* 296 /*
296 * write a new random sequence number. 297 * write a new random sequence number.
297 */ 298 */
298 mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); 299 seq = mmp_new_seq();
300 mmp->mmp_seq = cpu_to_le32(seq);
299 301
300 retval = write_mmp_block(bh); 302 retval = write_mmp_block(bh);
301 if (retval) 303 if (retval)
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index f57455a1b1b..c5826c623e7 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -17,7 +17,6 @@
17#include <linux/quotaops.h> 17#include <linux/quotaops.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include "ext4_jbd2.h" 19#include "ext4_jbd2.h"
20#include "ext4_extents.h"
21#include "ext4.h" 20#include "ext4.h"
22 21
23/** 22/**
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 1c924faeb6c..aa4c782c9dd 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1586,7 +1586,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1586 dxtrace(dx_show_index("node", frames[1].entries)); 1586 dxtrace(dx_show_index("node", frames[1].entries));
1587 dxtrace(dx_show_index("node", 1587 dxtrace(dx_show_index("node",
1588 ((struct dx_node *) bh2->b_data)->entries)); 1588 ((struct dx_node *) bh2->b_data)->entries));
1589 err = ext4_handle_dirty_metadata(handle, inode, bh2); 1589 err = ext4_handle_dirty_metadata(handle, dir, bh2);
1590 if (err) 1590 if (err)
1591 goto journal_error; 1591 goto journal_error;
1592 brelse (bh2); 1592 brelse (bh2);
@@ -1612,7 +1612,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1612 if (err) 1612 if (err)
1613 goto journal_error; 1613 goto journal_error;
1614 } 1614 }
1615 err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh); 1615 err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
1616 if (err) { 1616 if (err) {
1617 ext4_std_error(inode->i_sb, err); 1617 ext4_std_error(inode->i_sb, err);
1618 goto cleanup; 1618 goto cleanup;
@@ -1694,7 +1694,7 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode)
1694 if (is_dx(inode) && inode->i_nlink > 1) { 1694 if (is_dx(inode) && inode->i_nlink > 1) {
1695 /* limit is 16-bit i_links_count */ 1695 /* limit is 16-bit i_links_count */
1696 if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) { 1696 if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) {
1697 inode->i_nlink = 1; 1697 set_nlink(inode, 1);
1698 EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb, 1698 EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb,
1699 EXT4_FEATURE_RO_COMPAT_DIR_NLINK); 1699 EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
1700 } 1700 }
@@ -1707,9 +1707,8 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode)
1707 */ 1707 */
1708static void ext4_dec_count(handle_t *handle, struct inode *inode) 1708static void ext4_dec_count(handle_t *handle, struct inode *inode)
1709{ 1709{
1710 drop_nlink(inode); 1710 if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
1711 if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0) 1711 drop_nlink(inode);
1712 inc_nlink(inode);
1713} 1712}
1714 1713
1715 1714
@@ -1756,7 +1755,7 @@ retry:
1756 if (IS_DIRSYNC(dir)) 1755 if (IS_DIRSYNC(dir))
1757 ext4_handle_sync(handle); 1756 ext4_handle_sync(handle);
1758 1757
1759 inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); 1758 inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
1760 err = PTR_ERR(inode); 1759 err = PTR_ERR(inode);
1761 if (!IS_ERR(inode)) { 1760 if (!IS_ERR(inode)) {
1762 inode->i_op = &ext4_file_inode_operations; 1761 inode->i_op = &ext4_file_inode_operations;
@@ -1792,7 +1791,7 @@ retry:
1792 if (IS_DIRSYNC(dir)) 1791 if (IS_DIRSYNC(dir))
1793 ext4_handle_sync(handle); 1792 ext4_handle_sync(handle);
1794 1793
1795 inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); 1794 inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
1796 err = PTR_ERR(inode); 1795 err = PTR_ERR(inode);
1797 if (!IS_ERR(inode)) { 1796 if (!IS_ERR(inode)) {
1798 init_special_inode(inode, inode->i_mode, rdev); 1797 init_special_inode(inode, inode->i_mode, rdev);
@@ -1832,7 +1831,7 @@ retry:
1832 ext4_handle_sync(handle); 1831 ext4_handle_sync(handle);
1833 1832
1834 inode = ext4_new_inode(handle, dir, S_IFDIR | mode, 1833 inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
1835 &dentry->d_name, 0); 1834 &dentry->d_name, 0, NULL);
1836 err = PTR_ERR(inode); 1835 err = PTR_ERR(inode);
1837 if (IS_ERR(inode)) 1836 if (IS_ERR(inode))
1838 goto out_stop; 1837 goto out_stop;
@@ -1861,9 +1860,9 @@ retry:
1861 de->name_len = 2; 1860 de->name_len = 2;
1862 strcpy(de->name, ".."); 1861 strcpy(de->name, "..");
1863 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1862 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1864 inode->i_nlink = 2; 1863 set_nlink(inode, 2);
1865 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); 1864 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
1866 err = ext4_handle_dirty_metadata(handle, dir, dir_block); 1865 err = ext4_handle_dirty_metadata(handle, inode, dir_block);
1867 if (err) 1866 if (err)
1868 goto out_clear_inode; 1867 goto out_clear_inode;
1869 err = ext4_mark_inode_dirty(handle, inode); 1868 err = ext4_mark_inode_dirty(handle, inode);
@@ -2214,7 +2213,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2214 ext4_warning(inode->i_sb, 2213 ext4_warning(inode->i_sb,
2215 "Deleting nonexistent file (%lu), %d", 2214 "Deleting nonexistent file (%lu), %d",
2216 inode->i_ino, inode->i_nlink); 2215 inode->i_ino, inode->i_nlink);
2217 inode->i_nlink = 1; 2216 set_nlink(inode, 1);
2218 } 2217 }
2219 retval = ext4_delete_entry(handle, dir, de, bh); 2218 retval = ext4_delete_entry(handle, dir, de, bh);
2220 if (retval) 2219 if (retval)
@@ -2279,7 +2278,7 @@ retry:
2279 ext4_handle_sync(handle); 2278 ext4_handle_sync(handle);
2280 2279
2281 inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, 2280 inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
2282 &dentry->d_name, 0); 2281 &dentry->d_name, 0, NULL);
2283 err = PTR_ERR(inode); 2282 err = PTR_ERR(inode);
2284 if (IS_ERR(inode)) 2283 if (IS_ERR(inode))
2285 goto out_stop; 2284 goto out_stop;
@@ -2530,7 +2529,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2530 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = 2529 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2531 cpu_to_le32(new_dir->i_ino); 2530 cpu_to_le32(new_dir->i_ino);
2532 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 2531 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2533 retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh); 2532 retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh);
2534 if (retval) { 2533 if (retval) {
2535 ext4_std_error(old_dir->i_sb, retval); 2534 ext4_std_error(old_dir->i_sb, retval);
2536 goto end_rename; 2535 goto end_rename;
@@ -2539,7 +2538,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2539 if (new_inode) { 2538 if (new_inode) {
2540 /* checked empty_dir above, can't have another parent, 2539 /* checked empty_dir above, can't have another parent,
2541 * ext4_dec_count() won't work for many-linked dirs */ 2540 * ext4_dec_count() won't work for many-linked dirs */
2542 new_inode->i_nlink = 0; 2541 clear_nlink(new_inode);
2543 } else { 2542 } else {
2544 ext4_inc_count(handle, new_dir); 2543 ext4_inc_count(handle, new_dir);
2545 ext4_update_dx_flag(new_dir); 2544 ext4_update_dx_flag(new_dir);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 92f38ee13f8..7ce1d0b19c9 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -70,7 +70,6 @@ static void put_io_page(struct ext4_io_page *io_page)
70void ext4_free_io_end(ext4_io_end_t *io) 70void ext4_free_io_end(ext4_io_end_t *io)
71{ 71{
72 int i; 72 int i;
73 wait_queue_head_t *wq;
74 73
75 BUG_ON(!io); 74 BUG_ON(!io);
76 if (io->page) 75 if (io->page)
@@ -78,56 +77,43 @@ void ext4_free_io_end(ext4_io_end_t *io)
78 for (i = 0; i < io->num_io_pages; i++) 77 for (i = 0; i < io->num_io_pages; i++)
79 put_io_page(io->pages[i]); 78 put_io_page(io->pages[i]);
80 io->num_io_pages = 0; 79 io->num_io_pages = 0;
81 wq = ext4_ioend_wq(io->inode); 80 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count))
82 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && 81 wake_up_all(ext4_ioend_wq(io->inode));
83 waitqueue_active(wq))
84 wake_up_all(wq);
85 kmem_cache_free(io_end_cachep, io); 82 kmem_cache_free(io_end_cachep, io);
86} 83}
87 84
88/* 85/*
89 * check a range of space and convert unwritten extents to written. 86 * check a range of space and convert unwritten extents to written.
87 *
88 * Called with inode->i_mutex; we depend on this when we manipulate
89 * io->flag, since we could otherwise race with ext4_flush_completed_IO()
90 */ 90 */
91int ext4_end_io_nolock(ext4_io_end_t *io) 91int ext4_end_io_nolock(ext4_io_end_t *io)
92{ 92{
93 struct inode *inode = io->inode; 93 struct inode *inode = io->inode;
94 loff_t offset = io->offset; 94 loff_t offset = io->offset;
95 ssize_t size = io->size; 95 ssize_t size = io->size;
96 wait_queue_head_t *wq;
97 int ret = 0; 96 int ret = 0;
98 97
99 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," 98 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
100 "list->prev 0x%p\n", 99 "list->prev 0x%p\n",
101 io, inode->i_ino, io->list.next, io->list.prev); 100 io, inode->i_ino, io->list.next, io->list.prev);
102 101
103 if (list_empty(&io->list))
104 return ret;
105
106 if (!(io->flag & EXT4_IO_END_UNWRITTEN))
107 return ret;
108
109 ret = ext4_convert_unwritten_extents(inode, offset, size); 102 ret = ext4_convert_unwritten_extents(inode, offset, size);
110 if (ret < 0) { 103 if (ret < 0) {
111 printk(KERN_EMERG "%s: failed to convert unwritten " 104 ext4_msg(inode->i_sb, KERN_EMERG,
112 "extents to written extents, error is %d " 105 "failed to convert unwritten extents to written "
113 "io is still on inode %lu aio dio list\n", 106 "extents -- potential data loss! "
114 __func__, ret, inode->i_ino); 107 "(inode %lu, offset %llu, size %zd, error %d)",
115 return ret; 108 inode->i_ino, offset, size, ret);
116 } 109 }
117 110
118 if (io->iocb) 111 if (io->iocb)
119 aio_complete(io->iocb, io->result, 0); 112 aio_complete(io->iocb, io->result, 0);
120 /* clear the DIO AIO unwritten flag */
121 if (io->flag & EXT4_IO_END_UNWRITTEN) {
122 io->flag &= ~EXT4_IO_END_UNWRITTEN;
123 /* Wake up anyone waiting on unwritten extent conversion */
124 wq = ext4_ioend_wq(io->inode);
125 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) &&
126 waitqueue_active(wq)) {
127 wake_up_all(wq);
128 }
129 }
130 113
114 /* Wake up anyone waiting on unwritten extent conversion */
115 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))
116 wake_up_all(ext4_ioend_wq(io->inode));
131 return ret; 117 return ret;
132} 118}
133 119
@@ -140,9 +126,15 @@ static void ext4_end_io_work(struct work_struct *work)
140 struct inode *inode = io->inode; 126 struct inode *inode = io->inode;
141 struct ext4_inode_info *ei = EXT4_I(inode); 127 struct ext4_inode_info *ei = EXT4_I(inode);
142 unsigned long flags; 128 unsigned long flags;
143 int ret; 129
130 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
131 if (list_empty(&io->list)) {
132 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
133 goto free;
134 }
144 135
145 if (!mutex_trylock(&inode->i_mutex)) { 136 if (!mutex_trylock(&inode->i_mutex)) {
137 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
146 /* 138 /*
147 * Requeue the work instead of waiting so that the work 139 * Requeue the work instead of waiting so that the work
148 * items queued after this can be processed. 140 * items queued after this can be processed.
@@ -159,17 +151,11 @@ static void ext4_end_io_work(struct work_struct *work)
159 io->flag |= EXT4_IO_END_QUEUED; 151 io->flag |= EXT4_IO_END_QUEUED;
160 return; 152 return;
161 } 153 }
162 ret = ext4_end_io_nolock(io); 154 list_del_init(&io->list);
163 if (ret < 0) {
164 mutex_unlock(&inode->i_mutex);
165 return;
166 }
167
168 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
169 if (!list_empty(&io->list))
170 list_del_init(&io->list);
171 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 155 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
156 (void) ext4_end_io_nolock(io);
172 mutex_unlock(&inode->i_mutex); 157 mutex_unlock(&inode->i_mutex);
158free:
173 ext4_free_io_end(io); 159 ext4_free_io_end(io);
174} 160}
175 161
@@ -350,10 +336,8 @@ submit_and_retry:
350 if ((io_end->num_io_pages >= MAX_IO_PAGES) && 336 if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
351 (io_end->pages[io_end->num_io_pages-1] != io_page)) 337 (io_end->pages[io_end->num_io_pages-1] != io_page))
352 goto submit_and_retry; 338 goto submit_and_retry;
353 if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 339 if (buffer_uninit(bh))
354 io_end->flag |= EXT4_IO_END_UNWRITTEN; 340 ext4_set_io_unwritten_flag(inode, io_end);
355 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
356 }
357 io->io_end->size += bh->b_size; 341 io->io_end->size += bh->b_size;
358 io->io_next_block++; 342 io->io_next_block++;
359 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); 343 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 707d3f16f7c..996780ab4f4 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -875,7 +875,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
875 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ 875 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
876 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ 876 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
877 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ 877 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
878 ext4_free_blks_set(sb, gdp, input->free_blocks_count); 878 ext4_free_group_clusters_set(sb, gdp, input->free_blocks_count);
879 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); 879 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
880 gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED); 880 gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
881 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); 881 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
@@ -937,8 +937,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
937 input->reserved_blocks); 937 input->reserved_blocks);
938 938
939 /* Update the free space counts */ 939 /* Update the free space counts */
940 percpu_counter_add(&sbi->s_freeblocks_counter, 940 percpu_counter_add(&sbi->s_freeclusters_counter,
941 input->free_blocks_count); 941 EXT4_B2C(sbi, input->free_blocks_count));
942 percpu_counter_add(&sbi->s_freeinodes_counter, 942 percpu_counter_add(&sbi->s_freeinodes_counter,
943 EXT4_INODES_PER_GROUP(sb)); 943 EXT4_INODES_PER_GROUP(sb));
944 944
@@ -946,8 +946,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
946 sbi->s_log_groups_per_flex) { 946 sbi->s_log_groups_per_flex) {
947 ext4_group_t flex_group; 947 ext4_group_t flex_group;
948 flex_group = ext4_flex_group(sbi, input->group); 948 flex_group = ext4_flex_group(sbi, input->group);
949 atomic_add(input->free_blocks_count, 949 atomic_add(EXT4_B2C(sbi, input->free_blocks_count),
950 &sbi->s_flex_groups[flex_group].free_blocks); 950 &sbi->s_flex_groups[flex_group].free_clusters);
951 atomic_add(EXT4_INODES_PER_GROUP(sb), 951 atomic_add(EXT4_INODES_PER_GROUP(sb),
952 &sbi->s_flex_groups[flex_group].free_inodes); 952 &sbi->s_flex_groups[flex_group].free_inodes);
953 } 953 }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 44d0c8db223..9953d80145a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,6 +45,7 @@
45#include <linux/freezer.h> 45#include <linux/freezer.h>
46 46
47#include "ext4.h" 47#include "ext4.h"
48#include "ext4_extents.h"
48#include "ext4_jbd2.h" 49#include "ext4_jbd2.h"
49#include "xattr.h" 50#include "xattr.h"
50#include "acl.h" 51#include "acl.h"
@@ -163,8 +164,8 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
163 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); 164 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
164} 165}
165 166
166__u32 ext4_free_blks_count(struct super_block *sb, 167__u32 ext4_free_group_clusters(struct super_block *sb,
167 struct ext4_group_desc *bg) 168 struct ext4_group_desc *bg)
168{ 169{
169 return le16_to_cpu(bg->bg_free_blocks_count_lo) | 170 return le16_to_cpu(bg->bg_free_blocks_count_lo) |
170 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 171 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
@@ -219,8 +220,8 @@ void ext4_inode_table_set(struct super_block *sb,
219 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); 220 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
220} 221}
221 222
222void ext4_free_blks_set(struct super_block *sb, 223void ext4_free_group_clusters_set(struct super_block *sb,
223 struct ext4_group_desc *bg, __u32 count) 224 struct ext4_group_desc *bg, __u32 count)
224{ 225{
225 bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); 226 bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
226 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) 227 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
@@ -414,6 +415,22 @@ static void save_error_info(struct super_block *sb, const char *func,
414 ext4_commit_super(sb, 1); 415 ext4_commit_super(sb, 1);
415} 416}
416 417
418/*
419 * The del_gendisk() function uninitializes the disk-specific data
420 * structures, including the bdi structure, without telling anyone
421 * else. Once this happens, any attempt to call mark_buffer_dirty()
422 * (for example, by ext4_commit_super), will cause a kernel OOPS.
423 * This is a kludge to prevent these oops until we can put in a proper
424 * hook in del_gendisk() to inform the VFS and file system layers.
425 */
426static int block_device_ejected(struct super_block *sb)
427{
428 struct inode *bd_inode = sb->s_bdev->bd_inode;
429 struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info;
430
431 return bdi->dev == NULL;
432}
433
417 434
418/* Deal with the reporting of failure conditions on a filesystem such as 435/* Deal with the reporting of failure conditions on a filesystem such as
419 * inconsistencies detected or read IO failures. 436 * inconsistencies detected or read IO failures.
@@ -821,10 +838,10 @@ static void ext4_put_super(struct super_block *sb)
821 brelse(sbi->s_group_desc[i]); 838 brelse(sbi->s_group_desc[i]);
822 ext4_kvfree(sbi->s_group_desc); 839 ext4_kvfree(sbi->s_group_desc);
823 ext4_kvfree(sbi->s_flex_groups); 840 ext4_kvfree(sbi->s_flex_groups);
824 percpu_counter_destroy(&sbi->s_freeblocks_counter); 841 percpu_counter_destroy(&sbi->s_freeclusters_counter);
825 percpu_counter_destroy(&sbi->s_freeinodes_counter); 842 percpu_counter_destroy(&sbi->s_freeinodes_counter);
826 percpu_counter_destroy(&sbi->s_dirs_counter); 843 percpu_counter_destroy(&sbi->s_dirs_counter);
827 percpu_counter_destroy(&sbi->s_dirtyblocks_counter); 844 percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
828 brelse(sbi->s_sbh); 845 brelse(sbi->s_sbh);
829#ifdef CONFIG_QUOTA 846#ifdef CONFIG_QUOTA
830 for (i = 0; i < MAXQUOTAS; i++) 847 for (i = 0; i < MAXQUOTAS; i++)
@@ -1057,8 +1074,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1057 seq_puts(seq, ",nouid32"); 1074 seq_puts(seq, ",nouid32");
1058 if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG)) 1075 if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG))
1059 seq_puts(seq, ",debug"); 1076 seq_puts(seq, ",debug");
1060 if (test_opt(sb, OLDALLOC))
1061 seq_puts(seq, ",oldalloc");
1062#ifdef CONFIG_EXT4_FS_XATTR 1077#ifdef CONFIG_EXT4_FS_XATTR
1063 if (test_opt(sb, XATTR_USER)) 1078 if (test_opt(sb, XATTR_USER))
1064 seq_puts(seq, ",user_xattr"); 1079 seq_puts(seq, ",user_xattr");
@@ -1567,10 +1582,12 @@ static int parse_options(char *options, struct super_block *sb,
1567 set_opt(sb, DEBUG); 1582 set_opt(sb, DEBUG);
1568 break; 1583 break;
1569 case Opt_oldalloc: 1584 case Opt_oldalloc:
1570 set_opt(sb, OLDALLOC); 1585 ext4_msg(sb, KERN_WARNING,
1586 "Ignoring deprecated oldalloc option");
1571 break; 1587 break;
1572 case Opt_orlov: 1588 case Opt_orlov:
1573 clear_opt(sb, OLDALLOC); 1589 ext4_msg(sb, KERN_WARNING,
1590 "Ignoring deprecated orlov option");
1574 break; 1591 break;
1575#ifdef CONFIG_EXT4_FS_XATTR 1592#ifdef CONFIG_EXT4_FS_XATTR
1576 case Opt_user_xattr: 1593 case Opt_user_xattr:
@@ -1801,6 +1818,7 @@ set_qf_format:
1801 break; 1818 break;
1802 case Opt_nodelalloc: 1819 case Opt_nodelalloc:
1803 clear_opt(sb, DELALLOC); 1820 clear_opt(sb, DELALLOC);
1821 clear_opt2(sb, EXPLICIT_DELALLOC);
1804 break; 1822 break;
1805 case Opt_mblk_io_submit: 1823 case Opt_mblk_io_submit:
1806 set_opt(sb, MBLK_IO_SUBMIT); 1824 set_opt(sb, MBLK_IO_SUBMIT);
@@ -1817,6 +1835,7 @@ set_qf_format:
1817 break; 1835 break;
1818 case Opt_delalloc: 1836 case Opt_delalloc:
1819 set_opt(sb, DELALLOC); 1837 set_opt(sb, DELALLOC);
1838 set_opt2(sb, EXPLICIT_DELALLOC);
1820 break; 1839 break;
1821 case Opt_block_validity: 1840 case Opt_block_validity:
1822 set_opt(sb, BLOCK_VALIDITY); 1841 set_opt(sb, BLOCK_VALIDITY);
@@ -1935,7 +1954,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1935 res = MS_RDONLY; 1954 res = MS_RDONLY;
1936 } 1955 }
1937 if (read_only) 1956 if (read_only)
1938 return res; 1957 goto done;
1939 if (!(sbi->s_mount_state & EXT4_VALID_FS)) 1958 if (!(sbi->s_mount_state & EXT4_VALID_FS))
1940 ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " 1959 ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
1941 "running e2fsck is recommended"); 1960 "running e2fsck is recommended");
@@ -1966,6 +1985,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1966 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 1985 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
1967 1986
1968 ext4_commit_super(sb, 1); 1987 ext4_commit_super(sb, 1);
1988done:
1969 if (test_opt(sb, DEBUG)) 1989 if (test_opt(sb, DEBUG))
1970 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " 1990 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
1971 "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", 1991 "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
@@ -2015,8 +2035,8 @@ static int ext4_fill_flex_info(struct super_block *sb)
2015 flex_group = ext4_flex_group(sbi, i); 2035 flex_group = ext4_flex_group(sbi, i);
2016 atomic_add(ext4_free_inodes_count(sb, gdp), 2036 atomic_add(ext4_free_inodes_count(sb, gdp),
2017 &sbi->s_flex_groups[flex_group].free_inodes); 2037 &sbi->s_flex_groups[flex_group].free_inodes);
2018 atomic_add(ext4_free_blks_count(sb, gdp), 2038 atomic_add(ext4_free_group_clusters(sb, gdp),
2019 &sbi->s_flex_groups[flex_group].free_blocks); 2039 &sbi->s_flex_groups[flex_group].free_clusters);
2020 atomic_add(ext4_used_dirs_count(sb, gdp), 2040 atomic_add(ext4_used_dirs_count(sb, gdp),
2021 &sbi->s_flex_groups[flex_group].used_dirs); 2041 &sbi->s_flex_groups[flex_group].used_dirs);
2022 } 2042 }
@@ -2134,7 +2154,8 @@ static int ext4_check_descriptors(struct super_block *sb,
2134 if (NULL != first_not_zeroed) 2154 if (NULL != first_not_zeroed)
2135 *first_not_zeroed = grp; 2155 *first_not_zeroed = grp;
2136 2156
2137 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); 2157 ext4_free_blocks_count_set(sbi->s_es,
2158 EXT4_C2B(sbi, ext4_count_free_clusters(sb)));
2138 sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); 2159 sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
2139 return 1; 2160 return 1;
2140} 2161}
@@ -2454,7 +2475,8 @@ static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
2454 char *buf) 2475 char *buf)
2455{ 2476{
2456 return snprintf(buf, PAGE_SIZE, "%llu\n", 2477 return snprintf(buf, PAGE_SIZE, "%llu\n",
2457 (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 2478 (s64) EXT4_C2B(sbi,
2479 percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
2458} 2480}
2459 2481
2460static ssize_t session_write_kbytes_show(struct ext4_attr *a, 2482static ssize_t session_write_kbytes_show(struct ext4_attr *a,
@@ -2682,6 +2704,13 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
2682 return 0; 2704 return 0;
2683 } 2705 }
2684 } 2706 }
2707 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) &&
2708 !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2709 ext4_msg(sb, KERN_ERR,
2710 "Can't support bigalloc feature without "
2711 "extents feature\n");
2712 return 0;
2713 }
2685 return 1; 2714 return 1;
2686} 2715}
2687 2716
@@ -3087,10 +3116,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3087 char *cp; 3116 char *cp;
3088 const char *descr; 3117 const char *descr;
3089 int ret = -ENOMEM; 3118 int ret = -ENOMEM;
3090 int blocksize; 3119 int blocksize, clustersize;
3091 unsigned int db_count; 3120 unsigned int db_count;
3092 unsigned int i; 3121 unsigned int i;
3093 int needs_recovery, has_huge_files; 3122 int needs_recovery, has_huge_files, has_bigalloc;
3094 __u64 blocks_count; 3123 __u64 blocks_count;
3095 int err; 3124 int err;
3096 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 3125 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
@@ -3224,6 +3253,33 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3224 &journal_ioprio, NULL, 0)) 3253 &journal_ioprio, NULL, 0))
3225 goto failed_mount; 3254 goto failed_mount;
3226 3255
3256 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
3257 printk_once(KERN_WARNING "EXT4-fs: Warning: mounting "
3258 "with data=journal disables delayed "
3259 "allocation and O_DIRECT support!\n");
3260 if (test_opt2(sb, EXPLICIT_DELALLOC)) {
3261 ext4_msg(sb, KERN_ERR, "can't mount with "
3262 "both data=journal and delalloc");
3263 goto failed_mount;
3264 }
3265 if (test_opt(sb, DIOREAD_NOLOCK)) {
3266 ext4_msg(sb, KERN_ERR, "can't mount with "
3267 "both data=journal and delalloc");
3268 goto failed_mount;
3269 }
3270 if (test_opt(sb, DELALLOC))
3271 clear_opt(sb, DELALLOC);
3272 }
3273
3274 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
3275 if (test_opt(sb, DIOREAD_NOLOCK)) {
3276 if (blocksize < PAGE_SIZE) {
3277 ext4_msg(sb, KERN_ERR, "can't mount with "
3278 "dioread_nolock if block size != PAGE_SIZE");
3279 goto failed_mount;
3280 }
3281 }
3282
3227 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 3283 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
3228 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); 3284 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
3229 3285
@@ -3265,8 +3321,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3265 if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) 3321 if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
3266 goto failed_mount; 3322 goto failed_mount;
3267 3323
3268 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
3269
3270 if (blocksize < EXT4_MIN_BLOCK_SIZE || 3324 if (blocksize < EXT4_MIN_BLOCK_SIZE ||
3271 blocksize > EXT4_MAX_BLOCK_SIZE) { 3325 blocksize > EXT4_MAX_BLOCK_SIZE) {
3272 ext4_msg(sb, KERN_ERR, 3326 ext4_msg(sb, KERN_ERR,
@@ -3369,12 +3423,53 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3369 sb->s_dirt = 1; 3423 sb->s_dirt = 1;
3370 } 3424 }
3371 3425
3372 if (sbi->s_blocks_per_group > blocksize * 8) { 3426 /* Handle clustersize */
3373 ext4_msg(sb, KERN_ERR, 3427 clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
3374 "#blocks per group too big: %lu", 3428 has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb,
3375 sbi->s_blocks_per_group); 3429 EXT4_FEATURE_RO_COMPAT_BIGALLOC);
3376 goto failed_mount; 3430 if (has_bigalloc) {
3431 if (clustersize < blocksize) {
3432 ext4_msg(sb, KERN_ERR,
3433 "cluster size (%d) smaller than "
3434 "block size (%d)", clustersize, blocksize);
3435 goto failed_mount;
3436 }
3437 sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
3438 le32_to_cpu(es->s_log_block_size);
3439 sbi->s_clusters_per_group =
3440 le32_to_cpu(es->s_clusters_per_group);
3441 if (sbi->s_clusters_per_group > blocksize * 8) {
3442 ext4_msg(sb, KERN_ERR,
3443 "#clusters per group too big: %lu",
3444 sbi->s_clusters_per_group);
3445 goto failed_mount;
3446 }
3447 if (sbi->s_blocks_per_group !=
3448 (sbi->s_clusters_per_group * (clustersize / blocksize))) {
3449 ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
3450 "clusters per group (%lu) inconsistent",
3451 sbi->s_blocks_per_group,
3452 sbi->s_clusters_per_group);
3453 goto failed_mount;
3454 }
3455 } else {
3456 if (clustersize != blocksize) {
3457 ext4_warning(sb, "fragment/cluster size (%d) != "
3458 "block size (%d)", clustersize,
3459 blocksize);
3460 clustersize = blocksize;
3461 }
3462 if (sbi->s_blocks_per_group > blocksize * 8) {
3463 ext4_msg(sb, KERN_ERR,
3464 "#blocks per group too big: %lu",
3465 sbi->s_blocks_per_group);
3466 goto failed_mount;
3467 }
3468 sbi->s_clusters_per_group = sbi->s_blocks_per_group;
3469 sbi->s_cluster_bits = 0;
3377 } 3470 }
3471 sbi->s_cluster_ratio = clustersize / blocksize;
3472
3378 if (sbi->s_inodes_per_group > blocksize * 8) { 3473 if (sbi->s_inodes_per_group > blocksize * 8) {
3379 ext4_msg(sb, KERN_ERR, 3474 ext4_msg(sb, KERN_ERR,
3380 "#inodes per group too big: %lu", 3475 "#inodes per group too big: %lu",
@@ -3446,10 +3541,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3446 goto failed_mount; 3541 goto failed_mount;
3447 } 3542 }
3448 3543
3449#ifdef CONFIG_PROC_FS
3450 if (ext4_proc_root) 3544 if (ext4_proc_root)
3451 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); 3545 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
3452#endif
3453 3546
3454 bgl_lock_init(sbi->s_blockgroup_lock); 3547 bgl_lock_init(sbi->s_blockgroup_lock);
3455 3548
@@ -3483,8 +3576,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3483 sbi->s_err_report.function = print_daily_error_info; 3576 sbi->s_err_report.function = print_daily_error_info;
3484 sbi->s_err_report.data = (unsigned long) sb; 3577 sbi->s_err_report.data = (unsigned long) sb;
3485 3578
3486 err = percpu_counter_init(&sbi->s_freeblocks_counter, 3579 err = percpu_counter_init(&sbi->s_freeclusters_counter,
3487 ext4_count_free_blocks(sb)); 3580 ext4_count_free_clusters(sb));
3488 if (!err) { 3581 if (!err) {
3489 err = percpu_counter_init(&sbi->s_freeinodes_counter, 3582 err = percpu_counter_init(&sbi->s_freeinodes_counter,
3490 ext4_count_free_inodes(sb)); 3583 ext4_count_free_inodes(sb));
@@ -3494,7 +3587,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3494 ext4_count_dirs(sb)); 3587 ext4_count_dirs(sb));
3495 } 3588 }
3496 if (!err) { 3589 if (!err) {
3497 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); 3590 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0);
3498 } 3591 }
3499 if (err) { 3592 if (err) {
3500 ext4_msg(sb, KERN_ERR, "insufficient memory"); 3593 ext4_msg(sb, KERN_ERR, "insufficient memory");
@@ -3609,13 +3702,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3609 * The journal may have updated the bg summary counts, so we 3702 * The journal may have updated the bg summary counts, so we
3610 * need to update the global counters. 3703 * need to update the global counters.
3611 */ 3704 */
3612 percpu_counter_set(&sbi->s_freeblocks_counter, 3705 percpu_counter_set(&sbi->s_freeclusters_counter,
3613 ext4_count_free_blocks(sb)); 3706 ext4_count_free_clusters(sb));
3614 percpu_counter_set(&sbi->s_freeinodes_counter, 3707 percpu_counter_set(&sbi->s_freeinodes_counter,
3615 ext4_count_free_inodes(sb)); 3708 ext4_count_free_inodes(sb));
3616 percpu_counter_set(&sbi->s_dirs_counter, 3709 percpu_counter_set(&sbi->s_dirs_counter,
3617 ext4_count_dirs(sb)); 3710 ext4_count_dirs(sb));
3618 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); 3711 percpu_counter_set(&sbi->s_dirtyclusters_counter, 0);
3619 3712
3620no_journal: 3713no_journal:
3621 /* 3714 /*
@@ -3679,25 +3772,6 @@ no_journal:
3679 "available"); 3772 "available");
3680 } 3773 }
3681 3774
3682 if (test_opt(sb, DELALLOC) &&
3683 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
3684 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
3685 "requested data journaling mode");
3686 clear_opt(sb, DELALLOC);
3687 }
3688 if (test_opt(sb, DIOREAD_NOLOCK)) {
3689 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
3690 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3691 "option - requested data journaling mode");
3692 clear_opt(sb, DIOREAD_NOLOCK);
3693 }
3694 if (sb->s_blocksize < PAGE_SIZE) {
3695 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3696 "option - block size is too small");
3697 clear_opt(sb, DIOREAD_NOLOCK);
3698 }
3699 }
3700
3701 err = ext4_setup_system_zone(sb); 3775 err = ext4_setup_system_zone(sb);
3702 if (err) { 3776 if (err) {
3703 ext4_msg(sb, KERN_ERR, "failed to initialize system " 3777 ext4_msg(sb, KERN_ERR, "failed to initialize system "
@@ -3710,22 +3784,19 @@ no_journal:
3710 if (err) { 3784 if (err) {
3711 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", 3785 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
3712 err); 3786 err);
3713 goto failed_mount4; 3787 goto failed_mount5;
3714 } 3788 }
3715 3789
3716 err = ext4_register_li_request(sb, first_not_zeroed); 3790 err = ext4_register_li_request(sb, first_not_zeroed);
3717 if (err) 3791 if (err)
3718 goto failed_mount4; 3792 goto failed_mount6;
3719 3793
3720 sbi->s_kobj.kset = ext4_kset; 3794 sbi->s_kobj.kset = ext4_kset;
3721 init_completion(&sbi->s_kobj_unregister); 3795 init_completion(&sbi->s_kobj_unregister);
3722 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, 3796 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
3723 "%s", sb->s_id); 3797 "%s", sb->s_id);
3724 if (err) { 3798 if (err)
3725 ext4_mb_release(sb); 3799 goto failed_mount7;
3726 ext4_ext_release(sb);
3727 goto failed_mount4;
3728 };
3729 3800
3730 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; 3801 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
3731 ext4_orphan_cleanup(sb, es); 3802 ext4_orphan_cleanup(sb, es);
@@ -3759,13 +3830,19 @@ cantfind_ext4:
3759 ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); 3830 ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
3760 goto failed_mount; 3831 goto failed_mount;
3761 3832
3833failed_mount7:
3834 ext4_unregister_li_request(sb);
3835failed_mount6:
3836 ext4_ext_release(sb);
3837failed_mount5:
3838 ext4_mb_release(sb);
3839 ext4_release_system_zone(sb);
3762failed_mount4: 3840failed_mount4:
3763 iput(root); 3841 iput(root);
3764 sb->s_root = NULL; 3842 sb->s_root = NULL;
3765 ext4_msg(sb, KERN_ERR, "mount failed"); 3843 ext4_msg(sb, KERN_ERR, "mount failed");
3766 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); 3844 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
3767failed_mount_wq: 3845failed_mount_wq:
3768 ext4_release_system_zone(sb);
3769 if (sbi->s_journal) { 3846 if (sbi->s_journal) {
3770 jbd2_journal_destroy(sbi->s_journal); 3847 jbd2_journal_destroy(sbi->s_journal);
3771 sbi->s_journal = NULL; 3848 sbi->s_journal = NULL;
@@ -3774,10 +3851,10 @@ failed_mount3:
3774 del_timer(&sbi->s_err_report); 3851 del_timer(&sbi->s_err_report);
3775 if (sbi->s_flex_groups) 3852 if (sbi->s_flex_groups)
3776 ext4_kvfree(sbi->s_flex_groups); 3853 ext4_kvfree(sbi->s_flex_groups);
3777 percpu_counter_destroy(&sbi->s_freeblocks_counter); 3854 percpu_counter_destroy(&sbi->s_freeclusters_counter);
3778 percpu_counter_destroy(&sbi->s_freeinodes_counter); 3855 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3779 percpu_counter_destroy(&sbi->s_dirs_counter); 3856 percpu_counter_destroy(&sbi->s_dirs_counter);
3780 percpu_counter_destroy(&sbi->s_dirtyblocks_counter); 3857 percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
3781 if (sbi->s_mmp_tsk) 3858 if (sbi->s_mmp_tsk)
3782 kthread_stop(sbi->s_mmp_tsk); 3859 kthread_stop(sbi->s_mmp_tsk);
3783failed_mount2: 3860failed_mount2:
@@ -4064,7 +4141,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
4064 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; 4141 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
4065 int error = 0; 4142 int error = 0;
4066 4143
4067 if (!sbh) 4144 if (!sbh || block_device_ejected(sb))
4068 return error; 4145 return error;
4069 if (buffer_write_io_error(sbh)) { 4146 if (buffer_write_io_error(sbh)) {
4070 /* 4147 /*
@@ -4100,8 +4177,9 @@ static int ext4_commit_super(struct super_block *sb, int sync)
4100 else 4177 else
4101 es->s_kbytes_written = 4178 es->s_kbytes_written =
4102 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); 4179 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
4103 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 4180 ext4_free_blocks_count_set(es,
4104 &EXT4_SB(sb)->s_freeblocks_counter)); 4181 EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
4182 &EXT4_SB(sb)->s_freeclusters_counter)));
4105 es->s_free_inodes_count = 4183 es->s_free_inodes_count =
4106 cpu_to_le32(percpu_counter_sum_positive( 4184 cpu_to_le32(percpu_counter_sum_positive(
4107 &EXT4_SB(sb)->s_freeinodes_counter)); 4185 &EXT4_SB(sb)->s_freeinodes_counter));
@@ -4506,16 +4584,34 @@ restore_opts:
4506 return err; 4584 return err;
4507} 4585}
4508 4586
4587/*
4588 * Note: calculating the overhead so we can be compatible with
4589 * historical BSD practice is quite difficult in the face of
4590 * clusters/bigalloc. This is because multiple metadata blocks from
4591 * different block group can end up in the same allocation cluster.
4592 * Calculating the exact overhead in the face of clustered allocation
4593 * requires either O(all block bitmaps) in memory or O(number of block
4594 * groups**2) in time. We will still calculate the superblock for
4595 * older file systems --- and if we come across with a bigalloc file
4596 * system with zero in s_overhead_clusters the estimate will be close to
4597 * correct especially for very large cluster sizes --- but for newer
4598 * file systems, it's better to calculate this figure once at mkfs
4599 * time, and store it in the superblock. If the superblock value is
4600 * present (even for non-bigalloc file systems), we will use it.
4601 */
4509static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) 4602static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4510{ 4603{
4511 struct super_block *sb = dentry->d_sb; 4604 struct super_block *sb = dentry->d_sb;
4512 struct ext4_sb_info *sbi = EXT4_SB(sb); 4605 struct ext4_sb_info *sbi = EXT4_SB(sb);
4513 struct ext4_super_block *es = sbi->s_es; 4606 struct ext4_super_block *es = sbi->s_es;
4607 struct ext4_group_desc *gdp;
4514 u64 fsid; 4608 u64 fsid;
4515 s64 bfree; 4609 s64 bfree;
4516 4610
4517 if (test_opt(sb, MINIX_DF)) { 4611 if (test_opt(sb, MINIX_DF)) {
4518 sbi->s_overhead_last = 0; 4612 sbi->s_overhead_last = 0;
4613 } else if (es->s_overhead_clusters) {
4614 sbi->s_overhead_last = le32_to_cpu(es->s_overhead_clusters);
4519 } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { 4615 } else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
4520 ext4_group_t i, ngroups = ext4_get_groups_count(sb); 4616 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
4521 ext4_fsblk_t overhead = 0; 4617 ext4_fsblk_t overhead = 0;
@@ -4530,24 +4626,16 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4530 * All of the blocks before first_data_block are 4626 * All of the blocks before first_data_block are
4531 * overhead 4627 * overhead
4532 */ 4628 */
4533 overhead = le32_to_cpu(es->s_first_data_block); 4629 overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));
4534 4630
4535 /* 4631 /*
4536 * Add the overhead attributed to the superblock and 4632 * Add the overhead found in each block group
4537 * block group descriptors. If the sparse superblocks
4538 * feature is turned on, then not all groups have this.
4539 */ 4633 */
4540 for (i = 0; i < ngroups; i++) { 4634 for (i = 0; i < ngroups; i++) {
4541 overhead += ext4_bg_has_super(sb, i) + 4635 gdp = ext4_get_group_desc(sb, i, NULL);
4542 ext4_bg_num_gdb(sb, i); 4636 overhead += ext4_num_overhead_clusters(sb, i, gdp);
4543 cond_resched(); 4637 cond_resched();
4544 } 4638 }
4545
4546 /*
4547 * Every block group has an inode bitmap, a block
4548 * bitmap, and an inode table.
4549 */
4550 overhead += ngroups * (2 + sbi->s_itb_per_group);
4551 sbi->s_overhead_last = overhead; 4639 sbi->s_overhead_last = overhead;
4552 smp_wmb(); 4640 smp_wmb();
4553 sbi->s_blocks_last = ext4_blocks_count(es); 4641 sbi->s_blocks_last = ext4_blocks_count(es);
@@ -4555,11 +4643,12 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4555 4643
4556 buf->f_type = EXT4_SUPER_MAGIC; 4644 buf->f_type = EXT4_SUPER_MAGIC;
4557 buf->f_bsize = sb->s_blocksize; 4645 buf->f_bsize = sb->s_blocksize;
4558 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; 4646 buf->f_blocks = (ext4_blocks_count(es) -
4559 bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - 4647 EXT4_C2B(sbi, sbi->s_overhead_last));
4560 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); 4648 bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
4649 percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
4561 /* prevent underflow in case that few free space is available */ 4650 /* prevent underflow in case that few free space is available */
4562 buf->f_bfree = max_t(s64, bfree, 0); 4651 buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
4563 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 4652 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
4564 if (buf->f_bfree < ext4_r_blocks_count(es)) 4653 if (buf->f_bfree < ext4_r_blocks_count(es))
4565 buf->f_bavail = 0; 4654 buf->f_bavail = 0;
@@ -4980,13 +5069,11 @@ static int __init ext4_init_fs(void)
4980 return err; 5069 return err;
4981 err = ext4_init_system_zone(); 5070 err = ext4_init_system_zone();
4982 if (err) 5071 if (err)
4983 goto out7; 5072 goto out6;
4984 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); 5073 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
4985 if (!ext4_kset) 5074 if (!ext4_kset)
4986 goto out6;
4987 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
4988 if (!ext4_proc_root)
4989 goto out5; 5075 goto out5;
5076 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
4990 5077
4991 err = ext4_init_feat_adverts(); 5078 err = ext4_init_feat_adverts();
4992 if (err) 5079 if (err)
@@ -5022,12 +5109,12 @@ out2:
5022out3: 5109out3:
5023 ext4_exit_feat_adverts(); 5110 ext4_exit_feat_adverts();
5024out4: 5111out4:
5025 remove_proc_entry("fs/ext4", NULL); 5112 if (ext4_proc_root)
5026out5: 5113 remove_proc_entry("fs/ext4", NULL);
5027 kset_unregister(ext4_kset); 5114 kset_unregister(ext4_kset);
5028out6: 5115out5:
5029 ext4_exit_system_zone(); 5116 ext4_exit_system_zone();
5030out7: 5117out6:
5031 ext4_exit_pageio(); 5118 ext4_exit_pageio();
5032 return err; 5119 return err;
5033} 5120}
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index c757adc9725..93a00d89a22 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -820,8 +820,14 @@ inserted:
820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
822 822
823 /*
824 * take i_data_sem because we will test
825 * i_delalloc_reserved_flag in ext4_mb_new_blocks
826 */
827 down_read((&EXT4_I(inode)->i_data_sem));
823 block = ext4_new_meta_blocks(handle, inode, goal, 0, 828 block = ext4_new_meta_blocks(handle, inode, goal, 0,
824 NULL, &error); 829 NULL, &error);
830 up_read((&EXT4_I(inode)->i_data_sem));
825 if (error) 831 if (error)
826 goto cleanup; 832 goto cleanup;
827 833
@@ -985,11 +991,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
985 no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); 991 no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
986 ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); 992 ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
987 993
988 error = ext4_get_inode_loc(inode, &is.iloc); 994 error = ext4_reserve_inode_write(handle, inode, &is.iloc);
989 if (error)
990 goto cleanup;
991
992 error = ext4_journal_get_write_access(handle, is.iloc.bh);
993 if (error) 995 if (error)
994 goto cleanup; 996 goto cleanup;
995 997
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 5efbd5d7701..aca191bd5f8 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -156,8 +156,8 @@ static int uni16_to_x8(struct super_block *sb, unsigned char *ascii,
156 } else { 156 } else {
157 if (uni_xlate == 1) { 157 if (uni_xlate == 1) {
158 *op++ = ':'; 158 *op++ = ':';
159 op = pack_hex_byte(op, ec >> 8); 159 op = hex_byte_pack(op, ec >> 8);
160 op = pack_hex_byte(op, ec); 160 op = hex_byte_pack(op, ec);
161 len -= 5; 161 len -= 5;
162 } else { 162 } else {
163 *op++ = '?'; 163 *op++ = '?';
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index a5d3853822e..1510a4d5199 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -326,15 +326,14 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
326extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, 326extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
327 struct inode *i2); 327 struct inode *i2);
328/* fat/misc.c */ 328/* fat/misc.c */
329extern void 329extern __printf(3, 4) __cold
330__fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) 330void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...);
331 __attribute__ ((format (printf, 3, 4))) __cold;
332#define fat_fs_error(sb, fmt, args...) \ 331#define fat_fs_error(sb, fmt, args...) \
333 __fat_fs_error(sb, 1, fmt , ## args) 332 __fat_fs_error(sb, 1, fmt , ## args)
334#define fat_fs_error_ratelimit(sb, fmt, args...) \ 333#define fat_fs_error_ratelimit(sb, fmt, args...) \
335 __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args) 334 __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args)
336void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) 335__printf(3, 4) __cold
337 __attribute__ ((format (printf, 3, 4))) __cold; 336void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...);
338extern int fat_clusters_flush(struct super_block *sb); 337extern int fat_clusters_flush(struct super_block *sb);
339extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); 338extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
340extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, 339extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 1726d730304..808cac7edcf 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -379,7 +379,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
379 return error; 379 return error;
380 MSDOS_I(inode)->mmu_private = inode->i_size; 380 MSDOS_I(inode)->mmu_private = inode->i_size;
381 381
382 inode->i_nlink = fat_subdirs(inode); 382 set_nlink(inode, fat_subdirs(inode));
383 } else { /* not a directory */ 383 } else { /* not a directory */
384 inode->i_generation |= 1; 384 inode->i_generation |= 1;
385 inode->i_mode = fat_make_mode(sbi, de->attr, 385 inode->i_mode = fat_make_mode(sbi, de->attr,
@@ -1233,7 +1233,7 @@ static int fat_read_root(struct inode *inode)
1233 fat_save_attrs(inode, ATTR_DIR); 1233 fat_save_attrs(inode, ATTR_DIR);
1234 inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0; 1234 inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0;
1235 inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0; 1235 inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0;
1236 inode->i_nlink = fat_subdirs(inode)+2; 1236 set_nlink(inode, fat_subdirs(inode)+2);
1237 1237
1238 return 0; 1238 return 0;
1239} 1239}
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 66e83b84545..216b419f30e 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -387,7 +387,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
387 /* the directory was completed, just return a error */ 387 /* the directory was completed, just return a error */
388 goto out; 388 goto out;
389 } 389 }
390 inode->i_nlink = 2; 390 set_nlink(inode, 2);
391 inode->i_mtime = inode->i_atime = inode->i_ctime = ts; 391 inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
392 /* timestamp is already written, so mark_inode_dirty() is unneeded. */ 392 /* timestamp is already written, so mark_inode_dirty() is unneeded. */
393 393
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index bb3f29c3557..a87a65663c2 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -900,7 +900,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
900 goto out; 900 goto out;
901 } 901 }
902 inode->i_version++; 902 inode->i_version++;
903 inode->i_nlink = 2; 903 set_nlink(inode, 2);
904 inode->i_mtime = inode->i_atime = inode->i_ctime = ts; 904 inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
905 /* timestamp is already written, so mark_inode_dirty() is unneeded. */ 905 /* timestamp is already written, so mark_inode_dirty() is unneeded. */
906 906
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 1a4311437a8..7b2af5abe2f 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -227,7 +227,7 @@ vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip)
227 ip->i_uid = (uid_t)vip->vii_uid; 227 ip->i_uid = (uid_t)vip->vii_uid;
228 ip->i_gid = (gid_t)vip->vii_gid; 228 ip->i_gid = (gid_t)vip->vii_gid;
229 229
230 ip->i_nlink = vip->vii_nlink; 230 set_nlink(ip, vip->vii_nlink);
231 ip->i_size = vip->vii_size; 231 ip->i_size = vip->vii_size;
232 232
233 ip->i_atime.tv_sec = vip->vii_atime; 233 ip->i_atime.tv_sec = vip->vii_atime;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 04cf3b91e50..73c3992b2bb 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -41,11 +41,23 @@ struct wb_writeback_work {
41 unsigned int for_kupdate:1; 41 unsigned int for_kupdate:1;
42 unsigned int range_cyclic:1; 42 unsigned int range_cyclic:1;
43 unsigned int for_background:1; 43 unsigned int for_background:1;
44 enum wb_reason reason; /* why was writeback initiated? */
44 45
45 struct list_head list; /* pending work list */ 46 struct list_head list; /* pending work list */
46 struct completion *done; /* set if the caller waits */ 47 struct completion *done; /* set if the caller waits */
47}; 48};
48 49
50const char *wb_reason_name[] = {
51 [WB_REASON_BACKGROUND] = "background",
52 [WB_REASON_TRY_TO_FREE_PAGES] = "try_to_free_pages",
53 [WB_REASON_SYNC] = "sync",
54 [WB_REASON_PERIODIC] = "periodic",
55 [WB_REASON_LAPTOP_TIMER] = "laptop_timer",
56 [WB_REASON_FREE_MORE_MEM] = "free_more_memory",
57 [WB_REASON_FS_FREE_SPACE] = "fs_free_space",
58 [WB_REASON_FORKER_THREAD] = "forker_thread"
59};
60
49/* 61/*
50 * Include the creation of the trace points after defining the 62 * Include the creation of the trace points after defining the
51 * wb_writeback_work structure so that the definition remains local to this 63 * wb_writeback_work structure so that the definition remains local to this
@@ -115,7 +127,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
115 127
116static void 128static void
117__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, 129__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
118 bool range_cyclic) 130 bool range_cyclic, enum wb_reason reason)
119{ 131{
120 struct wb_writeback_work *work; 132 struct wb_writeback_work *work;
121 133
@@ -135,6 +147,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
135 work->sync_mode = WB_SYNC_NONE; 147 work->sync_mode = WB_SYNC_NONE;
136 work->nr_pages = nr_pages; 148 work->nr_pages = nr_pages;
137 work->range_cyclic = range_cyclic; 149 work->range_cyclic = range_cyclic;
150 work->reason = reason;
138 151
139 bdi_queue_work(bdi, work); 152 bdi_queue_work(bdi, work);
140} 153}
@@ -150,9 +163,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
150 * completion. Caller need not hold sb s_umount semaphore. 163 * completion. Caller need not hold sb s_umount semaphore.
151 * 164 *
152 */ 165 */
153void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) 166void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
167 enum wb_reason reason)
154{ 168{
155 __bdi_start_writeback(bdi, nr_pages, true); 169 __bdi_start_writeback(bdi, nr_pages, true, reason);
156} 170}
157 171
158/** 172/**
@@ -251,7 +265,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
251 */ 265 */
252static int move_expired_inodes(struct list_head *delaying_queue, 266static int move_expired_inodes(struct list_head *delaying_queue,
253 struct list_head *dispatch_queue, 267 struct list_head *dispatch_queue,
254 unsigned long *older_than_this) 268 struct wb_writeback_work *work)
255{ 269{
256 LIST_HEAD(tmp); 270 LIST_HEAD(tmp);
257 struct list_head *pos, *node; 271 struct list_head *pos, *node;
@@ -262,8 +276,8 @@ static int move_expired_inodes(struct list_head *delaying_queue,
262 276
263 while (!list_empty(delaying_queue)) { 277 while (!list_empty(delaying_queue)) {
264 inode = wb_inode(delaying_queue->prev); 278 inode = wb_inode(delaying_queue->prev);
265 if (older_than_this && 279 if (work->older_than_this &&
266 inode_dirtied_after(inode, *older_than_this)) 280 inode_dirtied_after(inode, *work->older_than_this))
267 break; 281 break;
268 if (sb && sb != inode->i_sb) 282 if (sb && sb != inode->i_sb)
269 do_sb_sort = 1; 283 do_sb_sort = 1;
@@ -302,13 +316,13 @@ out:
302 * | 316 * |
303 * +--> dequeue for IO 317 * +--> dequeue for IO
304 */ 318 */
305static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 319static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
306{ 320{
307 int moved; 321 int moved;
308 assert_spin_locked(&wb->list_lock); 322 assert_spin_locked(&wb->list_lock);
309 list_splice_init(&wb->b_more_io, &wb->b_io); 323 list_splice_init(&wb->b_more_io, &wb->b_io);
310 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 324 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
311 trace_writeback_queue_io(wb, older_than_this, moved); 325 trace_writeback_queue_io(wb, work, moved);
312} 326}
313 327
314static int write_inode(struct inode *inode, struct writeback_control *wbc) 328static int write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -641,31 +655,40 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
641 return wrote; 655 return wrote;
642} 656}
643 657
644long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) 658long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
659 enum wb_reason reason)
645{ 660{
646 struct wb_writeback_work work = { 661 struct wb_writeback_work work = {
647 .nr_pages = nr_pages, 662 .nr_pages = nr_pages,
648 .sync_mode = WB_SYNC_NONE, 663 .sync_mode = WB_SYNC_NONE,
649 .range_cyclic = 1, 664 .range_cyclic = 1,
665 .reason = reason,
650 }; 666 };
651 667
652 spin_lock(&wb->list_lock); 668 spin_lock(&wb->list_lock);
653 if (list_empty(&wb->b_io)) 669 if (list_empty(&wb->b_io))
654 queue_io(wb, NULL); 670 queue_io(wb, &work);
655 __writeback_inodes_wb(wb, &work); 671 __writeback_inodes_wb(wb, &work);
656 spin_unlock(&wb->list_lock); 672 spin_unlock(&wb->list_lock);
657 673
658 return nr_pages - work.nr_pages; 674 return nr_pages - work.nr_pages;
659} 675}
660 676
661static inline bool over_bground_thresh(void) 677static bool over_bground_thresh(struct backing_dev_info *bdi)
662{ 678{
663 unsigned long background_thresh, dirty_thresh; 679 unsigned long background_thresh, dirty_thresh;
664 680
665 global_dirty_limits(&background_thresh, &dirty_thresh); 681 global_dirty_limits(&background_thresh, &dirty_thresh);
666 682
667 return (global_page_state(NR_FILE_DIRTY) + 683 if (global_page_state(NR_FILE_DIRTY) +
668 global_page_state(NR_UNSTABLE_NFS) > background_thresh); 684 global_page_state(NR_UNSTABLE_NFS) > background_thresh)
685 return true;
686
687 if (bdi_stat(bdi, BDI_RECLAIMABLE) >
688 bdi_dirty_limit(bdi, background_thresh))
689 return true;
690
691 return false;
669} 692}
670 693
671/* 694/*
@@ -675,7 +698,7 @@ static inline bool over_bground_thresh(void)
675static void wb_update_bandwidth(struct bdi_writeback *wb, 698static void wb_update_bandwidth(struct bdi_writeback *wb,
676 unsigned long start_time) 699 unsigned long start_time)
677{ 700{
678 __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time); 701 __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
679} 702}
680 703
681/* 704/*
@@ -727,7 +750,7 @@ static long wb_writeback(struct bdi_writeback *wb,
727 * For background writeout, stop when we are below the 750 * For background writeout, stop when we are below the
728 * background dirty threshold 751 * background dirty threshold
729 */ 752 */
730 if (work->for_background && !over_bground_thresh()) 753 if (work->for_background && !over_bground_thresh(wb->bdi))
731 break; 754 break;
732 755
733 if (work->for_kupdate) { 756 if (work->for_kupdate) {
@@ -738,7 +761,7 @@ static long wb_writeback(struct bdi_writeback *wb,
738 761
739 trace_writeback_start(wb->bdi, work); 762 trace_writeback_start(wb->bdi, work);
740 if (list_empty(&wb->b_io)) 763 if (list_empty(&wb->b_io))
741 queue_io(wb, work->older_than_this); 764 queue_io(wb, work);
742 if (work->sb) 765 if (work->sb)
743 progress = writeback_sb_inodes(work->sb, wb, work); 766 progress = writeback_sb_inodes(work->sb, wb, work);
744 else 767 else
@@ -811,13 +834,14 @@ static unsigned long get_nr_dirty_pages(void)
811 834
812static long wb_check_background_flush(struct bdi_writeback *wb) 835static long wb_check_background_flush(struct bdi_writeback *wb)
813{ 836{
814 if (over_bground_thresh()) { 837 if (over_bground_thresh(wb->bdi)) {
815 838
816 struct wb_writeback_work work = { 839 struct wb_writeback_work work = {
817 .nr_pages = LONG_MAX, 840 .nr_pages = LONG_MAX,
818 .sync_mode = WB_SYNC_NONE, 841 .sync_mode = WB_SYNC_NONE,
819 .for_background = 1, 842 .for_background = 1,
820 .range_cyclic = 1, 843 .range_cyclic = 1,
844 .reason = WB_REASON_BACKGROUND,
821 }; 845 };
822 846
823 return wb_writeback(wb, &work); 847 return wb_writeback(wb, &work);
@@ -851,6 +875,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
851 .sync_mode = WB_SYNC_NONE, 875 .sync_mode = WB_SYNC_NONE,
852 .for_kupdate = 1, 876 .for_kupdate = 1,
853 .range_cyclic = 1, 877 .range_cyclic = 1,
878 .reason = WB_REASON_PERIODIC,
854 }; 879 };
855 880
856 return wb_writeback(wb, &work); 881 return wb_writeback(wb, &work);
@@ -969,7 +994,7 @@ int bdi_writeback_thread(void *data)
969 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back 994 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
970 * the whole world. 995 * the whole world.
971 */ 996 */
972void wakeup_flusher_threads(long nr_pages) 997void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
973{ 998{
974 struct backing_dev_info *bdi; 999 struct backing_dev_info *bdi;
975 1000
@@ -982,7 +1007,7 @@ void wakeup_flusher_threads(long nr_pages)
982 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 1007 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
983 if (!bdi_has_dirty_io(bdi)) 1008 if (!bdi_has_dirty_io(bdi))
984 continue; 1009 continue;
985 __bdi_start_writeback(bdi, nr_pages, false); 1010 __bdi_start_writeback(bdi, nr_pages, false, reason);
986 } 1011 }
987 rcu_read_unlock(); 1012 rcu_read_unlock();
988} 1013}
@@ -1203,7 +1228,9 @@ static void wait_sb_inodes(struct super_block *sb)
1203 * on how many (if any) will be written, and this function does not wait 1228 * on how many (if any) will be written, and this function does not wait
1204 * for IO completion of submitted IO. 1229 * for IO completion of submitted IO.
1205 */ 1230 */
1206void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) 1231void writeback_inodes_sb_nr(struct super_block *sb,
1232 unsigned long nr,
1233 enum wb_reason reason)
1207{ 1234{
1208 DECLARE_COMPLETION_ONSTACK(done); 1235 DECLARE_COMPLETION_ONSTACK(done);
1209 struct wb_writeback_work work = { 1236 struct wb_writeback_work work = {
@@ -1212,6 +1239,7 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
1212 .tagged_writepages = 1, 1239 .tagged_writepages = 1,
1213 .done = &done, 1240 .done = &done,
1214 .nr_pages = nr, 1241 .nr_pages = nr,
1242 .reason = reason,
1215 }; 1243 };
1216 1244
1217 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1245 WARN_ON(!rwsem_is_locked(&sb->s_umount));
@@ -1228,9 +1256,9 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr);
1228 * on how many (if any) will be written, and this function does not wait 1256 * on how many (if any) will be written, and this function does not wait
1229 * for IO completion of submitted IO. 1257 * for IO completion of submitted IO.
1230 */ 1258 */
1231void writeback_inodes_sb(struct super_block *sb) 1259void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
1232{ 1260{
1233 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages()); 1261 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
1234} 1262}
1235EXPORT_SYMBOL(writeback_inodes_sb); 1263EXPORT_SYMBOL(writeback_inodes_sb);
1236 1264
@@ -1241,11 +1269,11 @@ EXPORT_SYMBOL(writeback_inodes_sb);
1241 * Invoke writeback_inodes_sb if no writeback is currently underway. 1269 * Invoke writeback_inodes_sb if no writeback is currently underway.
1242 * Returns 1 if writeback was started, 0 if not. 1270 * Returns 1 if writeback was started, 0 if not.
1243 */ 1271 */
1244int writeback_inodes_sb_if_idle(struct super_block *sb) 1272int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason)
1245{ 1273{
1246 if (!writeback_in_progress(sb->s_bdi)) { 1274 if (!writeback_in_progress(sb->s_bdi)) {
1247 down_read(&sb->s_umount); 1275 down_read(&sb->s_umount);
1248 writeback_inodes_sb(sb); 1276 writeback_inodes_sb(sb, reason);
1249 up_read(&sb->s_umount); 1277 up_read(&sb->s_umount);
1250 return 1; 1278 return 1;
1251 } else 1279 } else
@@ -1262,11 +1290,12 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
1262 * Returns 1 if writeback was started, 0 if not. 1290 * Returns 1 if writeback was started, 0 if not.
1263 */ 1291 */
1264int writeback_inodes_sb_nr_if_idle(struct super_block *sb, 1292int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
1265 unsigned long nr) 1293 unsigned long nr,
1294 enum wb_reason reason)
1266{ 1295{
1267 if (!writeback_in_progress(sb->s_bdi)) { 1296 if (!writeback_in_progress(sb->s_bdi)) {
1268 down_read(&sb->s_umount); 1297 down_read(&sb->s_umount);
1269 writeback_inodes_sb_nr(sb, nr); 1298 writeback_inodes_sb_nr(sb, nr, reason);
1270 up_read(&sb->s_umount); 1299 up_read(&sb->s_umount);
1271 return 1; 1300 return 1;
1272 } else 1301 } else
@@ -1290,6 +1319,7 @@ void sync_inodes_sb(struct super_block *sb)
1290 .nr_pages = LONG_MAX, 1319 .nr_pages = LONG_MAX,
1291 .range_cyclic = 0, 1320 .range_cyclic = 0,
1292 .done = &done, 1321 .done = &done,
1322 .reason = WB_REASON_SYNC,
1293 }; 1323 };
1294 1324
1295 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1325 WARN_ON(!rwsem_is_locked(&sb->s_umount));
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 85542a7daf4..42593c587d4 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -231,7 +231,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
231 if (iop) 231 if (iop)
232 inode->i_op = iop; 232 inode->i_op = iop;
233 inode->i_fop = fop; 233 inode->i_fop = fop;
234 inode->i_nlink = nlink; 234 set_nlink(inode, nlink);
235 inode->i_private = fc; 235 inode->i_private = fc;
236 d_add(dentry, inode); 236 d_add(dentry, inode);
237 return dentry; 237 return dentry;
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index b6cca47f7b0..3426521f320 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -47,6 +47,7 @@
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/spinlock.h> 48#include <linux/spinlock.h>
49#include <linux/stat.h> 49#include <linux/stat.h>
50#include <linux/module.h>
50 51
51#include "fuse_i.h" 52#include "fuse_i.h"
52 53
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index add96f6ffda..3e6d7275647 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -151,7 +151,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
151 151
152 inode->i_ino = attr->ino; 152 inode->i_ino = attr->ino;
153 inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); 153 inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
154 inode->i_nlink = attr->nlink; 154 set_nlink(inode, attr->nlink);
155 inode->i_uid = attr->uid; 155 inode->i_uid = attr->uid;
156 inode->i_gid = attr->gid; 156 inode->i_gid = attr->gid;
157 inode->i_blocks = attr->blocks; 157 inode->i_blocks = attr->blocks;
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 34501b64bc4..65978d7885c 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -82,7 +82,7 @@ static int gfs2_set_mode(struct inode *inode, umode_t mode)
82 iattr.ia_valid = ATTR_MODE; 82 iattr.ia_valid = ATTR_MODE;
83 iattr.ia_mode = mode; 83 iattr.ia_mode = mode;
84 84
85 error = gfs2_setattr_simple(GFS2_I(inode), &iattr); 85 error = gfs2_setattr_simple(inode, &iattr);
86 } 86 }
87 87
88 return error; 88 return error;
@@ -160,6 +160,7 @@ out:
160 160
161int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) 161int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
162{ 162{
163 struct inode *inode = &ip->i_inode;
163 struct posix_acl *acl; 164 struct posix_acl *acl;
164 char *data; 165 char *data;
165 unsigned int len; 166 unsigned int len;
@@ -169,7 +170,7 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
169 if (IS_ERR(acl)) 170 if (IS_ERR(acl))
170 return PTR_ERR(acl); 171 return PTR_ERR(acl);
171 if (!acl) 172 if (!acl)
172 return gfs2_setattr_simple(ip, attr); 173 return gfs2_setattr_simple(inode, attr);
173 174
174 error = posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode); 175 error = posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode);
175 if (error) 176 if (error)
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index f9fbbe96c22..4858e1fed8b 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -663,7 +663,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
663 if (&ip->i_inode == sdp->sd_rindex) 663 if (&ip->i_inode == sdp->sd_rindex)
664 rblocks += 2 * RES_STATFS; 664 rblocks += 2 * RES_STATFS;
665 if (alloc_required) 665 if (alloc_required)
666 rblocks += gfs2_rg_blocks(al); 666 rblocks += gfs2_rg_blocks(ip);
667 667
668 error = gfs2_trans_begin(sdp, rblocks, 668 error = gfs2_trans_begin(sdp, rblocks,
669 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); 669 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -787,7 +787,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
787 u64 to = pos + copied; 787 u64 to = pos + copied;
788 void *kaddr; 788 void *kaddr;
789 unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); 789 unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
790 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
791 790
792 BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); 791 BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode)));
793 kaddr = kmap_atomic(page, KM_USER0); 792 kaddr = kmap_atomic(page, KM_USER0);
@@ -804,7 +803,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
804 if (copied) { 803 if (copied) {
805 if (inode->i_size < to) 804 if (inode->i_size < to)
806 i_size_write(inode, to); 805 i_size_write(inode, to);
807 gfs2_dinode_out(ip, di);
808 mark_inode_dirty(inode); 806 mark_inode_dirty(inode);
809 } 807 }
810 808
@@ -873,10 +871,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
873 gfs2_page_add_databufs(ip, page, from, to); 871 gfs2_page_add_databufs(ip, page, from, to);
874 872
875 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 873 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
876 if (ret > 0) {
877 gfs2_dinode_out(ip, dibh->b_data);
878 mark_inode_dirty(inode);
879 }
880 874
881 if (inode == sdp->sd_rindex) { 875 if (inode == sdp->sd_rindex) {
882 adjust_fs_space(inode); 876 adjust_fs_space(inode);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 7878c473ae6..41d494d7970 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -10,6 +10,7 @@
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/completion.h> 11#include <linux/completion.h>
12#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
13#include <linux/blkdev.h>
13#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
14#include <linux/crc32.h> 15#include <linux/crc32.h>
15 16
@@ -36,11 +37,6 @@ struct metapath {
36 __u16 mp_list[GFS2_MAX_META_HEIGHT]; 37 __u16 mp_list[GFS2_MAX_META_HEIGHT];
37}; 38};
38 39
39typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh,
40 struct buffer_head *bh, __be64 *top,
41 __be64 *bottom, unsigned int height,
42 void *data);
43
44struct strip_mine { 40struct strip_mine {
45 int sm_first; 41 int sm_first;
46 unsigned int sm_height; 42 unsigned int sm_height;
@@ -273,6 +269,30 @@ static inline __be64 *metapointer(unsigned int height, const struct metapath *mp
273 return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; 269 return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
274} 270}
275 271
272static void gfs2_metapath_ra(struct gfs2_glock *gl,
273 const struct buffer_head *bh, const __be64 *pos)
274{
275 struct buffer_head *rabh;
276 const __be64 *endp = (const __be64 *)(bh->b_data + bh->b_size);
277 const __be64 *t;
278
279 for (t = pos; t < endp; t++) {
280 if (!*t)
281 continue;
282
283 rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
284 if (trylock_buffer(rabh)) {
285 if (!buffer_uptodate(rabh)) {
286 rabh->b_end_io = end_buffer_read_sync;
287 submit_bh(READA | REQ_META, rabh);
288 continue;
289 }
290 unlock_buffer(rabh);
291 }
292 brelse(rabh);
293 }
294}
295
276/** 296/**
277 * lookup_metapath - Walk the metadata tree to a specific point 297 * lookup_metapath - Walk the metadata tree to a specific point
278 * @ip: The inode 298 * @ip: The inode
@@ -432,12 +452,14 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
432{ 452{
433 struct gfs2_inode *ip = GFS2_I(inode); 453 struct gfs2_inode *ip = GFS2_I(inode);
434 struct gfs2_sbd *sdp = GFS2_SB(inode); 454 struct gfs2_sbd *sdp = GFS2_SB(inode);
455 struct super_block *sb = sdp->sd_vfs;
435 struct buffer_head *dibh = mp->mp_bh[0]; 456 struct buffer_head *dibh = mp->mp_bh[0];
436 u64 bn, dblock = 0; 457 u64 bn, dblock = 0;
437 unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; 458 unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
438 unsigned dblks = 0; 459 unsigned dblks = 0;
439 unsigned ptrs_per_blk; 460 unsigned ptrs_per_blk;
440 const unsigned end_of_metadata = height - 1; 461 const unsigned end_of_metadata = height - 1;
462 int ret;
441 int eob = 0; 463 int eob = 0;
442 enum alloc_state state; 464 enum alloc_state state;
443 __be64 *ptr; 465 __be64 *ptr;
@@ -540,6 +562,15 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
540 dblock = bn; 562 dblock = bn;
541 while (n-- > 0) 563 while (n-- > 0)
542 *ptr++ = cpu_to_be64(bn++); 564 *ptr++ = cpu_to_be64(bn++);
565 if (buffer_zeronew(bh_map)) {
566 ret = sb_issue_zeroout(sb, dblock, dblks,
567 GFP_NOFS);
568 if (ret) {
569 fs_err(sdp,
570 "Failed to zero data buffers\n");
571 clear_buffer_zeronew(bh_map);
572 }
573 }
543 break; 574 break;
544 } 575 }
545 } while ((state != ALLOC_DATA) || !dblock); 576 } while ((state != ALLOC_DATA) || !dblock);
@@ -668,76 +699,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
668} 699}
669 700
670/** 701/**
671 * recursive_scan - recursively scan through the end of a file
672 * @ip: the inode
673 * @dibh: the dinode buffer
674 * @mp: the path through the metadata to the point to start
675 * @height: the height the recursion is at
676 * @block: the indirect block to look at
677 * @first: 1 if this is the first block
678 * @bc: the call to make for each piece of metadata
679 * @data: data opaque to this function to pass to @bc
680 *
681 * When this is first called @height and @block should be zero and
682 * @first should be 1.
683 *
684 * Returns: errno
685 */
686
687static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
688 struct metapath *mp, unsigned int height,
689 u64 block, int first, block_call_t bc,
690 void *data)
691{
692 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
693 struct buffer_head *bh = NULL;
694 __be64 *top, *bottom;
695 u64 bn;
696 int error;
697 int mh_size = sizeof(struct gfs2_meta_header);
698
699 if (!height) {
700 error = gfs2_meta_inode_buffer(ip, &bh);
701 if (error)
702 return error;
703 dibh = bh;
704
705 top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
706 bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
707 } else {
708 error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
709 if (error)
710 return error;
711
712 top = (__be64 *)(bh->b_data + mh_size) +
713 (first ? mp->mp_list[height] : 0);
714
715 bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
716 }
717
718 error = bc(ip, dibh, bh, top, bottom, height, data);
719 if (error)
720 goto out;
721
722 if (height < ip->i_height - 1)
723 for (; top < bottom; top++, first = 0) {
724 if (!*top)
725 continue;
726
727 bn = be64_to_cpu(*top);
728
729 error = recursive_scan(ip, dibh, mp, height + 1, bn,
730 first, bc, data);
731 if (error)
732 break;
733 }
734
735out:
736 brelse(bh);
737 return error;
738}
739
740/**
741 * do_strip - Look for a layer a particular layer of the file and strip it off 702 * do_strip - Look for a layer a particular layer of the file and strip it off
742 * @ip: the inode 703 * @ip: the inode
743 * @dibh: the dinode buffer 704 * @dibh: the dinode buffer
@@ -752,9 +713,8 @@ out:
752 713
753static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, 714static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
754 struct buffer_head *bh, __be64 *top, __be64 *bottom, 715 struct buffer_head *bh, __be64 *top, __be64 *bottom,
755 unsigned int height, void *data) 716 unsigned int height, struct strip_mine *sm)
756{ 717{
757 struct strip_mine *sm = data;
758 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 718 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
759 struct gfs2_rgrp_list rlist; 719 struct gfs2_rgrp_list rlist;
760 u64 bn, bstart; 720 u64 bn, bstart;
@@ -783,11 +743,6 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
783 else if (ip->i_depth) 743 else if (ip->i_depth)
784 revokes = sdp->sd_inptrs; 744 revokes = sdp->sd_inptrs;
785 745
786 if (ip != GFS2_I(sdp->sd_rindex))
787 error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
788 else if (!sdp->sd_rgrps)
789 error = gfs2_ri_update(ip);
790
791 if (error) 746 if (error)
792 return error; 747 return error;
793 748
@@ -805,7 +760,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
805 blen++; 760 blen++;
806 else { 761 else {
807 if (bstart) 762 if (bstart)
808 gfs2_rlist_add(sdp, &rlist, bstart); 763 gfs2_rlist_add(ip, &rlist, bstart);
809 764
810 bstart = bn; 765 bstart = bn;
811 blen = 1; 766 blen = 1;
@@ -813,7 +768,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
813 } 768 }
814 769
815 if (bstart) 770 if (bstart)
816 gfs2_rlist_add(sdp, &rlist, bstart); 771 gfs2_rlist_add(ip, &rlist, bstart);
817 else 772 else
818 goto out; /* Nothing to do */ 773 goto out; /* Nothing to do */
819 774
@@ -887,12 +842,82 @@ out_rg_gunlock:
887out_rlist: 842out_rlist:
888 gfs2_rlist_free(&rlist); 843 gfs2_rlist_free(&rlist);
889out: 844out:
890 if (ip != GFS2_I(sdp->sd_rindex))
891 gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
892 return error; 845 return error;
893} 846}
894 847
895/** 848/**
849 * recursive_scan - recursively scan through the end of a file
850 * @ip: the inode
851 * @dibh: the dinode buffer
852 * @mp: the path through the metadata to the point to start
853 * @height: the height the recursion is at
854 * @block: the indirect block to look at
855 * @first: 1 if this is the first block
856 * @sm: data opaque to this function to pass to @bc
857 *
858 * When this is first called @height and @block should be zero and
859 * @first should be 1.
860 *
861 * Returns: errno
862 */
863
864static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
865 struct metapath *mp, unsigned int height,
866 u64 block, int first, struct strip_mine *sm)
867{
868 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
869 struct buffer_head *bh = NULL;
870 __be64 *top, *bottom;
871 u64 bn;
872 int error;
873 int mh_size = sizeof(struct gfs2_meta_header);
874
875 if (!height) {
876 error = gfs2_meta_inode_buffer(ip, &bh);
877 if (error)
878 return error;
879 dibh = bh;
880
881 top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
882 bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
883 } else {
884 error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
885 if (error)
886 return error;
887
888 top = (__be64 *)(bh->b_data + mh_size) +
889 (first ? mp->mp_list[height] : 0);
890
891 bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
892 }
893
894 error = do_strip(ip, dibh, bh, top, bottom, height, sm);
895 if (error)
896 goto out;
897
898 if (height < ip->i_height - 1) {
899
900 gfs2_metapath_ra(ip->i_gl, bh, top);
901
902 for (; top < bottom; top++, first = 0) {
903 if (!*top)
904 continue;
905
906 bn = be64_to_cpu(*top);
907
908 error = recursive_scan(ip, dibh, mp, height + 1, bn,
909 first, sm);
910 if (error)
911 break;
912 }
913 }
914out:
915 brelse(bh);
916 return error;
917}
918
919
920/**
896 * gfs2_block_truncate_page - Deal with zeroing out data for truncate 921 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
897 * 922 *
898 * This is partly borrowed from ext3. 923 * This is partly borrowed from ext3.
@@ -1031,7 +1056,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
1031 sm.sm_first = !!size; 1056 sm.sm_first = !!size;
1032 sm.sm_height = height; 1057 sm.sm_height = height;
1033 1058
1034 error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm); 1059 error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm);
1035 if (error) 1060 if (error)
1036 break; 1061 break;
1037 } 1062 }
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 1cc2f8ec52a..8ccad2467cb 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -240,16 +240,15 @@ fail:
240 return error; 240 return error;
241} 241}
242 242
243static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf, 243static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, __be64 *buf,
244 u64 offset, unsigned int size) 244 unsigned int size)
245{ 245{
246 struct buffer_head *dibh; 246 struct buffer_head *dibh;
247 int error; 247 int error;
248 248
249 error = gfs2_meta_inode_buffer(ip, &dibh); 249 error = gfs2_meta_inode_buffer(ip, &dibh);
250 if (!error) { 250 if (!error) {
251 offset += sizeof(struct gfs2_dinode); 251 memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size);
252 memcpy(buf, dibh->b_data + offset, size);
253 brelse(dibh); 252 brelse(dibh);
254 } 253 }
255 254
@@ -261,13 +260,12 @@ static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
261 * gfs2_dir_read_data - Read a data from a directory inode 260 * gfs2_dir_read_data - Read a data from a directory inode
262 * @ip: The GFS2 Inode 261 * @ip: The GFS2 Inode
263 * @buf: The buffer to place result into 262 * @buf: The buffer to place result into
264 * @offset: File offset to begin jdata_readng from
265 * @size: Amount of data to transfer 263 * @size: Amount of data to transfer
266 * 264 *
267 * Returns: The amount of data actually copied or the error 265 * Returns: The amount of data actually copied or the error
268 */ 266 */
269static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, 267static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf,
270 unsigned int size, unsigned ra) 268 unsigned int size)
271{ 269{
272 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 270 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
273 u64 lblock, dblock; 271 u64 lblock, dblock;
@@ -275,24 +273,14 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
275 unsigned int o; 273 unsigned int o;
276 int copied = 0; 274 int copied = 0;
277 int error = 0; 275 int error = 0;
278 u64 disksize = i_size_read(&ip->i_inode);
279
280 if (offset >= disksize)
281 return 0;
282
283 if (offset + size > disksize)
284 size = disksize - offset;
285
286 if (!size)
287 return 0;
288 276
289 if (gfs2_is_stuffed(ip)) 277 if (gfs2_is_stuffed(ip))
290 return gfs2_dir_read_stuffed(ip, buf, offset, size); 278 return gfs2_dir_read_stuffed(ip, buf, size);
291 279
292 if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) 280 if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
293 return -EINVAL; 281 return -EINVAL;
294 282
295 lblock = offset; 283 lblock = 0;
296 o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header); 284 o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
297 285
298 while (copied < size) { 286 while (copied < size) {
@@ -311,8 +299,6 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
311 if (error || !dblock) 299 if (error || !dblock)
312 goto fail; 300 goto fail;
313 BUG_ON(extlen < 1); 301 BUG_ON(extlen < 1);
314 if (!ra)
315 extlen = 1;
316 bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); 302 bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
317 } else { 303 } else {
318 error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh); 304 error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh);
@@ -328,7 +314,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
328 extlen--; 314 extlen--;
329 memcpy(buf, bh->b_data + o, amount); 315 memcpy(buf, bh->b_data + o, amount);
330 brelse(bh); 316 brelse(bh);
331 buf += amount; 317 buf += (amount/sizeof(__be64));
332 copied += amount; 318 copied += amount;
333 lblock++; 319 lblock++;
334 o = sizeof(struct gfs2_meta_header); 320 o = sizeof(struct gfs2_meta_header);
@@ -371,7 +357,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
371 if (hc == NULL) 357 if (hc == NULL)
372 return ERR_PTR(-ENOMEM); 358 return ERR_PTR(-ENOMEM);
373 359
374 ret = gfs2_dir_read_data(ip, (char *)hc, 0, hsize, 1); 360 ret = gfs2_dir_read_data(ip, hc, hsize);
375 if (ret < 0) { 361 if (ret < 0) {
376 kfree(hc); 362 kfree(hc);
377 return ERR_PTR(ret); 363 return ERR_PTR(ret);
@@ -1695,7 +1681,6 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
1695 const struct qstr *name = &dentry->d_name; 1681 const struct qstr *name = &dentry->d_name;
1696 struct gfs2_dirent *dent, *prev = NULL; 1682 struct gfs2_dirent *dent, *prev = NULL;
1697 struct buffer_head *bh; 1683 struct buffer_head *bh;
1698 int error;
1699 1684
1700 /* Returns _either_ the entry (if its first in block) or the 1685 /* Returns _either_ the entry (if its first in block) or the
1701 previous entry otherwise */ 1686 previous entry otherwise */
@@ -1724,22 +1709,15 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
1724 } 1709 }
1725 brelse(bh); 1710 brelse(bh);
1726 1711
1727 error = gfs2_meta_inode_buffer(dip, &bh);
1728 if (error)
1729 return error;
1730
1731 if (!dip->i_entries) 1712 if (!dip->i_entries)
1732 gfs2_consist_inode(dip); 1713 gfs2_consist_inode(dip);
1733 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1734 dip->i_entries--; 1714 dip->i_entries--;
1735 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; 1715 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
1736 if (S_ISDIR(dentry->d_inode->i_mode)) 1716 if (S_ISDIR(dentry->d_inode->i_mode))
1737 drop_nlink(&dip->i_inode); 1717 drop_nlink(&dip->i_inode);
1738 gfs2_dinode_out(dip, bh->b_data);
1739 brelse(bh);
1740 mark_inode_dirty(&dip->i_inode); 1718 mark_inode_dirty(&dip->i_inode);
1741 1719
1742 return error; 1720 return 0;
1743} 1721}
1744 1722
1745/** 1723/**
@@ -1829,10 +1807,6 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
1829 if (error) 1807 if (error)
1830 goto out_put; 1808 goto out_put;
1831 1809
1832 error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh);
1833 if (error)
1834 goto out_qs;
1835
1836 /* Count the number of leaves */ 1810 /* Count the number of leaves */
1837 bh = leaf_bh; 1811 bh = leaf_bh;
1838 1812
@@ -1847,7 +1821,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
1847 if (blk != leaf_no) 1821 if (blk != leaf_no)
1848 brelse(bh); 1822 brelse(bh);
1849 1823
1850 gfs2_rlist_add(sdp, &rlist, blk); 1824 gfs2_rlist_add(dip, &rlist, blk);
1851 l_blocks++; 1825 l_blocks++;
1852 } 1826 }
1853 1827
@@ -1911,8 +1885,6 @@ out_rg_gunlock:
1911 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); 1885 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
1912out_rlist: 1886out_rlist:
1913 gfs2_rlist_free(&rlist); 1887 gfs2_rlist_free(&rlist);
1914 gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh);
1915out_qs:
1916 gfs2_quota_unhold(dip); 1888 gfs2_quota_unhold(dip);
1917out_put: 1889out_put:
1918 gfs2_alloc_put(dip); 1890 gfs2_alloc_put(dip);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index edeb9e80290..ce36a56dfea 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -59,15 +59,24 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
59 struct gfs2_holder i_gh; 59 struct gfs2_holder i_gh;
60 loff_t error; 60 loff_t error;
61 61
62 if (origin == 2) { 62 switch (origin) {
63 case SEEK_END: /* These reference inode->i_size */
64 case SEEK_DATA:
65 case SEEK_HOLE:
63 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, 66 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
64 &i_gh); 67 &i_gh);
65 if (!error) { 68 if (!error) {
66 error = generic_file_llseek_unlocked(file, offset, origin); 69 error = generic_file_llseek(file, offset, origin);
67 gfs2_glock_dq_uninit(&i_gh); 70 gfs2_glock_dq_uninit(&i_gh);
68 } 71 }
69 } else 72 break;
70 error = generic_file_llseek_unlocked(file, offset, origin); 73 case SEEK_CUR:
74 case SEEK_SET:
75 error = generic_file_llseek(file, offset, origin);
76 break;
77 default:
78 error = -EINVAL;
79 }
71 80
72 return error; 81 return error;
73} 82}
@@ -357,8 +366,15 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
357 unsigned int data_blocks, ind_blocks, rblocks; 366 unsigned int data_blocks, ind_blocks, rblocks;
358 struct gfs2_holder gh; 367 struct gfs2_holder gh;
359 struct gfs2_alloc *al; 368 struct gfs2_alloc *al;
369 loff_t size;
360 int ret; 370 int ret;
361 371
372 /* Wait if fs is frozen. This is racy so we check again later on
373 * and retry if the fs has been frozen after the page lock has
374 * been acquired
375 */
376 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
377
362 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 378 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
363 ret = gfs2_glock_nq(&gh); 379 ret = gfs2_glock_nq(&gh);
364 if (ret) 380 if (ret)
@@ -367,8 +383,15 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
367 set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); 383 set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
368 set_bit(GIF_SW_PAGED, &ip->i_flags); 384 set_bit(GIF_SW_PAGED, &ip->i_flags);
369 385
370 if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) 386 if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) {
387 lock_page(page);
388 if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
389 ret = -EAGAIN;
390 unlock_page(page);
391 }
371 goto out_unlock; 392 goto out_unlock;
393 }
394
372 ret = -ENOMEM; 395 ret = -ENOMEM;
373 al = gfs2_alloc_get(ip); 396 al = gfs2_alloc_get(ip);
374 if (al == NULL) 397 if (al == NULL)
@@ -388,7 +411,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
388 rblocks += data_blocks ? data_blocks : 1; 411 rblocks += data_blocks ? data_blocks : 1;
389 if (ind_blocks || data_blocks) { 412 if (ind_blocks || data_blocks) {
390 rblocks += RES_STATFS + RES_QUOTA; 413 rblocks += RES_STATFS + RES_QUOTA;
391 rblocks += gfs2_rg_blocks(al); 414 rblocks += gfs2_rg_blocks(ip);
392 } 415 }
393 ret = gfs2_trans_begin(sdp, rblocks, 0); 416 ret = gfs2_trans_begin(sdp, rblocks, 0);
394 if (ret) 417 if (ret)
@@ -396,21 +419,29 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
396 419
397 lock_page(page); 420 lock_page(page);
398 ret = -EINVAL; 421 ret = -EINVAL;
399 last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT; 422 size = i_size_read(inode);
400 if (page->index > last_index) 423 last_index = (size - 1) >> PAGE_CACHE_SHIFT;
401 goto out_unlock_page; 424 /* Check page index against inode size */
425 if (size == 0 || (page->index > last_index))
426 goto out_trans_end;
427
428 ret = -EAGAIN;
429 /* If truncated, we must retry the operation, we may have raced
430 * with the glock demotion code.
431 */
432 if (!PageUptodate(page) || page->mapping != inode->i_mapping)
433 goto out_trans_end;
434
435 /* Unstuff, if required, and allocate backing blocks for page */
402 ret = 0; 436 ret = 0;
403 if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping) 437 if (gfs2_is_stuffed(ip))
404 goto out_unlock_page;
405 if (gfs2_is_stuffed(ip)) {
406 ret = gfs2_unstuff_dinode(ip, page); 438 ret = gfs2_unstuff_dinode(ip, page);
407 if (ret) 439 if (ret == 0)
408 goto out_unlock_page; 440 ret = gfs2_allocate_page_backing(page);
409 }
410 ret = gfs2_allocate_page_backing(page);
411 441
412out_unlock_page: 442out_trans_end:
413 unlock_page(page); 443 if (ret)
444 unlock_page(page);
414 gfs2_trans_end(sdp); 445 gfs2_trans_end(sdp);
415out_trans_fail: 446out_trans_fail:
416 gfs2_inplace_release(ip); 447 gfs2_inplace_release(ip);
@@ -422,11 +453,17 @@ out_unlock:
422 gfs2_glock_dq(&gh); 453 gfs2_glock_dq(&gh);
423out: 454out:
424 gfs2_holder_uninit(&gh); 455 gfs2_holder_uninit(&gh);
425 if (ret == -ENOMEM) 456 if (ret == 0) {
426 ret = VM_FAULT_OOM; 457 set_page_dirty(page);
427 else if (ret) 458 /* This check must be post dropping of transaction lock */
428 ret = VM_FAULT_SIGBUS; 459 if (inode->i_sb->s_frozen == SB_UNFROZEN) {
429 return ret; 460 wait_on_page_writeback(page);
461 } else {
462 ret = -EAGAIN;
463 unlock_page(page);
464 }
465 }
466 return block_page_mkwrite_return(ret);
430} 467}
431 468
432static const struct vm_operations_struct gfs2_vm_ops = { 469static const struct vm_operations_struct gfs2_vm_ops = {
@@ -551,8 +588,16 @@ static int gfs2_close(struct inode *inode, struct file *file)
551 * @end: the end position in the file to sync 588 * @end: the end position in the file to sync
552 * @datasync: set if we can ignore timestamp changes 589 * @datasync: set if we can ignore timestamp changes
553 * 590 *
554 * The VFS will flush data for us. We only need to worry 591 * We split the data flushing here so that we don't wait for the data
555 * about metadata here. 592 * until after we've also sent the metadata to disk. Note that for
593 * data=ordered, we will write & wait for the data at the log flush
594 * stage anyway, so this is unlikely to make much of a difference
595 * except in the data=writeback case.
596 *
597 * If the fdatawrite fails due to any reason except -EIO, we will
598 * continue the remainder of the fsync, although we'll still report
599 * the error at the end. This is to match filemap_write_and_wait_range()
600 * behaviour.
556 * 601 *
557 * Returns: errno 602 * Returns: errno
558 */ 603 */
@@ -560,30 +605,34 @@ static int gfs2_close(struct inode *inode, struct file *file)
560static int gfs2_fsync(struct file *file, loff_t start, loff_t end, 605static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
561 int datasync) 606 int datasync)
562{ 607{
563 struct inode *inode = file->f_mapping->host; 608 struct address_space *mapping = file->f_mapping;
609 struct inode *inode = mapping->host;
564 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); 610 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
565 struct gfs2_inode *ip = GFS2_I(inode); 611 struct gfs2_inode *ip = GFS2_I(inode);
566 int ret; 612 int ret, ret1 = 0;
567 613
568 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 614 if (mapping->nrpages) {
569 if (ret) 615 ret1 = filemap_fdatawrite_range(mapping, start, end);
570 return ret; 616 if (ret1 == -EIO)
571 mutex_lock(&inode->i_mutex); 617 return ret1;
618 }
572 619
573 if (datasync) 620 if (datasync)
574 sync_state &= ~I_DIRTY_SYNC; 621 sync_state &= ~I_DIRTY_SYNC;
575 622
576 if (sync_state) { 623 if (sync_state) {
577 ret = sync_inode_metadata(inode, 1); 624 ret = sync_inode_metadata(inode, 1);
578 if (ret) { 625 if (ret)
579 mutex_unlock(&inode->i_mutex);
580 return ret; 626 return ret;
581 } 627 if (gfs2_is_jdata(ip))
582 gfs2_ail_flush(ip->i_gl); 628 filemap_write_and_wait(mapping);
629 gfs2_ail_flush(ip->i_gl, 1);
583 } 630 }
584 631
585 mutex_unlock(&inode->i_mutex); 632 if (mapping->nrpages)
586 return 0; 633 ret = filemap_fdatawait_range(mapping, start, end);
634
635 return ret ? ret : ret1;
587} 636}
588 637
589/** 638/**
@@ -620,135 +669,18 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
620 return generic_file_aio_write(iocb, iov, nr_segs, pos); 669 return generic_file_aio_write(iocb, iov, nr_segs, pos);
621} 670}
622 671
623static int empty_write_end(struct page *page, unsigned from,
624 unsigned to, int mode)
625{
626 struct inode *inode = page->mapping->host;
627 struct gfs2_inode *ip = GFS2_I(inode);
628 struct buffer_head *bh;
629 unsigned offset, blksize = 1 << inode->i_blkbits;
630 pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
631
632 zero_user(page, from, to-from);
633 mark_page_accessed(page);
634
635 if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
636 if (!gfs2_is_writeback(ip))
637 gfs2_page_add_databufs(ip, page, from, to);
638
639 block_commit_write(page, from, to);
640 return 0;
641 }
642
643 offset = 0;
644 bh = page_buffers(page);
645 while (offset < to) {
646 if (offset >= from) {
647 set_buffer_uptodate(bh);
648 mark_buffer_dirty(bh);
649 clear_buffer_new(bh);
650 write_dirty_buffer(bh, WRITE);
651 }
652 offset += blksize;
653 bh = bh->b_this_page;
654 }
655
656 offset = 0;
657 bh = page_buffers(page);
658 while (offset < to) {
659 if (offset >= from) {
660 wait_on_buffer(bh);
661 if (!buffer_uptodate(bh))
662 return -EIO;
663 }
664 offset += blksize;
665 bh = bh->b_this_page;
666 }
667 return 0;
668}
669
670static int needs_empty_write(sector_t block, struct inode *inode)
671{
672 int error;
673 struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
674
675 bh_map.b_size = 1 << inode->i_blkbits;
676 error = gfs2_block_map(inode, block, &bh_map, 0);
677 if (unlikely(error))
678 return error;
679 return !buffer_mapped(&bh_map);
680}
681
682static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
683 int mode)
684{
685 struct inode *inode = page->mapping->host;
686 unsigned start, end, next, blksize;
687 sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
688 int ret;
689
690 blksize = 1 << inode->i_blkbits;
691 next = end = 0;
692 while (next < from) {
693 next += blksize;
694 block++;
695 }
696 start = next;
697 do {
698 next += blksize;
699 ret = needs_empty_write(block, inode);
700 if (unlikely(ret < 0))
701 return ret;
702 if (ret == 0) {
703 if (end) {
704 ret = __block_write_begin(page, start, end - start,
705 gfs2_block_map);
706 if (unlikely(ret))
707 return ret;
708 ret = empty_write_end(page, start, end, mode);
709 if (unlikely(ret))
710 return ret;
711 end = 0;
712 }
713 start = next;
714 }
715 else
716 end = next;
717 block++;
718 } while (next < to);
719
720 if (end) {
721 ret = __block_write_begin(page, start, end - start, gfs2_block_map);
722 if (unlikely(ret))
723 return ret;
724 ret = empty_write_end(page, start, end, mode);
725 if (unlikely(ret))
726 return ret;
727 }
728
729 return 0;
730}
731
732static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, 672static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
733 int mode) 673 int mode)
734{ 674{
735 struct gfs2_inode *ip = GFS2_I(inode); 675 struct gfs2_inode *ip = GFS2_I(inode);
736 struct buffer_head *dibh; 676 struct buffer_head *dibh;
737 int error; 677 int error;
738 u64 start = offset >> PAGE_CACHE_SHIFT; 678 unsigned int nr_blks;
739 unsigned int start_offset = offset & ~PAGE_CACHE_MASK; 679 sector_t lblock = offset >> inode->i_blkbits;
740 u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
741 pgoff_t curr;
742 struct page *page;
743 unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
744 unsigned int from, to;
745
746 if (!end_offset)
747 end_offset = PAGE_CACHE_SIZE;
748 680
749 error = gfs2_meta_inode_buffer(ip, &dibh); 681 error = gfs2_meta_inode_buffer(ip, &dibh);
750 if (unlikely(error)) 682 if (unlikely(error))
751 goto out; 683 return error;
752 684
753 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 685 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
754 686
@@ -758,40 +690,31 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
758 goto out; 690 goto out;
759 } 691 }
760 692
761 curr = start; 693 while (len) {
762 offset = start << PAGE_CACHE_SHIFT; 694 struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
763 from = start_offset; 695 bh_map.b_size = len;
764 to = PAGE_CACHE_SIZE; 696 set_buffer_zeronew(&bh_map);
765 while (curr <= end) {
766 page = grab_cache_page_write_begin(inode->i_mapping, curr,
767 AOP_FLAG_NOFS);
768 if (unlikely(!page)) {
769 error = -ENOMEM;
770 goto out;
771 }
772 697
773 if (curr == end) 698 error = gfs2_block_map(inode, lblock, &bh_map, 1);
774 to = end_offset; 699 if (unlikely(error))
775 error = write_empty_blocks(page, from, to, mode);
776 if (!error && offset + to > inode->i_size &&
777 !(mode & FALLOC_FL_KEEP_SIZE)) {
778 i_size_write(inode, offset + to);
779 }
780 unlock_page(page);
781 page_cache_release(page);
782 if (error)
783 goto out; 700 goto out;
784 curr++; 701 len -= bh_map.b_size;
785 offset += PAGE_CACHE_SIZE; 702 nr_blks = bh_map.b_size >> inode->i_blkbits;
786 from = 0; 703 lblock += nr_blks;
704 if (!buffer_new(&bh_map))
705 continue;
706 if (unlikely(!buffer_zeronew(&bh_map))) {
707 error = -EIO;
708 goto out;
709 }
787 } 710 }
711 if (offset + len > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE))
712 i_size_write(inode, offset + len);
788 713
789 gfs2_dinode_out(ip, dibh->b_data);
790 mark_inode_dirty(inode); 714 mark_inode_dirty(inode);
791 715
792 brelse(dibh);
793
794out: 716out:
717 brelse(dibh);
795 return error; 718 return error;
796} 719}
797 720
@@ -799,7 +722,7 @@ static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
799 unsigned int *data_blocks, unsigned int *ind_blocks) 722 unsigned int *data_blocks, unsigned int *ind_blocks)
800{ 723{
801 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 724 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
802 unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; 725 unsigned int max_blocks = ip->i_rgd->rd_free_clone;
803 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); 726 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
804 727
805 for (tmp = max_data; tmp > sdp->sd_diptrs;) { 728 for (tmp = max_data; tmp > sdp->sd_diptrs;) {
@@ -831,6 +754,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
831 int error; 754 int error;
832 loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); 755 loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
833 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; 756 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
757 loff_t max_chunk_size = UINT_MAX & bsize_mask;
834 next = (next + 1) << sdp->sd_sb.sb_bsize_shift; 758 next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
835 759
836 /* We only support the FALLOC_FL_KEEP_SIZE mode */ 760 /* We only support the FALLOC_FL_KEEP_SIZE mode */
@@ -884,11 +808,12 @@ retry:
884 goto out_qunlock; 808 goto out_qunlock;
885 } 809 }
886 max_bytes = bytes; 810 max_bytes = bytes;
887 calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); 811 calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len,
812 &max_bytes, &data_blocks, &ind_blocks);
888 al->al_requested = data_blocks + ind_blocks; 813 al->al_requested = data_blocks + ind_blocks;
889 814
890 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + 815 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
891 RES_RG_HDR + gfs2_rg_blocks(al); 816 RES_RG_HDR + gfs2_rg_blocks(ip);
892 if (gfs2_is_jdata(ip)) 817 if (gfs2_is_jdata(ip))
893 rblocks += data_blocks ? data_blocks : 1; 818 rblocks += data_blocks ? data_blocks : 1;
894 819
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 66707118af2..2553b858a72 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -201,7 +201,7 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
201void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); 201void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
202void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); 202void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
203 203
204__attribute__ ((format(printf, 2, 3))) 204__printf(2, 3)
205void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); 205void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
206 206
207/** 207/**
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index da21ecaafcc..1656df7aacd 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -28,40 +28,55 @@
28#include "trans.h" 28#include "trans.h"
29#include "dir.h" 29#include "dir.h"
30 30
31static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
32{
33 fs_err(gl->gl_sbd, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page state 0x%lx\n",
34 bh, (unsigned long long)bh->b_blocknr, bh->b_state,
35 bh->b_page->mapping, bh->b_page->flags);
36 fs_err(gl->gl_sbd, "AIL glock %u:%llu mapping %p\n",
37 gl->gl_name.ln_type, gl->gl_name.ln_number,
38 gfs2_glock2aspace(gl));
39 gfs2_lm_withdraw(gl->gl_sbd, "AIL error\n");
40}
41
31/** 42/**
32 * __gfs2_ail_flush - remove all buffers for a given lock from the AIL 43 * __gfs2_ail_flush - remove all buffers for a given lock from the AIL
33 * @gl: the glock 44 * @gl: the glock
45 * @fsync: set when called from fsync (not all buffers will be clean)
34 * 46 *
35 * None of the buffers should be dirty, locked, or pinned. 47 * None of the buffers should be dirty, locked, or pinned.
36 */ 48 */
37 49
38static void __gfs2_ail_flush(struct gfs2_glock *gl) 50static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
39{ 51{
40 struct gfs2_sbd *sdp = gl->gl_sbd; 52 struct gfs2_sbd *sdp = gl->gl_sbd;
41 struct list_head *head = &gl->gl_ail_list; 53 struct list_head *head = &gl->gl_ail_list;
42 struct gfs2_bufdata *bd; 54 struct gfs2_bufdata *bd, *tmp;
43 struct buffer_head *bh; 55 struct buffer_head *bh;
56 const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock);
57 sector_t blocknr;
44 58
59 gfs2_log_lock(sdp);
45 spin_lock(&sdp->sd_ail_lock); 60 spin_lock(&sdp->sd_ail_lock);
46 while (!list_empty(head)) { 61 list_for_each_entry_safe(bd, tmp, head, bd_ail_gl_list) {
47 bd = list_entry(head->next, struct gfs2_bufdata,
48 bd_ail_gl_list);
49 bh = bd->bd_bh; 62 bh = bd->bd_bh;
50 gfs2_remove_from_ail(bd); 63 if (bh->b_state & b_state) {
51 bd->bd_bh = NULL; 64 if (fsync)
65 continue;
66 gfs2_ail_error(gl, bh);
67 }
68 blocknr = bh->b_blocknr;
52 bh->b_private = NULL; 69 bh->b_private = NULL;
53 spin_unlock(&sdp->sd_ail_lock); 70 gfs2_remove_from_ail(bd); /* drops ref on bh */
54 71
55 bd->bd_blkno = bh->b_blocknr; 72 bd->bd_bh = NULL;
56 gfs2_log_lock(sdp); 73 bd->bd_blkno = blocknr;
57 gfs2_assert_withdraw(sdp, !buffer_busy(bh));
58 gfs2_trans_add_revoke(sdp, bd);
59 gfs2_log_unlock(sdp);
60 74
61 spin_lock(&sdp->sd_ail_lock); 75 gfs2_trans_add_revoke(sdp, bd);
62 } 76 }
63 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); 77 BUG_ON(!fsync && atomic_read(&gl->gl_ail_count));
64 spin_unlock(&sdp->sd_ail_lock); 78 spin_unlock(&sdp->sd_ail_lock);
79 gfs2_log_unlock(sdp);
65} 80}
66 81
67 82
@@ -84,13 +99,13 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
84 BUG_ON(current->journal_info); 99 BUG_ON(current->journal_info);
85 current->journal_info = &tr; 100 current->journal_info = &tr;
86 101
87 __gfs2_ail_flush(gl); 102 __gfs2_ail_flush(gl, 0);
88 103
89 gfs2_trans_end(sdp); 104 gfs2_trans_end(sdp);
90 gfs2_log_flush(sdp, NULL); 105 gfs2_log_flush(sdp, NULL);
91} 106}
92 107
93void gfs2_ail_flush(struct gfs2_glock *gl) 108void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
94{ 109{
95 struct gfs2_sbd *sdp = gl->gl_sbd; 110 struct gfs2_sbd *sdp = gl->gl_sbd;
96 unsigned int revokes = atomic_read(&gl->gl_ail_count); 111 unsigned int revokes = atomic_read(&gl->gl_ail_count);
@@ -102,7 +117,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl)
102 ret = gfs2_trans_begin(sdp, 0, revokes); 117 ret = gfs2_trans_begin(sdp, 0, revokes);
103 if (ret) 118 if (ret)
104 return; 119 return;
105 __gfs2_ail_flush(gl); 120 __gfs2_ail_flush(gl, fsync);
106 gfs2_trans_end(sdp); 121 gfs2_trans_end(sdp);
107 gfs2_log_flush(sdp, NULL); 122 gfs2_log_flush(sdp, NULL);
108} 123}
@@ -119,6 +134,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl)
119static void rgrp_go_sync(struct gfs2_glock *gl) 134static void rgrp_go_sync(struct gfs2_glock *gl)
120{ 135{
121 struct address_space *metamapping = gfs2_glock2aspace(gl); 136 struct address_space *metamapping = gfs2_glock2aspace(gl);
137 struct gfs2_rgrpd *rgd;
122 int error; 138 int error;
123 139
124 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) 140 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
@@ -130,6 +146,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
130 error = filemap_fdatawait(metamapping); 146 error = filemap_fdatawait(metamapping);
131 mapping_set_error(metamapping, error); 147 mapping_set_error(metamapping, error);
132 gfs2_ail_empty_gl(gl); 148 gfs2_ail_empty_gl(gl);
149
150 spin_lock(&gl->gl_spin);
151 rgd = gl->gl_object;
152 if (rgd)
153 gfs2_free_clones(rgd);
154 spin_unlock(&gl->gl_spin);
133} 155}
134 156
135/** 157/**
@@ -277,7 +299,7 @@ static void gfs2_set_nlink(struct inode *inode, u32 nlink)
277 if (nlink == 0) 299 if (nlink == 0)
278 clear_nlink(inode); 300 clear_nlink(inode);
279 else 301 else
280 inode->i_nlink = nlink; 302 set_nlink(inode, nlink);
281 } 303 }
282} 304}
283 305
@@ -430,33 +452,6 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
430} 452}
431 453
432/** 454/**
433 * rgrp_go_lock - operation done after an rgrp lock is locked by
434 * a first holder on this node.
435 * @gl: the glock
436 * @flags:
437 *
438 * Returns: errno
439 */
440
441static int rgrp_go_lock(struct gfs2_holder *gh)
442{
443 return gfs2_rgrp_bh_get(gh->gh_gl->gl_object);
444}
445
446/**
447 * rgrp_go_unlock - operation done before an rgrp lock is unlocked by
448 * a last holder on this node.
449 * @gl: the glock
450 * @flags:
451 *
452 */
453
454static void rgrp_go_unlock(struct gfs2_holder *gh)
455{
456 gfs2_rgrp_bh_put(gh->gh_gl->gl_object);
457}
458
459/**
460 * trans_go_sync - promote/demote the transaction glock 455 * trans_go_sync - promote/demote the transaction glock
461 * @gl: the glock 456 * @gl: the glock
462 * @state: the requested state 457 * @state: the requested state
@@ -558,8 +553,8 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
558const struct gfs2_glock_operations gfs2_rgrp_glops = { 553const struct gfs2_glock_operations gfs2_rgrp_glops = {
559 .go_xmote_th = rgrp_go_sync, 554 .go_xmote_th = rgrp_go_sync,
560 .go_inval = rgrp_go_inval, 555 .go_inval = rgrp_go_inval,
561 .go_lock = rgrp_go_lock, 556 .go_lock = gfs2_rgrp_go_lock,
562 .go_unlock = rgrp_go_unlock, 557 .go_unlock = gfs2_rgrp_go_unlock,
563 .go_dump = gfs2_rgrp_dump, 558 .go_dump = gfs2_rgrp_dump,
564 .go_type = LM_TYPE_RGRP, 559 .go_type = LM_TYPE_RGRP,
565 .go_flags = GLOF_ASPACE, 560 .go_flags = GLOF_ASPACE,
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index 6fce409b5a5..bf95a2dc166 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -23,6 +23,6 @@ extern const struct gfs2_glock_operations gfs2_quota_glops;
23extern const struct gfs2_glock_operations gfs2_journal_glops; 23extern const struct gfs2_glock_operations gfs2_journal_glops;
24extern const struct gfs2_glock_operations *gfs2_glops_list[]; 24extern const struct gfs2_glock_operations *gfs2_glops_list[];
25 25
26extern void gfs2_ail_flush(struct gfs2_glock *gl); 26extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync);
27 27
28#endif /* __GLOPS_DOT_H__ */ 28#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 892ac37de8a..7389dfdcc9e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -18,6 +18,7 @@
18#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
19#include <linux/rculist_bl.h> 19#include <linux/rculist_bl.h>
20#include <linux/completion.h> 20#include <linux/completion.h>
21#include <linux/rbtree.h>
21 22
22#define DIO_WAIT 0x00000010 23#define DIO_WAIT 0x00000010
23#define DIO_METADATA 0x00000020 24#define DIO_METADATA 0x00000020
@@ -78,8 +79,7 @@ struct gfs2_bitmap {
78}; 79};
79 80
80struct gfs2_rgrpd { 81struct gfs2_rgrpd {
81 struct list_head rd_list; /* Link with superblock */ 82 struct rb_node rd_node; /* Link with superblock */
82 struct list_head rd_list_mru;
83 struct gfs2_glock *rd_gl; /* Glock for this rgrp */ 83 struct gfs2_glock *rd_gl; /* Glock for this rgrp */
84 u64 rd_addr; /* grp block disk address */ 84 u64 rd_addr; /* grp block disk address */
85 u64 rd_data0; /* first data location */ 85 u64 rd_data0; /* first data location */
@@ -91,10 +91,7 @@ struct gfs2_rgrpd {
91 u32 rd_dinodes; 91 u32 rd_dinodes;
92 u64 rd_igeneration; 92 u64 rd_igeneration;
93 struct gfs2_bitmap *rd_bits; 93 struct gfs2_bitmap *rd_bits;
94 struct mutex rd_mutex;
95 struct gfs2_log_element rd_le;
96 struct gfs2_sbd *rd_sbd; 94 struct gfs2_sbd *rd_sbd;
97 unsigned int rd_bh_count;
98 u32 rd_last_alloc; 95 u32 rd_last_alloc;
99 u32 rd_flags; 96 u32 rd_flags;
100#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ 97#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */
@@ -106,12 +103,15 @@ struct gfs2_rgrpd {
106enum gfs2_state_bits { 103enum gfs2_state_bits {
107 BH_Pinned = BH_PrivateStart, 104 BH_Pinned = BH_PrivateStart,
108 BH_Escaped = BH_PrivateStart + 1, 105 BH_Escaped = BH_PrivateStart + 1,
106 BH_Zeronew = BH_PrivateStart + 2,
109}; 107};
110 108
111BUFFER_FNS(Pinned, pinned) 109BUFFER_FNS(Pinned, pinned)
112TAS_BUFFER_FNS(Pinned, pinned) 110TAS_BUFFER_FNS(Pinned, pinned)
113BUFFER_FNS(Escaped, escaped) 111BUFFER_FNS(Escaped, escaped)
114TAS_BUFFER_FNS(Escaped, escaped) 112TAS_BUFFER_FNS(Escaped, escaped)
113BUFFER_FNS(Zeronew, zeronew)
114TAS_BUFFER_FNS(Zeronew, zeronew)
115 115
116struct gfs2_bufdata { 116struct gfs2_bufdata {
117 struct buffer_head *bd_bh; 117 struct buffer_head *bd_bh;
@@ -246,7 +246,6 @@ struct gfs2_glock {
246 246
247struct gfs2_alloc { 247struct gfs2_alloc {
248 /* Quota stuff */ 248 /* Quota stuff */
249
250 struct gfs2_quota_data *al_qd[2*MAXQUOTAS]; 249 struct gfs2_quota_data *al_qd[2*MAXQUOTAS];
251 struct gfs2_holder al_qd_ghs[2*MAXQUOTAS]; 250 struct gfs2_holder al_qd_ghs[2*MAXQUOTAS];
252 unsigned int al_qd_num; 251 unsigned int al_qd_num;
@@ -255,18 +254,13 @@ struct gfs2_alloc {
255 u32 al_alloced; /* Filled in by gfs2_alloc_*() */ 254 u32 al_alloced; /* Filled in by gfs2_alloc_*() */
256 255
257 /* Filled in by gfs2_inplace_reserve() */ 256 /* Filled in by gfs2_inplace_reserve() */
258
259 unsigned int al_line;
260 char *al_file;
261 struct gfs2_holder al_ri_gh;
262 struct gfs2_holder al_rgd_gh; 257 struct gfs2_holder al_rgd_gh;
263 struct gfs2_rgrpd *al_rgd;
264
265}; 258};
266 259
267enum { 260enum {
268 GIF_INVALID = 0, 261 GIF_INVALID = 0,
269 GIF_QD_LOCKED = 1, 262 GIF_QD_LOCKED = 1,
263 GIF_ALLOC_FAILED = 2,
270 GIF_SW_PAGED = 3, 264 GIF_SW_PAGED = 3,
271}; 265};
272 266
@@ -282,6 +276,7 @@ struct gfs2_inode {
282 struct gfs2_holder i_iopen_gh; 276 struct gfs2_holder i_iopen_gh;
283 struct gfs2_holder i_gh; /* for prepare/commit_write only */ 277 struct gfs2_holder i_gh; /* for prepare/commit_write only */
284 struct gfs2_alloc *i_alloc; 278 struct gfs2_alloc *i_alloc;
279 struct gfs2_rgrpd *i_rgd;
285 u64 i_goal; /* goal block for allocations */ 280 u64 i_goal; /* goal block for allocations */
286 struct rw_semaphore i_rw_mutex; 281 struct rw_semaphore i_rw_mutex;
287 struct list_head i_trunc_list; 282 struct list_head i_trunc_list;
@@ -574,9 +569,7 @@ struct gfs2_sbd {
574 int sd_rindex_uptodate; 569 int sd_rindex_uptodate;
575 spinlock_t sd_rindex_spin; 570 spinlock_t sd_rindex_spin;
576 struct mutex sd_rindex_mutex; 571 struct mutex sd_rindex_mutex;
577 struct list_head sd_rindex_list; 572 struct rb_root sd_rindex_tree;
578 struct list_head sd_rindex_mru_list;
579 struct gfs2_rgrpd *sd_rindex_forward;
580 unsigned int sd_rgrps; 573 unsigned int sd_rgrps;
581 unsigned int sd_max_rg_data; 574 unsigned int sd_max_rg_data;
582 575
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6525b804d5e..cfd4959b218 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -583,7 +583,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
583 goto fail_quota_locks; 583 goto fail_quota_locks;
584 584
585 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 585 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
586 al->al_rgd->rd_length + 586 dip->i_rgd->rd_length +
587 2 * RES_DINODE + 587 2 * RES_DINODE +
588 RES_STATFS + RES_QUOTA, 0); 588 RES_STATFS + RES_QUOTA, 0);
589 if (error) 589 if (error)
@@ -613,8 +613,7 @@ fail_end_trans:
613 gfs2_trans_end(sdp); 613 gfs2_trans_end(sdp);
614 614
615fail_ipreserv: 615fail_ipreserv:
616 if (dip->i_alloc->al_rgd) 616 gfs2_inplace_release(dip);
617 gfs2_inplace_release(dip);
618 617
619fail_quota_locks: 618fail_quota_locks:
620 gfs2_quota_unlock(dip); 619 gfs2_quota_unlock(dip);
@@ -661,7 +660,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
661 660
662static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, 661static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
663 unsigned int mode, dev_t dev, const char *symname, 662 unsigned int mode, dev_t dev, const char *symname,
664 unsigned int size) 663 unsigned int size, int excl)
665{ 664{
666 const struct qstr *name = &dentry->d_name; 665 const struct qstr *name = &dentry->d_name;
667 struct gfs2_holder ghs[2]; 666 struct gfs2_holder ghs[2];
@@ -681,6 +680,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
681 goto fail; 680 goto fail;
682 681
683 error = create_ok(dip, name, mode); 682 error = create_ok(dip, name, mode);
683 if ((error == -EEXIST) && S_ISREG(mode) && !excl) {
684 inode = gfs2_lookupi(dir, &dentry->d_name, 0);
685 gfs2_glock_dq_uninit(ghs);
686 d_instantiate(dentry, inode);
687 return IS_ERR(inode) ? PTR_ERR(inode) : 0;
688 }
684 if (error) 689 if (error)
685 goto fail_gunlock; 690 goto fail_gunlock;
686 691
@@ -723,21 +728,22 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
723 brelse(bh); 728 brelse(bh);
724 729
725 gfs2_trans_end(sdp); 730 gfs2_trans_end(sdp);
726 if (dip->i_alloc->al_rgd) 731 gfs2_inplace_release(dip);
727 gfs2_inplace_release(dip);
728 gfs2_quota_unlock(dip); 732 gfs2_quota_unlock(dip);
729 gfs2_alloc_put(dip); 733 gfs2_alloc_put(dip);
730 gfs2_glock_dq_uninit_m(2, ghs);
731 mark_inode_dirty(inode); 734 mark_inode_dirty(inode);
735 gfs2_glock_dq_uninit_m(2, ghs);
732 d_instantiate(dentry, inode); 736 d_instantiate(dentry, inode);
733 return 0; 737 return 0;
734 738
735fail_gunlock2: 739fail_gunlock2:
736 gfs2_glock_dq_uninit(ghs + 1); 740 gfs2_glock_dq_uninit(ghs + 1);
737 if (inode && !IS_ERR(inode))
738 iput(inode);
739fail_gunlock: 741fail_gunlock:
740 gfs2_glock_dq_uninit(ghs); 742 gfs2_glock_dq_uninit(ghs);
743 if (inode && !IS_ERR(inode)) {
744 set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags);
745 iput(inode);
746 }
741fail: 747fail:
742 if (bh) 748 if (bh)
743 brelse(bh); 749 brelse(bh);
@@ -756,24 +762,10 @@ fail:
756static int gfs2_create(struct inode *dir, struct dentry *dentry, 762static int gfs2_create(struct inode *dir, struct dentry *dentry,
757 int mode, struct nameidata *nd) 763 int mode, struct nameidata *nd)
758{ 764{
759 struct inode *inode; 765 int excl = 0;
760 int ret; 766 if (nd && (nd->flags & LOOKUP_EXCL))
761 767 excl = 1;
762 for (;;) { 768 return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl);
763 ret = gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0);
764 if (ret != -EEXIST || (nd && (nd->flags & LOOKUP_EXCL)))
765 return ret;
766
767 inode = gfs2_lookupi(dir, &dentry->d_name, 0);
768 if (inode) {
769 if (!IS_ERR(inode))
770 break;
771 return PTR_ERR(inode);
772 }
773 }
774
775 d_instantiate(dentry, inode);
776 return 0;
777} 769}
778 770
779/** 771/**
@@ -900,7 +892,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
900 goto out_gunlock_q; 892 goto out_gunlock_q;
901 893
902 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 894 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
903 gfs2_rg_blocks(al) + 895 gfs2_rg_blocks(dip) +
904 2 * RES_DINODE + RES_STATFS + 896 2 * RES_DINODE + RES_STATFS +
905 RES_QUOTA, 0); 897 RES_QUOTA, 0);
906 if (error) 898 if (error)
@@ -922,8 +914,9 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
922 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 914 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
923 inc_nlink(&ip->i_inode); 915 inc_nlink(&ip->i_inode);
924 ip->i_inode.i_ctime = CURRENT_TIME; 916 ip->i_inode.i_ctime = CURRENT_TIME;
925 gfs2_dinode_out(ip, dibh->b_data); 917 ihold(inode);
926 mark_inode_dirty(&ip->i_inode); 918 d_instantiate(dentry, inode);
919 mark_inode_dirty(inode);
927 920
928out_brelse: 921out_brelse:
929 brelse(dibh); 922 brelse(dibh);
@@ -945,11 +938,6 @@ out_child:
945out_parent: 938out_parent:
946 gfs2_holder_uninit(ghs); 939 gfs2_holder_uninit(ghs);
947 gfs2_holder_uninit(ghs + 1); 940 gfs2_holder_uninit(ghs + 1);
948 if (!error) {
949 ihold(inode);
950 d_instantiate(dentry, inode);
951 mark_inode_dirty(inode);
952 }
953 return error; 941 return error;
954} 942}
955 943
@@ -1022,8 +1010,6 @@ static int gfs2_unlink_inode(struct gfs2_inode *dip,
1022 clear_nlink(inode); 1010 clear_nlink(inode);
1023 else 1011 else
1024 drop_nlink(inode); 1012 drop_nlink(inode);
1025 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1026 gfs2_dinode_out(ip, bh->b_data);
1027 mark_inode_dirty(inode); 1013 mark_inode_dirty(inode);
1028 if (inode->i_nlink == 0) 1014 if (inode->i_nlink == 0)
1029 gfs2_unlink_di(inode); 1015 gfs2_unlink_di(inode);
@@ -1051,13 +1037,8 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
1051 struct buffer_head *bh; 1037 struct buffer_head *bh;
1052 struct gfs2_holder ghs[3]; 1038 struct gfs2_holder ghs[3];
1053 struct gfs2_rgrpd *rgd; 1039 struct gfs2_rgrpd *rgd;
1054 struct gfs2_holder ri_gh;
1055 int error; 1040 int error;
1056 1041
1057 error = gfs2_rindex_hold(sdp, &ri_gh);
1058 if (error)
1059 return error;
1060
1061 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); 1042 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
1062 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); 1043 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
1063 1044
@@ -1114,7 +1095,6 @@ out_child:
1114 gfs2_glock_dq(ghs); 1095 gfs2_glock_dq(ghs);
1115out_parent: 1096out_parent:
1116 gfs2_holder_uninit(ghs); 1097 gfs2_holder_uninit(ghs);
1117 gfs2_glock_dq_uninit(&ri_gh);
1118 return error; 1098 return error;
1119} 1099}
1120 1100
@@ -1137,7 +1117,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
1137 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) 1117 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
1138 return -ENAMETOOLONG; 1118 return -ENAMETOOLONG;
1139 1119
1140 return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size); 1120 return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size, 0);
1141} 1121}
1142 1122
1143/** 1123/**
@@ -1151,7 +1131,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
1151 1131
1152static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) 1132static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1153{ 1133{
1154 return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0); 1134 return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0, 0);
1155} 1135}
1156 1136
1157/** 1137/**
@@ -1166,7 +1146,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1166static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode, 1146static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
1167 dev_t dev) 1147 dev_t dev)
1168{ 1148{
1169 return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0); 1149 return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0);
1170} 1150}
1171 1151
1172/* 1152/*
@@ -1232,7 +1212,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1232 struct gfs2_inode *ip = GFS2_I(odentry->d_inode); 1212 struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
1233 struct gfs2_inode *nip = NULL; 1213 struct gfs2_inode *nip = NULL;
1234 struct gfs2_sbd *sdp = GFS2_SB(odir); 1214 struct gfs2_sbd *sdp = GFS2_SB(odir);
1235 struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh; 1215 struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
1236 struct gfs2_rgrpd *nrgd; 1216 struct gfs2_rgrpd *nrgd;
1237 unsigned int num_gh; 1217 unsigned int num_gh;
1238 int dir_rename = 0; 1218 int dir_rename = 0;
@@ -1246,10 +1226,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1246 return 0; 1226 return 0;
1247 } 1227 }
1248 1228
1249 error = gfs2_rindex_hold(sdp, &ri_gh);
1250 if (error)
1251 return error;
1252
1253 if (odip != ndip) { 1229 if (odip != ndip) {
1254 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 1230 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
1255 0, &r_gh); 1231 0, &r_gh);
@@ -1386,12 +1362,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1386 1362
1387 al->al_requested = sdp->sd_max_dirres; 1363 al->al_requested = sdp->sd_max_dirres;
1388 1364
1389 error = gfs2_inplace_reserve_ri(ndip); 1365 error = gfs2_inplace_reserve(ndip);
1390 if (error) 1366 if (error)
1391 goto out_gunlock_q; 1367 goto out_gunlock_q;
1392 1368
1393 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 1369 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
1394 gfs2_rg_blocks(al) + 1370 gfs2_rg_blocks(ndip) +
1395 4 * RES_DINODE + 4 * RES_LEAF + 1371 4 * RES_DINODE + 4 * RES_LEAF +
1396 RES_STATFS + RES_QUOTA + 4, 0); 1372 RES_STATFS + RES_QUOTA + 4, 0);
1397 if (error) 1373 if (error)
@@ -1457,7 +1433,6 @@ out_gunlock_r:
1457 if (r_gh.gh_gl) 1433 if (r_gh.gh_gl)
1458 gfs2_glock_dq_uninit(&r_gh); 1434 gfs2_glock_dq_uninit(&r_gh);
1459out: 1435out:
1460 gfs2_glock_dq_uninit(&ri_gh);
1461 return error; 1436 return error;
1462} 1437}
1463 1438
@@ -1561,21 +1536,10 @@ int gfs2_permission(struct inode *inode, int mask)
1561 return error; 1536 return error;
1562} 1537}
1563 1538
1564static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) 1539static int __gfs2_setattr_simple(struct inode *inode, struct iattr *attr)
1565{ 1540{
1566 struct inode *inode = &ip->i_inode;
1567 struct buffer_head *dibh;
1568 int error;
1569
1570 error = gfs2_meta_inode_buffer(ip, &dibh);
1571 if (error)
1572 return error;
1573
1574 setattr_copy(inode, attr); 1541 setattr_copy(inode, attr);
1575 mark_inode_dirty(inode); 1542 mark_inode_dirty(inode);
1576 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1577 gfs2_dinode_out(ip, dibh->b_data);
1578 brelse(dibh);
1579 return 0; 1543 return 0;
1580} 1544}
1581 1545
@@ -1587,19 +1551,19 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1587 * Returns: errno 1551 * Returns: errno
1588 */ 1552 */
1589 1553
1590int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) 1554int gfs2_setattr_simple(struct inode *inode, struct iattr *attr)
1591{ 1555{
1592 int error; 1556 int error;
1593 1557
1594 if (current->journal_info) 1558 if (current->journal_info)
1595 return __gfs2_setattr_simple(ip, attr); 1559 return __gfs2_setattr_simple(inode, attr);
1596 1560
1597 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0); 1561 error = gfs2_trans_begin(GFS2_SB(inode), RES_DINODE, 0);
1598 if (error) 1562 if (error)
1599 return error; 1563 return error;
1600 1564
1601 error = __gfs2_setattr_simple(ip, attr); 1565 error = __gfs2_setattr_simple(inode, attr);
1602 gfs2_trans_end(GFS2_SB(&ip->i_inode)); 1566 gfs2_trans_end(GFS2_SB(inode));
1603 return error; 1567 return error;
1604} 1568}
1605 1569
@@ -1637,7 +1601,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
1637 if (error) 1601 if (error)
1638 goto out_gunlock_q; 1602 goto out_gunlock_q;
1639 1603
1640 error = gfs2_setattr_simple(ip, attr); 1604 error = gfs2_setattr_simple(inode, attr);
1641 if (error) 1605 if (error)
1642 goto out_end_trans; 1606 goto out_end_trans;
1643 1607
@@ -1693,12 +1657,12 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
1693 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) 1657 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
1694 error = gfs2_acl_chmod(ip, attr); 1658 error = gfs2_acl_chmod(ip, attr);
1695 else 1659 else
1696 error = gfs2_setattr_simple(ip, attr); 1660 error = gfs2_setattr_simple(inode, attr);
1697 1661
1698out: 1662out:
1699 gfs2_glock_dq_uninit(&i_gh);
1700 if (!error) 1663 if (!error)
1701 mark_inode_dirty(inode); 1664 mark_inode_dirty(inode);
1665 gfs2_glock_dq_uninit(&i_gh);
1702 return error; 1666 return error;
1703} 1667}
1704 1668
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 8d90e0c0767..276e7b52b65 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -109,7 +109,7 @@ extern int gfs2_inode_refresh(struct gfs2_inode *ip);
109extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, 109extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
110 int is_root); 110 int is_root);
111extern int gfs2_permission(struct inode *inode, int mask); 111extern int gfs2_permission(struct inode *inode, int mask);
112extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); 112extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr);
113extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); 113extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
114extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); 114extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
115 115
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 05bbb124699..0301be655b1 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -60,6 +60,29 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
60 trace_gfs2_pin(bd, 1); 60 trace_gfs2_pin(bd, 1);
61} 61}
62 62
63static bool buffer_is_rgrp(const struct gfs2_bufdata *bd)
64{
65 return bd->bd_gl->gl_name.ln_type == LM_TYPE_RGRP;
66}
67
68static void maybe_release_space(struct gfs2_bufdata *bd)
69{
70 struct gfs2_glock *gl = bd->bd_gl;
71 struct gfs2_sbd *sdp = gl->gl_sbd;
72 struct gfs2_rgrpd *rgd = gl->gl_object;
73 unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number;
74 struct gfs2_bitmap *bi = rgd->rd_bits + index;
75
76 if (bi->bi_clone == 0)
77 return;
78 if (sdp->sd_args.ar_discard)
79 gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi);
80 memcpy(bi->bi_clone + bi->bi_offset,
81 bd->bd_bh->b_data + bi->bi_offset, bi->bi_len);
82 clear_bit(GBF_FULL, &bi->bi_flags);
83 rgd->rd_free_clone = rgd->rd_free;
84}
85
63/** 86/**
64 * gfs2_unpin - Unpin a buffer 87 * gfs2_unpin - Unpin a buffer
65 * @sdp: the filesystem the buffer belongs to 88 * @sdp: the filesystem the buffer belongs to
@@ -81,6 +104,9 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
81 mark_buffer_dirty(bh); 104 mark_buffer_dirty(bh);
82 clear_buffer_pinned(bh); 105 clear_buffer_pinned(bh);
83 106
107 if (buffer_is_rgrp(bd))
108 maybe_release_space(bd);
109
84 spin_lock(&sdp->sd_ail_lock); 110 spin_lock(&sdp->sd_ail_lock);
85 if (bd->bd_ail) { 111 if (bd->bd_ail) {
86 list_del(&bd->bd_ail_st_list); 112 list_del(&bd->bd_ail_st_list);
@@ -469,42 +495,6 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
469 gfs2_revoke_clean(sdp); 495 gfs2_revoke_clean(sdp);
470} 496}
471 497
472static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
473{
474 struct gfs2_rgrpd *rgd;
475 struct gfs2_trans *tr = current->journal_info;
476
477 tr->tr_touched = 1;
478
479 rgd = container_of(le, struct gfs2_rgrpd, rd_le);
480
481 gfs2_log_lock(sdp);
482 if (!list_empty(&le->le_list)){
483 gfs2_log_unlock(sdp);
484 return;
485 }
486 gfs2_rgrp_bh_hold(rgd);
487 sdp->sd_log_num_rg++;
488 list_add(&le->le_list, &sdp->sd_log_le_rg);
489 gfs2_log_unlock(sdp);
490}
491
492static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
493{
494 struct list_head *head = &sdp->sd_log_le_rg;
495 struct gfs2_rgrpd *rgd;
496
497 while (!list_empty(head)) {
498 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list);
499 list_del_init(&rgd->rd_le.le_list);
500 sdp->sd_log_num_rg--;
501
502 gfs2_rgrp_repolish_clones(rgd);
503 gfs2_rgrp_bh_put(rgd);
504 }
505 gfs2_assert_warn(sdp, !sdp->sd_log_num_rg);
506}
507
508/** 498/**
509 * databuf_lo_add - Add a databuf to the transaction. 499 * databuf_lo_add - Add a databuf to the transaction.
510 * 500 *
@@ -705,8 +695,6 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
705 695
706 brelse(bh_log); 696 brelse(bh_log);
707 brelse(bh_ip); 697 brelse(bh_ip);
708 if (error)
709 break;
710 698
711 sdp->sd_replayed_blocks++; 699 sdp->sd_replayed_blocks++;
712 } 700 }
@@ -771,8 +759,6 @@ const struct gfs2_log_operations gfs2_revoke_lops = {
771}; 759};
772 760
773const struct gfs2_log_operations gfs2_rg_lops = { 761const struct gfs2_log_operations gfs2_rg_lops = {
774 .lo_add = rg_lo_add,
775 .lo_after_commit = rg_lo_after_commit,
776 .lo_name = "rg", 762 .lo_name = "rg",
777}; 763};
778 764
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 079587e5384..cb23c2be731 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -14,6 +14,7 @@
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/kthread.h> 16#include <linux/kthread.h>
17#include <linux/export.h>
17#include <linux/namei.h> 18#include <linux/namei.h>
18#include <linux/mount.h> 19#include <linux/mount.h>
19#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
@@ -77,8 +78,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
77 78
78 spin_lock_init(&sdp->sd_rindex_spin); 79 spin_lock_init(&sdp->sd_rindex_spin);
79 mutex_init(&sdp->sd_rindex_mutex); 80 mutex_init(&sdp->sd_rindex_mutex);
80 INIT_LIST_HEAD(&sdp->sd_rindex_list); 81 sdp->sd_rindex_tree.rb_node = NULL;
81 INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
82 82
83 INIT_LIST_HEAD(&sdp->sd_jindex_list); 83 INIT_LIST_HEAD(&sdp->sd_jindex_list);
84 spin_lock_init(&sdp->sd_jindex_spin); 84 spin_lock_init(&sdp->sd_jindex_spin);
@@ -652,7 +652,6 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
652 fs_err(sdp, "can't lookup journal index: %d\n", error); 652 fs_err(sdp, "can't lookup journal index: %d\n", error);
653 return PTR_ERR(sdp->sd_jindex); 653 return PTR_ERR(sdp->sd_jindex);
654 } 654 }
655 ip = GFS2_I(sdp->sd_jindex);
656 655
657 /* Load in the journal index special file */ 656 /* Load in the journal index special file */
658 657
@@ -764,7 +763,6 @@ fail:
764static int init_inodes(struct gfs2_sbd *sdp, int undo) 763static int init_inodes(struct gfs2_sbd *sdp, int undo)
765{ 764{
766 int error = 0; 765 int error = 0;
767 struct gfs2_inode *ip;
768 struct inode *master = sdp->sd_master_dir->d_inode; 766 struct inode *master = sdp->sd_master_dir->d_inode;
769 767
770 if (undo) 768 if (undo)
@@ -789,7 +787,6 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
789 fs_err(sdp, "can't get resource index inode: %d\n", error); 787 fs_err(sdp, "can't get resource index inode: %d\n", error);
790 goto fail_statfs; 788 goto fail_statfs;
791 } 789 }
792 ip = GFS2_I(sdp->sd_rindex);
793 sdp->sd_rindex_uptodate = 0; 790 sdp->sd_rindex_uptodate = 0;
794 791
795 /* Read in the quota inode */ 792 /* Read in the quota inode */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 0e8bb13381e..7e528dc14f8 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -638,15 +638,18 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
638 unsigned long index = loc >> PAGE_CACHE_SHIFT; 638 unsigned long index = loc >> PAGE_CACHE_SHIFT;
639 unsigned offset = loc & (PAGE_CACHE_SIZE - 1); 639 unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
640 unsigned blocksize, iblock, pos; 640 unsigned blocksize, iblock, pos;
641 struct buffer_head *bh, *dibh; 641 struct buffer_head *bh;
642 struct page *page; 642 struct page *page;
643 void *kaddr, *ptr; 643 void *kaddr, *ptr;
644 struct gfs2_quota q, *qp; 644 struct gfs2_quota q, *qp;
645 int err, nbytes; 645 int err, nbytes;
646 u64 size; 646 u64 size;
647 647
648 if (gfs2_is_stuffed(ip)) 648 if (gfs2_is_stuffed(ip)) {
649 gfs2_unstuff_dinode(ip, NULL); 649 err = gfs2_unstuff_dinode(ip, NULL);
650 if (err)
651 return err;
652 }
650 653
651 memset(&q, 0, sizeof(struct gfs2_quota)); 654 memset(&q, 0, sizeof(struct gfs2_quota));
652 err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q)); 655 err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q));
@@ -736,22 +739,13 @@ get_a_page:
736 goto get_a_page; 739 goto get_a_page;
737 } 740 }
738 741
739 /* Update the disk inode timestamp and size (if extended) */
740 err = gfs2_meta_inode_buffer(ip, &dibh);
741 if (err)
742 goto out;
743
744 size = loc + sizeof(struct gfs2_quota); 742 size = loc + sizeof(struct gfs2_quota);
745 if (size > inode->i_size) 743 if (size > inode->i_size)
746 i_size_write(inode, size); 744 i_size_write(inode, size);
747 inode->i_mtime = inode->i_atime = CURRENT_TIME; 745 inode->i_mtime = inode->i_atime = CURRENT_TIME;
748 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
749 gfs2_dinode_out(ip, dibh->b_data);
750 brelse(dibh);
751 mark_inode_dirty(inode); 746 mark_inode_dirty(inode);
752
753out:
754 return err; 747 return err;
748
755unlock_out: 749unlock_out:
756 unlock_page(page); 750 unlock_page(page);
757 page_cache_release(page); 751 page_cache_release(page);
@@ -822,7 +816,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
822 goto out_alloc; 816 goto out_alloc;
823 817
824 if (nalloc) 818 if (nalloc)
825 blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS; 819 blocks += gfs2_rg_blocks(ip) + nalloc * ind_blocks + RES_STATFS;
826 820
827 error = gfs2_trans_begin(sdp, blocks, 0); 821 error = gfs2_trans_begin(sdp, blocks, 0);
828 if (error) 822 if (error)
@@ -936,7 +930,9 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
936 unsigned int x; 930 unsigned int x;
937 int error = 0; 931 int error = 0;
938 932
939 gfs2_quota_hold(ip, uid, gid); 933 error = gfs2_quota_hold(ip, uid, gid);
934 if (error)
935 return error;
940 936
941 if (capable(CAP_SYS_RESOURCE) || 937 if (capable(CAP_SYS_RESOURCE) ||
942 sdp->sd_args.ar_quota != GFS2_QUOTA_ON) 938 sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
@@ -1607,7 +1603,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1607 error = gfs2_inplace_reserve(ip); 1603 error = gfs2_inplace_reserve(ip);
1608 if (error) 1604 if (error)
1609 goto out_alloc; 1605 goto out_alloc;
1610 blocks += gfs2_rg_blocks(al); 1606 blocks += gfs2_rg_blocks(ip);
1611 } 1607 }
1612 1608
1613 /* Some quotas span block boundaries and can update two blocks, 1609 /* Some quotas span block boundaries and can update two blocks,
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 7f8af1eb02d..96bd6d759f2 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -15,6 +15,7 @@
15#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
16#include <linux/prefetch.h> 16#include <linux/prefetch.h>
17#include <linux/blkdev.h> 17#include <linux/blkdev.h>
18#include <linux/rbtree.h>
18 19
19#include "gfs2.h" 20#include "gfs2.h"
20#include "incore.h" 21#include "incore.h"
@@ -328,18 +329,22 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
328 329
329struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) 330struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk)
330{ 331{
331 struct gfs2_rgrpd *rgd; 332 struct rb_node **newn;
333 struct gfs2_rgrpd *cur;
332 334
333 spin_lock(&sdp->sd_rindex_spin); 335 spin_lock(&sdp->sd_rindex_spin);
334 336 newn = &sdp->sd_rindex_tree.rb_node;
335 list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) { 337 while (*newn) {
336 if (rgrp_contains_block(rgd, blk)) { 338 cur = rb_entry(*newn, struct gfs2_rgrpd, rd_node);
337 list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); 339 if (blk < cur->rd_addr)
340 newn = &((*newn)->rb_left);
341 else if (blk >= cur->rd_data0 + cur->rd_data)
342 newn = &((*newn)->rb_right);
343 else {
338 spin_unlock(&sdp->sd_rindex_spin); 344 spin_unlock(&sdp->sd_rindex_spin);
339 return rgd; 345 return cur;
340 } 346 }
341 } 347 }
342
343 spin_unlock(&sdp->sd_rindex_spin); 348 spin_unlock(&sdp->sd_rindex_spin);
344 349
345 return NULL; 350 return NULL;
@@ -354,8 +359,15 @@ struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk)
354 359
355struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) 360struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
356{ 361{
357 gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list)); 362 const struct rb_node *n;
358 return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list); 363 struct gfs2_rgrpd *rgd;
364
365 spin_lock(&sdp->sd_rindex_spin);
366 n = rb_first(&sdp->sd_rindex_tree);
367 rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
368 spin_unlock(&sdp->sd_rindex_spin);
369
370 return rgd;
359} 371}
360 372
361/** 373/**
@@ -367,47 +379,60 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
367 379
368struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd) 380struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
369{ 381{
370 if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list) 382 struct gfs2_sbd *sdp = rgd->rd_sbd;
383 const struct rb_node *n;
384
385 spin_lock(&sdp->sd_rindex_spin);
386 n = rb_next(&rgd->rd_node);
387 if (n == NULL)
388 n = rb_first(&sdp->sd_rindex_tree);
389
390 if (unlikely(&rgd->rd_node == n)) {
391 spin_unlock(&sdp->sd_rindex_spin);
371 return NULL; 392 return NULL;
372 return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list); 393 }
394 rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
395 spin_unlock(&sdp->sd_rindex_spin);
396 return rgd;
373} 397}
374 398
375static void clear_rgrpdi(struct gfs2_sbd *sdp) 399void gfs2_free_clones(struct gfs2_rgrpd *rgd)
376{ 400{
377 struct list_head *head; 401 int x;
402
403 for (x = 0; x < rgd->rd_length; x++) {
404 struct gfs2_bitmap *bi = rgd->rd_bits + x;
405 kfree(bi->bi_clone);
406 bi->bi_clone = NULL;
407 }
408}
409
410void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
411{
412 struct rb_node *n;
378 struct gfs2_rgrpd *rgd; 413 struct gfs2_rgrpd *rgd;
379 struct gfs2_glock *gl; 414 struct gfs2_glock *gl;
380 415
381 spin_lock(&sdp->sd_rindex_spin); 416 while ((n = rb_first(&sdp->sd_rindex_tree))) {
382 sdp->sd_rindex_forward = NULL; 417 rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
383 spin_unlock(&sdp->sd_rindex_spin);
384
385 head = &sdp->sd_rindex_list;
386 while (!list_empty(head)) {
387 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list);
388 gl = rgd->rd_gl; 418 gl = rgd->rd_gl;
389 419
390 list_del(&rgd->rd_list); 420 rb_erase(n, &sdp->sd_rindex_tree);
391 list_del(&rgd->rd_list_mru);
392 421
393 if (gl) { 422 if (gl) {
423 spin_lock(&gl->gl_spin);
394 gl->gl_object = NULL; 424 gl->gl_object = NULL;
425 spin_unlock(&gl->gl_spin);
395 gfs2_glock_add_to_lru(gl); 426 gfs2_glock_add_to_lru(gl);
396 gfs2_glock_put(gl); 427 gfs2_glock_put(gl);
397 } 428 }
398 429
430 gfs2_free_clones(rgd);
399 kfree(rgd->rd_bits); 431 kfree(rgd->rd_bits);
400 kmem_cache_free(gfs2_rgrpd_cachep, rgd); 432 kmem_cache_free(gfs2_rgrpd_cachep, rgd);
401 } 433 }
402} 434}
403 435
404void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
405{
406 mutex_lock(&sdp->sd_rindex_mutex);
407 clear_rgrpdi(sdp);
408 mutex_unlock(&sdp->sd_rindex_mutex);
409}
410
411static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd) 436static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
412{ 437{
413 printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); 438 printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
@@ -524,22 +549,34 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
524 return total_data; 549 return total_data;
525} 550}
526 551
527static void gfs2_rindex_in(struct gfs2_rgrpd *rgd, const void *buf) 552static void rgd_insert(struct gfs2_rgrpd *rgd)
528{ 553{
529 const struct gfs2_rindex *str = buf; 554 struct gfs2_sbd *sdp = rgd->rd_sbd;
555 struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;
556
557 /* Figure out where to put new node */
558 while (*newn) {
559 struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
560 rd_node);
561
562 parent = *newn;
563 if (rgd->rd_addr < cur->rd_addr)
564 newn = &((*newn)->rb_left);
565 else if (rgd->rd_addr > cur->rd_addr)
566 newn = &((*newn)->rb_right);
567 else
568 return;
569 }
530 570
531 rgd->rd_addr = be64_to_cpu(str->ri_addr); 571 rb_link_node(&rgd->rd_node, parent, newn);
532 rgd->rd_length = be32_to_cpu(str->ri_length); 572 rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
533 rgd->rd_data0 = be64_to_cpu(str->ri_data0);
534 rgd->rd_data = be32_to_cpu(str->ri_data);
535 rgd->rd_bitbytes = be32_to_cpu(str->ri_bitbytes);
536} 573}
537 574
538/** 575/**
539 * read_rindex_entry - Pull in a new resource index entry from the disk 576 * read_rindex_entry - Pull in a new resource index entry from the disk
540 * @gl: The glock covering the rindex inode 577 * @gl: The glock covering the rindex inode
541 * 578 *
542 * Returns: 0 on success, error code otherwise 579 * Returns: 0 on success, > 0 on EOF, error code otherwise
543 */ 580 */
544 581
545static int read_rindex_entry(struct gfs2_inode *ip, 582static int read_rindex_entry(struct gfs2_inode *ip,
@@ -547,44 +584,53 @@ static int read_rindex_entry(struct gfs2_inode *ip,
547{ 584{
548 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 585 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
549 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); 586 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
550 char buf[sizeof(struct gfs2_rindex)]; 587 struct gfs2_rindex buf;
551 int error; 588 int error;
552 struct gfs2_rgrpd *rgd; 589 struct gfs2_rgrpd *rgd;
553 590
554 error = gfs2_internal_read(ip, ra_state, buf, &pos, 591 if (pos >= i_size_read(&ip->i_inode))
592 return 1;
593
594 error = gfs2_internal_read(ip, ra_state, (char *)&buf, &pos,
555 sizeof(struct gfs2_rindex)); 595 sizeof(struct gfs2_rindex));
556 if (!error) 596
557 return 0; 597 if (error != sizeof(struct gfs2_rindex))
558 if (error != sizeof(struct gfs2_rindex)) { 598 return (error == 0) ? 1 : error;
559 if (error > 0)
560 error = -EIO;
561 return error;
562 }
563 599
564 rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS); 600 rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
565 error = -ENOMEM; 601 error = -ENOMEM;
566 if (!rgd) 602 if (!rgd)
567 return error; 603 return error;
568 604
569 mutex_init(&rgd->rd_mutex);
570 lops_init_le(&rgd->rd_le, &gfs2_rg_lops);
571 rgd->rd_sbd = sdp; 605 rgd->rd_sbd = sdp;
606 rgd->rd_addr = be64_to_cpu(buf.ri_addr);
607 rgd->rd_length = be32_to_cpu(buf.ri_length);
608 rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
609 rgd->rd_data = be32_to_cpu(buf.ri_data);
610 rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
572 611
573 list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list);
574 list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
575
576 gfs2_rindex_in(rgd, buf);
577 error = compute_bitstructs(rgd); 612 error = compute_bitstructs(rgd);
578 if (error) 613 if (error)
579 return error; 614 goto fail;
580 615
581 error = gfs2_glock_get(sdp, rgd->rd_addr, 616 error = gfs2_glock_get(sdp, rgd->rd_addr,
582 &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); 617 &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
583 if (error) 618 if (error)
584 return error; 619 goto fail;
585 620
586 rgd->rd_gl->gl_object = rgd; 621 rgd->rd_gl->gl_object = rgd;
587 rgd->rd_flags &= ~GFS2_RDF_UPTODATE; 622 rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
623 if (rgd->rd_data > sdp->sd_max_rg_data)
624 sdp->sd_max_rg_data = rgd->rd_data;
625 spin_lock(&sdp->sd_rindex_spin);
626 rgd_insert(rgd);
627 sdp->sd_rgrps++;
628 spin_unlock(&sdp->sd_rindex_spin);
629 return error;
630
631fail:
632 kfree(rgd->rd_bits);
633 kmem_cache_free(gfs2_rgrpd_cachep, rgd);
588 return error; 634 return error;
589} 635}
590 636
@@ -595,40 +641,28 @@ static int read_rindex_entry(struct gfs2_inode *ip,
595 * Returns: 0 on successful update, error code otherwise 641 * Returns: 0 on successful update, error code otherwise
596 */ 642 */
597 643
598int gfs2_ri_update(struct gfs2_inode *ip) 644static int gfs2_ri_update(struct gfs2_inode *ip)
599{ 645{
600 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 646 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
601 struct inode *inode = &ip->i_inode; 647 struct inode *inode = &ip->i_inode;
602 struct file_ra_state ra_state; 648 struct file_ra_state ra_state;
603 u64 rgrp_count = i_size_read(inode);
604 struct gfs2_rgrpd *rgd;
605 unsigned int max_data = 0;
606 int error; 649 int error;
607 650
608 do_div(rgrp_count, sizeof(struct gfs2_rindex));
609 clear_rgrpdi(sdp);
610
611 file_ra_state_init(&ra_state, inode->i_mapping); 651 file_ra_state_init(&ra_state, inode->i_mapping);
612 for (sdp->sd_rgrps = 0; sdp->sd_rgrps < rgrp_count; sdp->sd_rgrps++) { 652 do {
613 error = read_rindex_entry(ip, &ra_state); 653 error = read_rindex_entry(ip, &ra_state);
614 if (error) { 654 } while (error == 0);
615 clear_rgrpdi(sdp); 655
616 return error; 656 if (error < 0)
617 } 657 return error;
618 }
619 658
620 list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
621 if (rgd->rd_data > max_data)
622 max_data = rgd->rd_data;
623 sdp->sd_max_rg_data = max_data;
624 sdp->sd_rindex_uptodate = 1; 659 sdp->sd_rindex_uptodate = 1;
625 return 0; 660 return 0;
626} 661}
627 662
628/** 663/**
629 * gfs2_rindex_hold - Grab a lock on the rindex 664 * gfs2_rindex_update - Update the rindex if required
630 * @sdp: The GFS2 superblock 665 * @sdp: The GFS2 superblock
631 * @ri_gh: the glock holder
632 * 666 *
633 * We grab a lock on the rindex inode to make sure that it doesn't 667 * We grab a lock on the rindex inode to make sure that it doesn't
634 * change whilst we are performing an operation. We keep this lock 668 * change whilst we are performing an operation. We keep this lock
@@ -640,30 +674,29 @@ int gfs2_ri_update(struct gfs2_inode *ip)
640 * special file, which might have been updated if someone expanded the 674 * special file, which might have been updated if someone expanded the
641 * filesystem (via gfs2_grow utility), which adds new resource groups. 675 * filesystem (via gfs2_grow utility), which adds new resource groups.
642 * 676 *
643 * Returns: 0 on success, error code otherwise 677 * Returns: 0 on succeess, error code otherwise
644 */ 678 */
645 679
646int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh) 680int gfs2_rindex_update(struct gfs2_sbd *sdp)
647{ 681{
648 struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex); 682 struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
649 struct gfs2_glock *gl = ip->i_gl; 683 struct gfs2_glock *gl = ip->i_gl;
650 int error; 684 struct gfs2_holder ri_gh;
651 685 int error = 0;
652 error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh);
653 if (error)
654 return error;
655 686
656 /* Read new copy from disk if we don't have the latest */ 687 /* Read new copy from disk if we don't have the latest */
657 if (!sdp->sd_rindex_uptodate) { 688 if (!sdp->sd_rindex_uptodate) {
658 mutex_lock(&sdp->sd_rindex_mutex); 689 mutex_lock(&sdp->sd_rindex_mutex);
659 if (!sdp->sd_rindex_uptodate) { 690 error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
691 if (error)
692 return error;
693 if (!sdp->sd_rindex_uptodate)
660 error = gfs2_ri_update(ip); 694 error = gfs2_ri_update(ip);
661 if (error) 695 gfs2_glock_dq_uninit(&ri_gh);
662 gfs2_glock_dq_uninit(ri_gh);
663 }
664 mutex_unlock(&sdp->sd_rindex_mutex); 696 mutex_unlock(&sdp->sd_rindex_mutex);
665 } 697 }
666 698
699
667 return error; 700 return error;
668} 701}
669 702
@@ -694,7 +727,7 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
694} 727}
695 728
696/** 729/**
697 * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps 730 * gfs2_rgrp_go_lock - Read in a RG's header and bitmaps
698 * @rgd: the struct gfs2_rgrpd describing the RG to read in 731 * @rgd: the struct gfs2_rgrpd describing the RG to read in
699 * 732 *
700 * Read in all of a Resource Group's header and bitmap blocks. 733 * Read in all of a Resource Group's header and bitmap blocks.
@@ -703,8 +736,9 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
703 * Returns: errno 736 * Returns: errno
704 */ 737 */
705 738
706int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) 739int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
707{ 740{
741 struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
708 struct gfs2_sbd *sdp = rgd->rd_sbd; 742 struct gfs2_sbd *sdp = rgd->rd_sbd;
709 struct gfs2_glock *gl = rgd->rd_gl; 743 struct gfs2_glock *gl = rgd->rd_gl;
710 unsigned int length = rgd->rd_length; 744 unsigned int length = rgd->rd_length;
@@ -712,17 +746,6 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
712 unsigned int x, y; 746 unsigned int x, y;
713 int error; 747 int error;
714 748
715 mutex_lock(&rgd->rd_mutex);
716
717 spin_lock(&sdp->sd_rindex_spin);
718 if (rgd->rd_bh_count) {
719 rgd->rd_bh_count++;
720 spin_unlock(&sdp->sd_rindex_spin);
721 mutex_unlock(&rgd->rd_mutex);
722 return 0;
723 }
724 spin_unlock(&sdp->sd_rindex_spin);
725
726 for (x = 0; x < length; x++) { 749 for (x = 0; x < length; x++) {
727 bi = rgd->rd_bits + x; 750 bi = rgd->rd_bits + x;
728 error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh); 751 error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh);
@@ -747,15 +770,9 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
747 clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags); 770 clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
748 gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); 771 gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
749 rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); 772 rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
773 rgd->rd_free_clone = rgd->rd_free;
750 } 774 }
751 775
752 spin_lock(&sdp->sd_rindex_spin);
753 rgd->rd_free_clone = rgd->rd_free;
754 rgd->rd_bh_count++;
755 spin_unlock(&sdp->sd_rindex_spin);
756
757 mutex_unlock(&rgd->rd_mutex);
758
759 return 0; 776 return 0;
760 777
761fail: 778fail:
@@ -765,52 +782,32 @@ fail:
765 bi->bi_bh = NULL; 782 bi->bi_bh = NULL;
766 gfs2_assert_warn(sdp, !bi->bi_clone); 783 gfs2_assert_warn(sdp, !bi->bi_clone);
767 } 784 }
768 mutex_unlock(&rgd->rd_mutex);
769 785
770 return error; 786 return error;
771} 787}
772 788
773void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd)
774{
775 struct gfs2_sbd *sdp = rgd->rd_sbd;
776
777 spin_lock(&sdp->sd_rindex_spin);
778 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
779 rgd->rd_bh_count++;
780 spin_unlock(&sdp->sd_rindex_spin);
781}
782
783/** 789/**
784 * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get() 790 * gfs2_rgrp_go_unlock - Release RG bitmaps read in with gfs2_rgrp_bh_get()
785 * @rgd: the struct gfs2_rgrpd describing the RG to read in 791 * @rgd: the struct gfs2_rgrpd describing the RG to read in
786 * 792 *
787 */ 793 */
788 794
789void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd) 795void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
790{ 796{
791 struct gfs2_sbd *sdp = rgd->rd_sbd; 797 struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
792 int x, length = rgd->rd_length; 798 int x, length = rgd->rd_length;
793 799
794 spin_lock(&sdp->sd_rindex_spin);
795 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
796 if (--rgd->rd_bh_count) {
797 spin_unlock(&sdp->sd_rindex_spin);
798 return;
799 }
800
801 for (x = 0; x < length; x++) { 800 for (x = 0; x < length; x++) {
802 struct gfs2_bitmap *bi = rgd->rd_bits + x; 801 struct gfs2_bitmap *bi = rgd->rd_bits + x;
803 kfree(bi->bi_clone);
804 bi->bi_clone = NULL;
805 brelse(bi->bi_bh); 802 brelse(bi->bi_bh);
806 bi->bi_bh = NULL; 803 bi->bi_bh = NULL;
807 } 804 }
808 805
809 spin_unlock(&sdp->sd_rindex_spin);
810} 806}
811 807
812static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, 808void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
813 const struct gfs2_bitmap *bi) 809 struct buffer_head *bh,
810 const struct gfs2_bitmap *bi)
814{ 811{
815 struct super_block *sb = sdp->sd_vfs; 812 struct super_block *sb = sdp->sd_vfs;
816 struct block_device *bdev = sb->s_bdev; 813 struct block_device *bdev = sb->s_bdev;
@@ -823,7 +820,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
823 unsigned int x; 820 unsigned int x;
824 821
825 for (x = 0; x < bi->bi_len; x++) { 822 for (x = 0; x < bi->bi_len; x++) {
826 const u8 *orig = bi->bi_bh->b_data + bi->bi_offset + x; 823 const u8 *orig = bh->b_data + bi->bi_offset + x;
827 const u8 *clone = bi->bi_clone + bi->bi_offset + x; 824 const u8 *clone = bi->bi_clone + bi->bi_offset + x;
828 u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1)); 825 u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
829 diff &= 0x55; 826 diff &= 0x55;
@@ -862,28 +859,6 @@ fail:
862 sdp->sd_args.ar_discard = 0; 859 sdp->sd_args.ar_discard = 0;
863} 860}
864 861
865void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
866{
867 struct gfs2_sbd *sdp = rgd->rd_sbd;
868 unsigned int length = rgd->rd_length;
869 unsigned int x;
870
871 for (x = 0; x < length; x++) {
872 struct gfs2_bitmap *bi = rgd->rd_bits + x;
873 if (!bi->bi_clone)
874 continue;
875 if (sdp->sd_args.ar_discard)
876 gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi);
877 clear_bit(GBF_FULL, &bi->bi_flags);
878 memcpy(bi->bi_clone + bi->bi_offset,
879 bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
880 }
881
882 spin_lock(&sdp->sd_rindex_spin);
883 rgd->rd_free_clone = rgd->rd_free;
884 spin_unlock(&sdp->sd_rindex_spin);
885}
886
887/** 862/**
888 * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode 863 * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode
889 * @ip: the incore GFS2 inode structure 864 * @ip: the incore GFS2 inode structure
@@ -893,38 +868,35 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
893 868
894struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) 869struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
895{ 870{
871 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
872 int error;
896 BUG_ON(ip->i_alloc != NULL); 873 BUG_ON(ip->i_alloc != NULL);
897 ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS); 874 ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS);
875 error = gfs2_rindex_update(sdp);
876 if (error)
877 fs_warn(sdp, "rindex update returns %d\n", error);
898 return ip->i_alloc; 878 return ip->i_alloc;
899} 879}
900 880
901/** 881/**
902 * try_rgrp_fit - See if a given reservation will fit in a given RG 882 * try_rgrp_fit - See if a given reservation will fit in a given RG
903 * @rgd: the RG data 883 * @rgd: the RG data
904 * @al: the struct gfs2_alloc structure describing the reservation 884 * @ip: the inode
905 * 885 *
906 * If there's room for the requested blocks to be allocated from the RG: 886 * If there's room for the requested blocks to be allocated from the RG:
907 * Sets the $al_rgd field in @al.
908 * 887 *
909 * Returns: 1 on success (it fits), 0 on failure (it doesn't fit) 888 * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
910 */ 889 */
911 890
912static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) 891static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip)
913{ 892{
914 struct gfs2_sbd *sdp = rgd->rd_sbd; 893 const struct gfs2_alloc *al = ip->i_alloc;
915 int ret = 0;
916 894
917 if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) 895 if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
918 return 0; 896 return 0;
919 897 if (rgd->rd_free_clone >= al->al_requested)
920 spin_lock(&sdp->sd_rindex_spin); 898 return 1;
921 if (rgd->rd_free_clone >= al->al_requested) { 899 return 0;
922 al->al_rgd = rgd;
923 ret = 1;
924 }
925 spin_unlock(&sdp->sd_rindex_spin);
926
927 return ret;
928} 900}
929 901
930/** 902/**
@@ -992,76 +964,6 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
992} 964}
993 965
994/** 966/**
995 * recent_rgrp_next - get next RG from "recent" list
996 * @cur_rgd: current rgrp
997 *
998 * Returns: The next rgrp in the recent list
999 */
1000
1001static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd)
1002{
1003 struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
1004 struct list_head *head;
1005 struct gfs2_rgrpd *rgd;
1006
1007 spin_lock(&sdp->sd_rindex_spin);
1008 head = &sdp->sd_rindex_mru_list;
1009 if (unlikely(cur_rgd->rd_list_mru.next == head)) {
1010 spin_unlock(&sdp->sd_rindex_spin);
1011 return NULL;
1012 }
1013 rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru);
1014 spin_unlock(&sdp->sd_rindex_spin);
1015 return rgd;
1016}
1017
1018/**
1019 * forward_rgrp_get - get an rgrp to try next from full list
1020 * @sdp: The GFS2 superblock
1021 *
1022 * Returns: The rgrp to try next
1023 */
1024
1025static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp)
1026{
1027 struct gfs2_rgrpd *rgd;
1028 unsigned int journals = gfs2_jindex_size(sdp);
1029 unsigned int rg = 0, x;
1030
1031 spin_lock(&sdp->sd_rindex_spin);
1032
1033 rgd = sdp->sd_rindex_forward;
1034 if (!rgd) {
1035 if (sdp->sd_rgrps >= journals)
1036 rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals;
1037
1038 for (x = 0, rgd = gfs2_rgrpd_get_first(sdp); x < rg;
1039 x++, rgd = gfs2_rgrpd_get_next(rgd))
1040 /* Do Nothing */;
1041
1042 sdp->sd_rindex_forward = rgd;
1043 }
1044
1045 spin_unlock(&sdp->sd_rindex_spin);
1046
1047 return rgd;
1048}
1049
1050/**
1051 * forward_rgrp_set - set the forward rgrp pointer
1052 * @sdp: the filesystem
1053 * @rgd: The new forward rgrp
1054 *
1055 */
1056
1057static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
1058{
1059 spin_lock(&sdp->sd_rindex_spin);
1060 sdp->sd_rindex_forward = rgd;
1061 spin_unlock(&sdp->sd_rindex_spin);
1062}
1063
1064/**
1065 * get_local_rgrp - Choose and lock a rgrp for allocation 967 * get_local_rgrp - Choose and lock a rgrp for allocation
1066 * @ip: the inode to reserve space for 968 * @ip: the inode to reserve space for
1067 * @rgp: the chosen and locked rgrp 969 * @rgp: the chosen and locked rgrp
@@ -1076,14 +978,18 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1076 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 978 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1077 struct gfs2_rgrpd *rgd, *begin = NULL; 979 struct gfs2_rgrpd *rgd, *begin = NULL;
1078 struct gfs2_alloc *al = ip->i_alloc; 980 struct gfs2_alloc *al = ip->i_alloc;
1079 int flags = LM_FLAG_TRY;
1080 int skipped = 0;
1081 int loops = 0;
1082 int error, rg_locked; 981 int error, rg_locked;
982 int loops = 0;
983
984 if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal))
985 rgd = begin = ip->i_rgd;
986 else
987 rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal);
1083 988
1084 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); 989 if (rgd == NULL)
990 return -EBADSLT;
1085 991
1086 while (rgd) { 992 while (loops < 3) {
1087 rg_locked = 0; 993 rg_locked = 0;
1088 994
1089 if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { 995 if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
@@ -1095,92 +1001,36 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1095 } 1001 }
1096 switch (error) { 1002 switch (error) {
1097 case 0: 1003 case 0:
1098 if (try_rgrp_fit(rgd, al)) 1004 if (try_rgrp_fit(rgd, ip)) {
1099 goto out; 1005 ip->i_rgd = rgd;
1006 return 0;
1007 }
1100 if (rgd->rd_flags & GFS2_RDF_CHECK) 1008 if (rgd->rd_flags & GFS2_RDF_CHECK)
1101 try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); 1009 try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
1102 if (!rg_locked) 1010 if (!rg_locked)
1103 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1011 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1104 /* fall through */ 1012 /* fall through */
1105 case GLR_TRYFAILED: 1013 case GLR_TRYFAILED:
1106 rgd = recent_rgrp_next(rgd); 1014 rgd = gfs2_rgrpd_get_next(rgd);
1107 break; 1015 if (rgd == begin)
1108 1016 loops++;
1109 default:
1110 return error;
1111 }
1112 }
1113
1114 /* Go through full list of rgrps */
1115
1116 begin = rgd = forward_rgrp_get(sdp);
1117
1118 for (;;) {
1119 rg_locked = 0;
1120
1121 if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
1122 rg_locked = 1;
1123 error = 0;
1124 } else {
1125 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags,
1126 &al->al_rgd_gh);
1127 }
1128 switch (error) {
1129 case 0:
1130 if (try_rgrp_fit(rgd, al))
1131 goto out;
1132 if (rgd->rd_flags & GFS2_RDF_CHECK)
1133 try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
1134 if (!rg_locked)
1135 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1136 break;
1137
1138 case GLR_TRYFAILED:
1139 skipped++;
1140 break; 1017 break;
1141
1142 default: 1018 default:
1143 return error; 1019 return error;
1144 } 1020 }
1145
1146 rgd = gfs2_rgrpd_get_next(rgd);
1147 if (!rgd)
1148 rgd = gfs2_rgrpd_get_first(sdp);
1149
1150 if (rgd == begin) {
1151 if (++loops >= 3)
1152 return -ENOSPC;
1153 if (!skipped)
1154 loops++;
1155 flags = 0;
1156 if (loops == 2)
1157 gfs2_log_flush(sdp, NULL);
1158 }
1159 } 1021 }
1160 1022
1161out: 1023 return -ENOSPC;
1162 if (begin) {
1163 spin_lock(&sdp->sd_rindex_spin);
1164 list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
1165 spin_unlock(&sdp->sd_rindex_spin);
1166 rgd = gfs2_rgrpd_get_next(rgd);
1167 if (!rgd)
1168 rgd = gfs2_rgrpd_get_first(sdp);
1169 forward_rgrp_set(sdp, rgd);
1170 }
1171
1172 return 0;
1173} 1024}
1174 1025
1175/** 1026/**
1176 * gfs2_inplace_reserve_i - Reserve space in the filesystem 1027 * gfs2_inplace_reserve - Reserve space in the filesystem
1177 * @ip: the inode to reserve space for 1028 * @ip: the inode to reserve space for
1178 * 1029 *
1179 * Returns: errno 1030 * Returns: errno
1180 */ 1031 */
1181 1032
1182int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, 1033int gfs2_inplace_reserve(struct gfs2_inode *ip)
1183 char *file, unsigned int line)
1184{ 1034{
1185 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1035 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1186 struct gfs2_alloc *al = ip->i_alloc; 1036 struct gfs2_alloc *al = ip->i_alloc;
@@ -1191,45 +1041,22 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
1191 if (gfs2_assert_warn(sdp, al->al_requested)) 1041 if (gfs2_assert_warn(sdp, al->al_requested))
1192 return -EINVAL; 1042 return -EINVAL;
1193 1043
1194 if (hold_rindex) {
1195 /* We need to hold the rindex unless the inode we're using is
1196 the rindex itself, in which case it's already held. */
1197 if (ip != GFS2_I(sdp->sd_rindex))
1198 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
1199 else if (!sdp->sd_rgrps) /* We may not have the rindex read
1200 in, so: */
1201 error = gfs2_ri_update(ip);
1202 if (error)
1203 return error;
1204 }
1205
1206try_again:
1207 do { 1044 do {
1208 error = get_local_rgrp(ip, &last_unlinked); 1045 error = get_local_rgrp(ip, &last_unlinked);
1209 /* If there is no space, flushing the log may release some */ 1046 if (error != -ENOSPC)
1210 if (error) { 1047 break;
1211 if (ip == GFS2_I(sdp->sd_rindex) && 1048 /* Check that fs hasn't grown if writing to rindex */
1212 !sdp->sd_rindex_uptodate) { 1049 if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
1213 error = gfs2_ri_update(ip); 1050 error = gfs2_ri_update(ip);
1214 if (error) 1051 if (error)
1215 return error; 1052 break;
1216 goto try_again; 1053 continue;
1217 }
1218 gfs2_log_flush(sdp, NULL);
1219 } 1054 }
1220 } while (error && tries++ < 3); 1055 /* Flushing the log may release space */
1221 1056 gfs2_log_flush(sdp, NULL);
1222 if (error) { 1057 } while (tries++ < 3);
1223 if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
1224 gfs2_glock_dq_uninit(&al->al_ri_gh);
1225 return error;
1226 }
1227
1228 /* no error, so we have the rgrp set in the inode's allocation. */
1229 al->al_file = file;
1230 al->al_line = line;
1231 1058
1232 return 0; 1059 return error;
1233} 1060}
1234 1061
1235/** 1062/**
@@ -1241,20 +1068,10 @@ try_again:
1241 1068
1242void gfs2_inplace_release(struct gfs2_inode *ip) 1069void gfs2_inplace_release(struct gfs2_inode *ip)
1243{ 1070{
1244 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1245 struct gfs2_alloc *al = ip->i_alloc; 1071 struct gfs2_alloc *al = ip->i_alloc;
1246 1072
1247 if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1)
1248 fs_warn(sdp, "al_alloced = %u, al_requested = %u "
1249 "al_file = %s, al_line = %u\n",
1250 al->al_alloced, al->al_requested, al->al_file,
1251 al->al_line);
1252
1253 al->al_rgd = NULL;
1254 if (al->al_rgd_gh.gh_gl) 1073 if (al->al_rgd_gh.gh_gl)
1255 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1074 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1256 if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl)
1257 gfs2_glock_dq_uninit(&al->al_ri_gh);
1258} 1075}
1259 1076
1260/** 1077/**
@@ -1352,6 +1169,7 @@ do_search:
1352 /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone 1169 /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
1353 bitmaps, so we must search the originals for that. */ 1170 bitmaps, so we must search the originals for that. */
1354 buffer = bi->bi_bh->b_data + bi->bi_offset; 1171 buffer = bi->bi_bh->b_data + bi->bi_offset;
1172 WARN_ON(!buffer_uptodate(bi->bi_bh));
1355 if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone) 1173 if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1356 buffer = bi->bi_clone + bi->bi_offset; 1174 buffer = bi->bi_clone + bi->bi_offset;
1357 1175
@@ -1371,6 +1189,7 @@ skip:
1371 1189
1372 if (blk == BFITNOENT) 1190 if (blk == BFITNOENT)
1373 return blk; 1191 return blk;
1192
1374 *n = 1; 1193 *n = 1;
1375 if (old_state == new_state) 1194 if (old_state == new_state)
1376 goto out; 1195 goto out;
@@ -1503,7 +1322,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
1503 if (al == NULL) 1322 if (al == NULL)
1504 return -ECANCELED; 1323 return -ECANCELED;
1505 1324
1506 rgd = al->al_rgd; 1325 rgd = ip->i_rgd;
1507 1326
1508 if (rgrp_contains_block(rgd, ip->i_goal)) 1327 if (rgrp_contains_block(rgd, ip->i_goal))
1509 goal = ip->i_goal - rgd->rd_data0; 1328 goal = ip->i_goal - rgd->rd_data0;
@@ -1518,7 +1337,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
1518 1337
1519 rgd->rd_last_alloc = blk; 1338 rgd->rd_last_alloc = blk;
1520 block = rgd->rd_data0 + blk; 1339 block = rgd->rd_data0 + blk;
1521 ip->i_goal = block; 1340 ip->i_goal = block + *n - 1;
1522 error = gfs2_meta_inode_buffer(ip, &dibh); 1341 error = gfs2_meta_inode_buffer(ip, &dibh);
1523 if (error == 0) { 1342 if (error == 0) {
1524 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; 1343 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
@@ -1539,9 +1358,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
1539 gfs2_statfs_change(sdp, 0, -(s64)*n, 0); 1358 gfs2_statfs_change(sdp, 0, -(s64)*n, 0);
1540 gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid); 1359 gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid);
1541 1360
1542 spin_lock(&sdp->sd_rindex_spin);
1543 rgd->rd_free_clone -= *n; 1361 rgd->rd_free_clone -= *n;
1544 spin_unlock(&sdp->sd_rindex_spin);
1545 trace_gfs2_block_alloc(ip, block, *n, GFS2_BLKST_USED); 1362 trace_gfs2_block_alloc(ip, block, *n, GFS2_BLKST_USED);
1546 *bn = block; 1363 *bn = block;
1547 return 0; 1364 return 0;
@@ -1564,7 +1381,7 @@ int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
1564{ 1381{
1565 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 1382 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1566 struct gfs2_alloc *al = dip->i_alloc; 1383 struct gfs2_alloc *al = dip->i_alloc;
1567 struct gfs2_rgrpd *rgd = al->al_rgd; 1384 struct gfs2_rgrpd *rgd = dip->i_rgd;
1568 u32 blk; 1385 u32 blk;
1569 u64 block; 1386 u64 block;
1570 unsigned int n = 1; 1387 unsigned int n = 1;
@@ -1594,9 +1411,7 @@ int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
1594 gfs2_statfs_change(sdp, 0, -1, +1); 1411 gfs2_statfs_change(sdp, 0, -1, +1);
1595 gfs2_trans_add_unrevoke(sdp, block, 1); 1412 gfs2_trans_add_unrevoke(sdp, block, 1);
1596 1413
1597 spin_lock(&sdp->sd_rindex_spin);
1598 rgd->rd_free_clone--; 1414 rgd->rd_free_clone--;
1599 spin_unlock(&sdp->sd_rindex_spin);
1600 trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE); 1415 trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE);
1601 *bn = block; 1416 *bn = block;
1602 return 0; 1417 return 0;
@@ -1629,8 +1444,6 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
1629 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1444 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1630 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1445 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1631 1446
1632 gfs2_trans_add_rg(rgd);
1633
1634 /* Directories keep their data in the metadata address space */ 1447 /* Directories keep their data in the metadata address space */
1635 if (meta || ip->i_depth) 1448 if (meta || ip->i_depth)
1636 gfs2_meta_wipe(ip, bstart, blen); 1449 gfs2_meta_wipe(ip, bstart, blen);
@@ -1666,7 +1479,6 @@ void gfs2_unlink_di(struct inode *inode)
1666 trace_gfs2_block_alloc(ip, blkno, 1, GFS2_BLKST_UNLINKED); 1479 trace_gfs2_block_alloc(ip, blkno, 1, GFS2_BLKST_UNLINKED);
1667 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1480 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1668 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1481 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1669 gfs2_trans_add_rg(rgd);
1670} 1482}
1671 1483
1672static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) 1484static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
@@ -1688,7 +1500,6 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
1688 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1500 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1689 1501
1690 gfs2_statfs_change(sdp, 0, +1, -1); 1502 gfs2_statfs_change(sdp, 0, +1, -1);
1691 gfs2_trans_add_rg(rgd);
1692} 1503}
1693 1504
1694 1505
@@ -1714,41 +1525,33 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
1714int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) 1525int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
1715{ 1526{
1716 struct gfs2_rgrpd *rgd; 1527 struct gfs2_rgrpd *rgd;
1717 struct gfs2_holder ri_gh, rgd_gh; 1528 struct gfs2_holder rgd_gh;
1718 struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
1719 int ri_locked = 0;
1720 int error; 1529 int error;
1721 1530
1722 if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { 1531 error = gfs2_rindex_update(sdp);
1723 error = gfs2_rindex_hold(sdp, &ri_gh); 1532 if (error)
1724 if (error) 1533 return error;
1725 goto fail;
1726 ri_locked = 1;
1727 }
1728 1534
1729 error = -EINVAL; 1535 error = -EINVAL;
1730 rgd = gfs2_blk2rgrpd(sdp, no_addr); 1536 rgd = gfs2_blk2rgrpd(sdp, no_addr);
1731 if (!rgd) 1537 if (!rgd)
1732 goto fail_rindex; 1538 goto fail;
1733 1539
1734 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); 1540 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
1735 if (error) 1541 if (error)
1736 goto fail_rindex; 1542 goto fail;
1737 1543
1738 if (gfs2_get_block_type(rgd, no_addr) != type) 1544 if (gfs2_get_block_type(rgd, no_addr) != type)
1739 error = -ESTALE; 1545 error = -ESTALE;
1740 1546
1741 gfs2_glock_dq_uninit(&rgd_gh); 1547 gfs2_glock_dq_uninit(&rgd_gh);
1742fail_rindex:
1743 if (ri_locked)
1744 gfs2_glock_dq_uninit(&ri_gh);
1745fail: 1548fail:
1746 return error; 1549 return error;
1747} 1550}
1748 1551
1749/** 1552/**
1750 * gfs2_rlist_add - add a RG to a list of RGs 1553 * gfs2_rlist_add - add a RG to a list of RGs
1751 * @sdp: the filesystem 1554 * @ip: the inode
1752 * @rlist: the list of resource groups 1555 * @rlist: the list of resource groups
1753 * @block: the block 1556 * @block: the block
1754 * 1557 *
@@ -1758,9 +1561,10 @@ fail:
1758 * 1561 *
1759 */ 1562 */
1760 1563
1761void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, 1564void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
1762 u64 block) 1565 u64 block)
1763{ 1566{
1567 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1764 struct gfs2_rgrpd *rgd; 1568 struct gfs2_rgrpd *rgd;
1765 struct gfs2_rgrpd **tmp; 1569 struct gfs2_rgrpd **tmp;
1766 unsigned int new_space; 1570 unsigned int new_space;
@@ -1769,12 +1573,15 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
1769 if (gfs2_assert_warn(sdp, !rlist->rl_ghs)) 1573 if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
1770 return; 1574 return;
1771 1575
1772 rgd = gfs2_blk2rgrpd(sdp, block); 1576 if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block))
1577 rgd = ip->i_rgd;
1578 else
1579 rgd = gfs2_blk2rgrpd(sdp, block);
1773 if (!rgd) { 1580 if (!rgd) {
1774 if (gfs2_consist(sdp)) 1581 fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block);
1775 fs_err(sdp, "block = %llu\n", (unsigned long long)block);
1776 return; 1582 return;
1777 } 1583 }
1584 ip->i_rgd = rgd;
1778 1585
1779 for (x = 0; x < rlist->rl_rgrps; x++) 1586 for (x = 0; x < rlist->rl_rgrps; x++)
1780 if (rlist->rl_rgd[x] == rgd) 1587 if (rlist->rl_rgd[x] == rgd)
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index d253f9a8c70..cf5c5018019 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -18,18 +18,15 @@ struct gfs2_holder;
18 18
19extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); 19extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
20 20
21struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); 21extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk);
22struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); 22extern struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
23struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); 23extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
24 24
25extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); 25extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
26extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); 26extern int gfs2_rindex_update(struct gfs2_sbd *sdp);
27 27extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
28extern int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); 28extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
29extern void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); 29extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
30extern void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
31
32extern void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
33 30
34extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); 31extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
35static inline void gfs2_alloc_put(struct gfs2_inode *ip) 32static inline void gfs2_alloc_put(struct gfs2_inode *ip)
@@ -39,16 +36,9 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
39 ip->i_alloc = NULL; 36 ip->i_alloc = NULL;
40} 37}
41 38
42extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, 39extern int gfs2_inplace_reserve(struct gfs2_inode *ip);
43 char *file, unsigned int line);
44#define gfs2_inplace_reserve(ip) \
45 gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__)
46#define gfs2_inplace_reserve_ri(ip) \
47 gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__)
48
49extern void gfs2_inplace_release(struct gfs2_inode *ip); 40extern void gfs2_inplace_release(struct gfs2_inode *ip);
50 41
51extern int gfs2_ri_update(struct gfs2_inode *ip);
52extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); 42extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
53extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); 43extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
54 44
@@ -66,11 +56,14 @@ struct gfs2_rgrp_list {
66 struct gfs2_holder *rl_ghs; 56 struct gfs2_holder *rl_ghs;
67}; 57};
68 58
69extern void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, 59extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
70 u64 block); 60 u64 block);
71extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); 61extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
72extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); 62extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
73extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); 63extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
74extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); 64extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
65extern void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
66 struct buffer_head *bh,
67 const struct gfs2_bitmap *bi);
75 68
76#endif /* __RGRP_DOT_H__ */ 69#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b7beadd9ba4..71e420989f7 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -752,51 +752,77 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
752 struct gfs2_sbd *sdp = GFS2_SB(inode); 752 struct gfs2_sbd *sdp = GFS2_SB(inode);
753 struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); 753 struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
754 struct backing_dev_info *bdi = metamapping->backing_dev_info; 754 struct backing_dev_info *bdi = metamapping->backing_dev_info;
755 struct gfs2_holder gh; 755 int ret = 0;
756
757 if (wbc->sync_mode == WB_SYNC_ALL)
758 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
759 if (bdi->dirty_exceeded)
760 gfs2_ail1_flush(sdp, wbc);
761 else
762 filemap_fdatawrite(metamapping);
763 if (wbc->sync_mode == WB_SYNC_ALL)
764 ret = filemap_fdatawait(metamapping);
765 if (ret)
766 mark_inode_dirty_sync(inode);
767 return ret;
768}
769
770/**
771 * gfs2_dirty_inode - check for atime updates
772 * @inode: The inode in question
773 * @flags: The type of dirty
774 *
775 * Unfortunately it can be called under any combination of inode
776 * glock and transaction lock, so we have to check carefully.
777 *
778 * At the moment this deals only with atime - it should be possible
779 * to expand that role in future, once a review of the locking has
780 * been carried out.
781 */
782
783static void gfs2_dirty_inode(struct inode *inode, int flags)
784{
785 struct gfs2_inode *ip = GFS2_I(inode);
786 struct gfs2_sbd *sdp = GFS2_SB(inode);
756 struct buffer_head *bh; 787 struct buffer_head *bh;
757 struct timespec atime; 788 struct gfs2_holder gh;
758 struct gfs2_dinode *di; 789 int need_unlock = 0;
759 int ret = -EAGAIN; 790 int need_endtrans = 0;
760 int unlock_required = 0; 791 int ret;
761 792
762 /* Skip timestamp update, if this is from a memalloc */ 793 if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC)))
763 if (current->flags & PF_MEMALLOC) 794 return;
764 goto do_flush; 795
765 if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { 796 if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
766 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 797 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
767 if (ret) 798 if (ret) {
768 goto do_flush; 799 fs_err(sdp, "dirty_inode: glock %d\n", ret);
769 unlock_required = 1; 800 return;
801 }
802 need_unlock = 1;
770 } 803 }
771 ret = gfs2_trans_begin(sdp, RES_DINODE, 0); 804
772 if (ret) 805 if (current->journal_info == NULL) {
773 goto do_unlock; 806 ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
807 if (ret) {
808 fs_err(sdp, "dirty_inode: gfs2_trans_begin %d\n", ret);
809 goto out;
810 }
811 need_endtrans = 1;
812 }
813
774 ret = gfs2_meta_inode_buffer(ip, &bh); 814 ret = gfs2_meta_inode_buffer(ip, &bh);
775 if (ret == 0) { 815 if (ret == 0) {
776 di = (struct gfs2_dinode *)bh->b_data; 816 gfs2_trans_add_bh(ip->i_gl, bh, 1);
777 atime.tv_sec = be64_to_cpu(di->di_atime); 817 gfs2_dinode_out(ip, bh->b_data);
778 atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
779 if (timespec_compare(&inode->i_atime, &atime) > 0) {
780 gfs2_trans_add_bh(ip->i_gl, bh, 1);
781 gfs2_dinode_out(ip, bh->b_data);
782 }
783 brelse(bh); 818 brelse(bh);
784 } 819 }
785 gfs2_trans_end(sdp); 820
786do_unlock: 821 if (need_endtrans)
787 if (unlock_required) 822 gfs2_trans_end(sdp);
823out:
824 if (need_unlock)
788 gfs2_glock_dq_uninit(&gh); 825 gfs2_glock_dq_uninit(&gh);
789do_flush:
790 if (wbc->sync_mode == WB_SYNC_ALL)
791 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
792 filemap_fdatawrite(metamapping);
793 if (bdi->dirty_exceeded)
794 gfs2_ail1_flush(sdp, wbc);
795 if (!ret && (wbc->sync_mode == WB_SYNC_ALL))
796 ret = filemap_fdatawait(metamapping);
797 if (ret)
798 mark_inode_dirty_sync(inode);
799 return ret;
800} 826}
801 827
802/** 828/**
@@ -1011,7 +1037,6 @@ static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
1011 1037
1012static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) 1038static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
1013{ 1039{
1014 struct gfs2_holder ri_gh;
1015 struct gfs2_rgrpd *rgd_next; 1040 struct gfs2_rgrpd *rgd_next;
1016 struct gfs2_holder *gha, *gh; 1041 struct gfs2_holder *gha, *gh;
1017 unsigned int slots = 64; 1042 unsigned int slots = 64;
@@ -1024,10 +1049,6 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host
1024 if (!gha) 1049 if (!gha)
1025 return -ENOMEM; 1050 return -ENOMEM;
1026 1051
1027 error = gfs2_rindex_hold(sdp, &ri_gh);
1028 if (error)
1029 goto out;
1030
1031 rgd_next = gfs2_rgrpd_get_first(sdp); 1052 rgd_next = gfs2_rgrpd_get_first(sdp);
1032 1053
1033 for (;;) { 1054 for (;;) {
@@ -1070,9 +1091,6 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host
1070 yield(); 1091 yield();
1071 } 1092 }
1072 1093
1073 gfs2_glock_dq_uninit(&ri_gh);
1074
1075out:
1076 kfree(gha); 1094 kfree(gha);
1077 return error; 1095 return error;
1078} 1096}
@@ -1124,6 +1142,10 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
1124 struct gfs2_statfs_change_host sc; 1142 struct gfs2_statfs_change_host sc;
1125 int error; 1143 int error;
1126 1144
1145 error = gfs2_rindex_update(sdp);
1146 if (error)
1147 return error;
1148
1127 if (gfs2_tune_get(sdp, gt_statfs_slow)) 1149 if (gfs2_tune_get(sdp, gt_statfs_slow))
1128 error = gfs2_statfs_slow(sdp, &sc); 1150 error = gfs2_statfs_slow(sdp, &sc);
1129 else 1151 else
@@ -1394,21 +1416,17 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
1394 if (error) 1416 if (error)
1395 goto out; 1417 goto out;
1396 1418
1397 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
1398 if (error)
1399 goto out_qs;
1400
1401 rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); 1419 rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
1402 if (!rgd) { 1420 if (!rgd) {
1403 gfs2_consist_inode(ip); 1421 gfs2_consist_inode(ip);
1404 error = -EIO; 1422 error = -EIO;
1405 goto out_rindex_relse; 1423 goto out_qs;
1406 } 1424 }
1407 1425
1408 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, 1426 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
1409 &al->al_rgd_gh); 1427 &al->al_rgd_gh);
1410 if (error) 1428 if (error)
1411 goto out_rindex_relse; 1429 goto out_qs;
1412 1430
1413 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 1431 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA,
1414 sdp->sd_jdesc->jd_blocks); 1432 sdp->sd_jdesc->jd_blocks);
@@ -1423,8 +1441,6 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
1423 1441
1424out_rg_gunlock: 1442out_rg_gunlock:
1425 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1443 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1426out_rindex_relse:
1427 gfs2_glock_dq_uninit(&al->al_ri_gh);
1428out_qs: 1444out_qs:
1429 gfs2_quota_unhold(ip); 1445 gfs2_quota_unhold(ip);
1430out: 1446out:
@@ -1471,9 +1487,11 @@ static void gfs2_evict_inode(struct inode *inode)
1471 goto out; 1487 goto out;
1472 } 1488 }
1473 1489
1474 error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); 1490 if (!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) {
1475 if (error) 1491 error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
1476 goto out_truncate; 1492 if (error)
1493 goto out_truncate;
1494 }
1477 1495
1478 if (test_bit(GIF_INVALID, &ip->i_flags)) { 1496 if (test_bit(GIF_INVALID, &ip->i_flags)) {
1479 error = gfs2_inode_refresh(ip); 1497 error = gfs2_inode_refresh(ip);
@@ -1513,6 +1531,10 @@ static void gfs2_evict_inode(struct inode *inode)
1513 goto out_unlock; 1531 goto out_unlock;
1514 1532
1515out_truncate: 1533out_truncate:
1534 gfs2_log_flush(sdp, ip->i_gl);
1535 write_inode_now(inode, 1);
1536 gfs2_ail_flush(ip->i_gl, 0);
1537
1516 /* Case 2 starts here */ 1538 /* Case 2 starts here */
1517 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); 1539 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
1518 if (error) 1540 if (error)
@@ -1552,6 +1574,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
1552 if (ip) { 1574 if (ip) {
1553 ip->i_flags = 0; 1575 ip->i_flags = 0;
1554 ip->i_gl = NULL; 1576 ip->i_gl = NULL;
1577 ip->i_rgd = NULL;
1555 } 1578 }
1556 return &ip->i_inode; 1579 return &ip->i_inode;
1557} 1580}
@@ -1572,6 +1595,7 @@ const struct super_operations gfs2_super_ops = {
1572 .alloc_inode = gfs2_alloc_inode, 1595 .alloc_inode = gfs2_alloc_inode,
1573 .destroy_inode = gfs2_destroy_inode, 1596 .destroy_inode = gfs2_destroy_inode,
1574 .write_inode = gfs2_write_inode, 1597 .write_inode = gfs2_write_inode,
1598 .dirty_inode = gfs2_dirty_inode,
1575 .evict_inode = gfs2_evict_inode, 1599 .evict_inode = gfs2_evict_inode,
1576 .put_super = gfs2_put_super, 1600 .put_super = gfs2_put_super,
1577 .sync_fs = gfs2_sync_fs, 1601 .sync_fs = gfs2_sync_fs,
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 9ec73a85411..86ac75d99d3 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -185,8 +185,3 @@ void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
185 gfs2_log_unlock(sdp); 185 gfs2_log_unlock(sdp);
186} 186}
187 187
188void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd)
189{
190 lops_add(rgd->rd_sbd, &rgd->rd_le);
191}
192
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index fb56b783e02..f8f101ef600 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -28,20 +28,20 @@ struct gfs2_glock;
28 28
29/* reserve either the number of blocks to be allocated plus the rg header 29/* reserve either the number of blocks to be allocated plus the rg header
30 * block, or all of the blocks in the rg, whichever is smaller */ 30 * block, or all of the blocks in the rg, whichever is smaller */
31static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al) 31static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip)
32{ 32{
33 return (al->al_requested < al->al_rgd->rd_length)? 33 const struct gfs2_alloc *al = ip->i_alloc;
34 al->al_requested + 1 : al->al_rgd->rd_length; 34 if (al->al_requested < ip->i_rgd->rd_length)
35 return al->al_requested + 1;
36 return ip->i_rgd->rd_length;
35} 37}
36 38
37int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, 39extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
38 unsigned int revokes); 40 unsigned int revokes);
39 41
40void gfs2_trans_end(struct gfs2_sbd *sdp); 42extern void gfs2_trans_end(struct gfs2_sbd *sdp);
41 43extern void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
42void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); 44extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
43void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); 45extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
44void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
45void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
46 46
47#endif /* __TRANS_DOT_H__ */ 47#endif /* __TRANS_DOT_H__ */
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 439b61c0326..71d7bf830c0 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -332,15 +332,8 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
332 if (error) 332 if (error)
333 goto out_alloc; 333 goto out_alloc;
334 334
335 error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
336 if (error)
337 goto out_quota;
338
339 error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL); 335 error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL);
340 336
341 gfs2_glock_dq_uninit(&al->al_ri_gh);
342
343out_quota:
344 gfs2_quota_unhold(ip); 337 gfs2_quota_unhold(ip);
345out_alloc: 338out_alloc:
346 gfs2_alloc_put(ip); 339 gfs2_alloc_put(ip);
@@ -734,7 +727,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
734 goto out_gunlock_q; 727 goto out_gunlock_q;
735 728
736 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), 729 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
737 blks + gfs2_rg_blocks(al) + 730 blks + gfs2_rg_blocks(ip) +
738 RES_DINODE + RES_STATFS + RES_QUOTA, 0); 731 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
739 if (error) 732 if (error)
740 goto out_ipres; 733 goto out_ipres;
@@ -1296,7 +1289,8 @@ fail:
1296 1289
1297int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) 1290int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
1298{ 1291{
1299 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1292 struct inode *inode = &ip->i_inode;
1293 struct gfs2_sbd *sdp = GFS2_SB(inode);
1300 struct gfs2_ea_location el; 1294 struct gfs2_ea_location el;
1301 int error; 1295 int error;
1302 1296
@@ -1319,7 +1313,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
1319 if (error) 1313 if (error)
1320 return error; 1314 return error;
1321 1315
1322 error = gfs2_setattr_simple(ip, attr); 1316 error = gfs2_setattr_simple(inode, attr);
1323 gfs2_trans_end(sdp); 1317 gfs2_trans_end(sdp);
1324 return error; 1318 return error;
1325} 1319}
@@ -1362,14 +1356,14 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
1362 blen++; 1356 blen++;
1363 else { 1357 else {
1364 if (bstart) 1358 if (bstart)
1365 gfs2_rlist_add(sdp, &rlist, bstart); 1359 gfs2_rlist_add(ip, &rlist, bstart);
1366 bstart = bn; 1360 bstart = bn;
1367 blen = 1; 1361 blen = 1;
1368 } 1362 }
1369 blks++; 1363 blks++;
1370 } 1364 }
1371 if (bstart) 1365 if (bstart)
1372 gfs2_rlist_add(sdp, &rlist, bstart); 1366 gfs2_rlist_add(ip, &rlist, bstart);
1373 else 1367 else
1374 goto out; 1368 goto out;
1375 1369
@@ -1501,24 +1495,18 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
1501 if (error) 1495 if (error)
1502 goto out_alloc; 1496 goto out_alloc;
1503 1497
1504 error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
1505 if (error)
1506 goto out_quota;
1507
1508 error = ea_foreach(ip, ea_dealloc_unstuffed, NULL); 1498 error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
1509 if (error) 1499 if (error)
1510 goto out_rindex; 1500 goto out_quota;
1511 1501
1512 if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { 1502 if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
1513 error = ea_dealloc_indirect(ip); 1503 error = ea_dealloc_indirect(ip);
1514 if (error) 1504 if (error)
1515 goto out_rindex; 1505 goto out_quota;
1516 } 1506 }
1517 1507
1518 error = ea_dealloc_block(ip); 1508 error = ea_dealloc_block(ip);
1519 1509
1520out_rindex:
1521 gfs2_glock_dq_uninit(&al->al_ri_gh);
1522out_quota: 1510out_quota:
1523 gfs2_quota_unhold(ip); 1511 gfs2_quota_unhold(ip);
1524out_alloc: 1512out_alloc:
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 3ebc437736f..1cbdeea1db4 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -46,11 +46,26 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
46 case HFS_EXT_CNID: 46 case HFS_EXT_CNID:
47 hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize, 47 hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize,
48 mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz)); 48 mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz));
49 if (HFS_I(tree->inode)->alloc_blocks >
50 HFS_I(tree->inode)->first_blocks) {
51 printk(KERN_ERR "hfs: invalid btree extent records\n");
52 unlock_new_inode(tree->inode);
53 goto free_inode;
54 }
55
49 tree->inode->i_mapping->a_ops = &hfs_btree_aops; 56 tree->inode->i_mapping->a_ops = &hfs_btree_aops;
50 break; 57 break;
51 case HFS_CAT_CNID: 58 case HFS_CAT_CNID:
52 hfs_inode_read_fork(tree->inode, mdb->drCTExtRec, mdb->drCTFlSize, 59 hfs_inode_read_fork(tree->inode, mdb->drCTExtRec, mdb->drCTFlSize,
53 mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz)); 60 mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz));
61
62 if (!HFS_I(tree->inode)->first_blocks) {
63 printk(KERN_ERR "hfs: invalid btree extent records "
64 "(0 size).\n");
65 unlock_new_inode(tree->inode);
66 goto free_inode;
67 }
68
54 tree->inode->i_mapping->a_ops = &hfs_btree_aops; 69 tree->inode->i_mapping->a_ops = &hfs_btree_aops;
55 break; 70 break;
56 default: 71 default:
@@ -59,11 +74,6 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
59 } 74 }
60 unlock_new_inode(tree->inode); 75 unlock_new_inode(tree->inode);
61 76
62 if (!HFS_I(tree->inode)->first_blocks) {
63 printk(KERN_ERR "hfs: invalid btree extent records (0 size).\n");
64 goto free_inode;
65 }
66
67 mapping = tree->inode->i_mapping; 77 mapping = tree->inode->i_mapping;
68 page = read_mapping_page(mapping, 0, NULL); 78 page = read_mapping_page(mapping, 0, NULL);
69 if (IS_ERR(page)) 79 if (IS_ERR(page))
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index b4d70b13be9..bce4eef91a0 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -198,7 +198,7 @@ static int hfs_create(struct inode *dir, struct dentry *dentry, int mode,
198 198
199 res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); 199 res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode);
200 if (res) { 200 if (res) {
201 inode->i_nlink = 0; 201 clear_nlink(inode);
202 hfs_delete_inode(inode); 202 hfs_delete_inode(inode);
203 iput(inode); 203 iput(inode);
204 return res; 204 return res;
@@ -227,7 +227,7 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
227 227
228 res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); 228 res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode);
229 if (res) { 229 if (res) {
230 inode->i_nlink = 0; 230 clear_nlink(inode);
231 hfs_delete_inode(inode); 231 hfs_delete_inode(inode);
232 iput(inode); 232 iput(inode);
233 return res; 233 return res;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 96a1b625fc7..a1a9fdcd2a0 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -183,7 +183,7 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode)
183 inode->i_mode = mode; 183 inode->i_mode = mode;
184 inode->i_uid = current_fsuid(); 184 inode->i_uid = current_fsuid();
185 inode->i_gid = current_fsgid(); 185 inode->i_gid = current_fsgid();
186 inode->i_nlink = 1; 186 set_nlink(inode, 1);
187 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 187 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
188 HFS_I(inode)->flags = 0; 188 HFS_I(inode)->flags = 0;
189 HFS_I(inode)->rsrc_inode = NULL; 189 HFS_I(inode)->rsrc_inode = NULL;
@@ -313,7 +313,7 @@ static int hfs_read_inode(struct inode *inode, void *data)
313 /* Initialize the inode */ 313 /* Initialize the inode */
314 inode->i_uid = hsb->s_uid; 314 inode->i_uid = hsb->s_uid;
315 inode->i_gid = hsb->s_gid; 315 inode->i_gid = hsb->s_gid;
316 inode->i_nlink = 1; 316 set_nlink(inode, 1);
317 317
318 if (idata->key) 318 if (idata->key)
319 HFS_I(inode)->cat_key = *idata->key; 319 HFS_I(inode)->cat_key = *idata->key;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 25b2443a004..4536cd3f15a 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -415,7 +415,7 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
415 goto out; 415 goto out;
416 416
417out_err: 417out_err:
418 inode->i_nlink = 0; 418 clear_nlink(inode);
419 hfsplus_delete_inode(inode); 419 hfsplus_delete_inode(inode);
420 iput(inode); 420 iput(inode);
421out: 421out:
@@ -440,7 +440,7 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
440 440
441 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); 441 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
442 if (res) { 442 if (res) {
443 inode->i_nlink = 0; 443 clear_nlink(inode);
444 hfsplus_delete_inode(inode); 444 hfsplus_delete_inode(inode);
445 iput(inode); 445 iput(inode);
446 goto out; 446 goto out;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 4cc1e3a36ec..40e1413be4c 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -391,7 +391,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
391 inode->i_mode = mode; 391 inode->i_mode = mode;
392 inode->i_uid = current_fsuid(); 392 inode->i_uid = current_fsuid();
393 inode->i_gid = current_fsgid(); 393 inode->i_gid = current_fsgid();
394 inode->i_nlink = 1; 394 set_nlink(inode, 1);
395 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 395 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
396 396
397 hip = HFSPLUS_I(inode); 397 hip = HFSPLUS_I(inode);
@@ -512,7 +512,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
512 hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, 512 hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
513 sizeof(struct hfsplus_cat_folder)); 513 sizeof(struct hfsplus_cat_folder));
514 hfsplus_get_perms(inode, &folder->permissions, 1); 514 hfsplus_get_perms(inode, &folder->permissions, 1);
515 inode->i_nlink = 1; 515 set_nlink(inode, 1);
516 inode->i_size = 2 + be32_to_cpu(folder->valence); 516 inode->i_size = 2 + be32_to_cpu(folder->valence);
517 inode->i_atime = hfsp_mt2ut(folder->access_date); 517 inode->i_atime = hfsp_mt2ut(folder->access_date);
518 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); 518 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
@@ -532,11 +532,11 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
532 hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ? 532 hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ?
533 &file->rsrc_fork : &file->data_fork); 533 &file->rsrc_fork : &file->data_fork);
534 hfsplus_get_perms(inode, &file->permissions, 0); 534 hfsplus_get_perms(inode, &file->permissions, 0);
535 inode->i_nlink = 1; 535 set_nlink(inode, 1);
536 if (S_ISREG(inode->i_mode)) { 536 if (S_ISREG(inode->i_mode)) {
537 if (file->permissions.dev) 537 if (file->permissions.dev)
538 inode->i_nlink = 538 set_nlink(inode,
539 be32_to_cpu(file->permissions.dev); 539 be32_to_cpu(file->permissions.dev));
540 inode->i_op = &hfsplus_file_inode_operations; 540 inode->i_op = &hfsplus_file_inode_operations;
541 inode->i_fop = &hfsplus_file_operations; 541 inode->i_fop = &hfsplus_file_operations;
542 inode->i_mapping->a_ops = &hfsplus_aops; 542 inode->i_mapping->a_ops = &hfsplus_aops;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 0d22afdd461..2f72da5ae68 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -541,7 +541,7 @@ static int read_name(struct inode *ino, char *name)
541 541
542 ino->i_ino = st.ino; 542 ino->i_ino = st.ino;
543 ino->i_mode = st.mode; 543 ino->i_mode = st.mode;
544 ino->i_nlink = st.nlink; 544 set_nlink(ino, st.nlink);
545 ino->i_uid = st.uid; 545 ino->i_uid = st.uid;
546 ino->i_gid = st.gid; 546 ino->i_gid = st.gid;
547 ino->i_atime = st.atime; 547 ino->i_atime = st.atime;
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index d51a98384bc..dd7bc38a382 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -16,7 +16,6 @@
16#include <sys/vfs.h> 16#include <sys/vfs.h>
17#include "hostfs.h" 17#include "hostfs.h"
18#include "os.h" 18#include "os.h"
19#include "user.h"
20#include <utime.h> 19#include <utime.h>
21 20
22static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p) 21static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p)
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 96a8ed91ced..2fa0089a02a 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -247,7 +247,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
247 result->i_mode &= ~0111; 247 result->i_mode &= ~0111;
248 result->i_op = &hpfs_file_iops; 248 result->i_op = &hpfs_file_iops;
249 result->i_fop = &hpfs_file_ops; 249 result->i_fop = &hpfs_file_ops;
250 result->i_nlink = 1; 250 set_nlink(result, 1);
251 } 251 }
252 unlock_new_inode(result); 252 unlock_new_inode(result);
253 } 253 }
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 331b5e234ef..de946170ebb 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -311,8 +311,8 @@ static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb)
311 311
312/* super.c */ 312/* super.c */
313 313
314void hpfs_error(struct super_block *, const char *, ...) 314__printf(2, 3)
315 __attribute__((format (printf, 2, 3))); 315void hpfs_error(struct super_block *, const char *, ...);
316int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *); 316int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *);
317unsigned hpfs_count_one_bitmap(struct super_block *, secno); 317unsigned hpfs_count_one_bitmap(struct super_block *, secno);
318 318
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 338cd836845..3b2cec29972 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -53,7 +53,7 @@ void hpfs_read_inode(struct inode *i)
53 i->i_mode &= ~0111; 53 i->i_mode &= ~0111;
54 i->i_op = &hpfs_file_iops; 54 i->i_op = &hpfs_file_iops;
55 i->i_fop = &hpfs_file_ops; 55 i->i_fop = &hpfs_file_ops;
56 i->i_nlink = 0;*/ 56 clear_nlink(i);*/
57 make_bad_inode(i); 57 make_bad_inode(i);
58 return; 58 return;
59 } 59 }
@@ -77,7 +77,7 @@ void hpfs_read_inode(struct inode *i)
77 i->i_mode = S_IFLNK | 0777; 77 i->i_mode = S_IFLNK | 0777;
78 i->i_op = &page_symlink_inode_operations; 78 i->i_op = &page_symlink_inode_operations;
79 i->i_data.a_ops = &hpfs_symlink_aops; 79 i->i_data.a_ops = &hpfs_symlink_aops;
80 i->i_nlink = 1; 80 set_nlink(i, 1);
81 i->i_size = ea_size; 81 i->i_size = ea_size;
82 i->i_blocks = 1; 82 i->i_blocks = 1;
83 brelse(bh); 83 brelse(bh);
@@ -101,7 +101,7 @@ void hpfs_read_inode(struct inode *i)
101 } 101 }
102 if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { 102 if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) {
103 brelse(bh); 103 brelse(bh);
104 i->i_nlink = 1; 104 set_nlink(i, 1);
105 i->i_size = 0; 105 i->i_size = 0;
106 i->i_blocks = 1; 106 i->i_blocks = 1;
107 init_special_inode(i, mode, 107 init_special_inode(i, mode,
@@ -125,13 +125,13 @@ void hpfs_read_inode(struct inode *i)
125 hpfs_count_dnodes(i->i_sb, hpfs_inode->i_dno, &n_dnodes, &n_subdirs, NULL); 125 hpfs_count_dnodes(i->i_sb, hpfs_inode->i_dno, &n_dnodes, &n_subdirs, NULL);
126 i->i_blocks = 4 * n_dnodes; 126 i->i_blocks = 4 * n_dnodes;
127 i->i_size = 2048 * n_dnodes; 127 i->i_size = 2048 * n_dnodes;
128 i->i_nlink = 2 + n_subdirs; 128 set_nlink(i, 2 + n_subdirs);
129 } else { 129 } else {
130 i->i_mode |= S_IFREG; 130 i->i_mode |= S_IFREG;
131 if (!hpfs_inode->i_ea_mode) i->i_mode &= ~0111; 131 if (!hpfs_inode->i_ea_mode) i->i_mode &= ~0111;
132 i->i_op = &hpfs_file_iops; 132 i->i_op = &hpfs_file_iops;
133 i->i_fop = &hpfs_file_ops; 133 i->i_fop = &hpfs_file_ops;
134 i->i_nlink = 1; 134 set_nlink(i, 1);
135 i->i_size = le32_to_cpu(fnode->file_size); 135 i->i_size = le32_to_cpu(fnode->file_size);
136 i->i_blocks = ((i->i_size + 511) >> 9) + 1; 136 i->i_blocks = ((i->i_size + 511) >> 9) + 1;
137 i->i_data.a_ops = &hpfs_aops; 137 i->i_data.a_ops = &hpfs_aops;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 2df69e2f07c..ea91fcb0ef9 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -56,7 +56,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
56 result->i_fop = &hpfs_dir_ops; 56 result->i_fop = &hpfs_dir_ops;
57 result->i_blocks = 4; 57 result->i_blocks = 4;
58 result->i_size = 2048; 58 result->i_size = 2048;
59 result->i_nlink = 2; 59 set_nlink(result, 2);
60 if (dee.read_only) 60 if (dee.read_only)
61 result->i_mode &= ~0222; 61 result->i_mode &= ~0222;
62 62
@@ -150,7 +150,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
150 result->i_mode &= ~0111; 150 result->i_mode &= ~0111;
151 result->i_op = &hpfs_file_iops; 151 result->i_op = &hpfs_file_iops;
152 result->i_fop = &hpfs_file_ops; 152 result->i_fop = &hpfs_file_ops;
153 result->i_nlink = 1; 153 set_nlink(result, 1);
154 hpfs_i(result)->i_parent_dir = dir->i_ino; 154 hpfs_i(result)->i_parent_dir = dir->i_ino;
155 result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); 155 result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date));
156 result->i_ctime.tv_nsec = 0; 156 result->i_ctime.tv_nsec = 0;
@@ -242,7 +242,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
242 hpfs_i(result)->i_ea_size = 0; 242 hpfs_i(result)->i_ea_size = 0;
243 result->i_uid = current_fsuid(); 243 result->i_uid = current_fsuid();
244 result->i_gid = current_fsgid(); 244 result->i_gid = current_fsgid();
245 result->i_nlink = 1; 245 set_nlink(result, 1);
246 result->i_size = 0; 246 result->i_size = 0;
247 result->i_blocks = 1; 247 result->i_blocks = 1;
248 init_special_inode(result, mode, rdev); 248 init_special_inode(result, mode, rdev);
@@ -318,7 +318,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
318 result->i_uid = current_fsuid(); 318 result->i_uid = current_fsuid();
319 result->i_gid = current_fsgid(); 319 result->i_gid = current_fsgid();
320 result->i_blocks = 1; 320 result->i_blocks = 1;
321 result->i_nlink = 1; 321 set_nlink(result, 1);
322 result->i_size = strlen(symlink); 322 result->i_size = strlen(symlink);
323 result->i_op = &page_symlink_inode_operations; 323 result->i_op = &page_symlink_inode_operations;
324 result->i_data.a_ops = &hpfs_symlink_aops; 324 result->i_data.a_ops = &hpfs_symlink_aops;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 970ea987b3f..f590b1160c6 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -702,7 +702,7 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
702 inode->i_ctime = proc_ino->i_ctime; 702 inode->i_ctime = proc_ino->i_ctime;
703 inode->i_ino = proc_ino->i_ino; 703 inode->i_ino = proc_ino->i_ino;
704 inode->i_mode = proc_ino->i_mode; 704 inode->i_mode = proc_ino->i_mode;
705 inode->i_nlink = proc_ino->i_nlink; 705 set_nlink(inode, proc_ino->i_nlink);
706 inode->i_size = proc_ino->i_size; 706 inode->i_size = proc_ino->i_size;
707 inode->i_blocks = proc_ino->i_blocks; 707 inode->i_blocks = proc_ino->i_blocks;
708 708
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ec889538e5a..0be5a78598d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -970,7 +970,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
970 970
971 d_instantiate(path.dentry, inode); 971 d_instantiate(path.dentry, inode);
972 inode->i_size = size; 972 inode->i_size = size;
973 inode->i_nlink = 0; 973 clear_nlink(inode);
974 974
975 error = -ENFILE; 975 error = -ENFILE;
976 file = alloc_file(&path, FMODE_WRITE | FMODE_READ, 976 file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
diff --git a/fs/inode.c b/fs/inode.c
index ec7924696a1..ee4e66b998f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -142,7 +142,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
142 atomic_set(&inode->i_count, 1); 142 atomic_set(&inode->i_count, 1);
143 inode->i_op = &empty_iops; 143 inode->i_op = &empty_iops;
144 inode->i_fop = &empty_fops; 144 inode->i_fop = &empty_fops;
145 inode->i_nlink = 1; 145 inode->__i_nlink = 1;
146 inode->i_opflags = 0; 146 inode->i_opflags = 0;
147 inode->i_uid = 0; 147 inode->i_uid = 0;
148 inode->i_gid = 0; 148 inode->i_gid = 0;
@@ -634,7 +634,7 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan)
634 * inode to the back of the list so we don't spin on it. 634 * inode to the back of the list so we don't spin on it.
635 */ 635 */
636 if (!spin_trylock(&inode->i_lock)) { 636 if (!spin_trylock(&inode->i_lock)) {
637 list_move(&inode->i_lru, &sb->s_inode_lru); 637 list_move_tail(&inode->i_lru, &sb->s_inode_lru);
638 continue; 638 continue;
639 } 639 }
640 640
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 7da2a06508e..f79dab83e17 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -21,6 +21,7 @@
21 */ 21 */
22#include <linux/gfp.h> 22#include <linux/gfp.h>
23#include <linux/kernel.h> 23#include <linux/kernel.h>
24#include <linux/export.h>
24#include <linux/ioprio.h> 25#include <linux/ioprio.h>
25#include <linux/blkdev.h> 26#include <linux/blkdev.h>
26#include <linux/capability.h> 27#include <linux/capability.h>
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index a5d03672d04..f950059525f 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -20,6 +20,7 @@
20#include <linux/statfs.h> 20#include <linux/statfs.h>
21#include <linux/cdrom.h> 21#include <linux/cdrom.h>
22#include <linux/parser.h> 22#include <linux/parser.h>
23#include <linux/mpage.h>
23 24
24#include "isofs.h" 25#include "isofs.h"
25#include "zisofs.h" 26#include "zisofs.h"
@@ -1148,7 +1149,13 @@ struct buffer_head *isofs_bread(struct inode *inode, sector_t block)
1148 1149
1149static int isofs_readpage(struct file *file, struct page *page) 1150static int isofs_readpage(struct file *file, struct page *page)
1150{ 1151{
1151 return block_read_full_page(page,isofs_get_block); 1152 return mpage_readpage(page, isofs_get_block);
1153}
1154
1155static int isofs_readpages(struct file *file, struct address_space *mapping,
1156 struct list_head *pages, unsigned nr_pages)
1157{
1158 return mpage_readpages(mapping, pages, nr_pages, isofs_get_block);
1152} 1159}
1153 1160
1154static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) 1161static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
@@ -1158,6 +1165,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
1158 1165
1159static const struct address_space_operations isofs_aops = { 1166static const struct address_space_operations isofs_aops = {
1160 .readpage = isofs_readpage, 1167 .readpage = isofs_readpage,
1168 .readpages = isofs_readpages,
1161 .bmap = _isofs_bmap 1169 .bmap = _isofs_bmap
1162}; 1170};
1163 1171
@@ -1319,7 +1327,7 @@ static int isofs_read_inode(struct inode *inode)
1319 inode->i_mode = S_IFDIR | sbi->s_dmode; 1327 inode->i_mode = S_IFDIR | sbi->s_dmode;
1320 else 1328 else
1321 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; 1329 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
1322 inode->i_nlink = 1; /* 1330 set_nlink(inode, 1); /*
1323 * Set to 1. We know there are 2, but 1331 * Set to 1. We know there are 2, but
1324 * the find utility tries to optimize 1332 * the find utility tries to optimize
1325 * if it is 2, and it screws up. It is 1333 * if it is 2, and it screws up. It is
@@ -1337,7 +1345,7 @@ static int isofs_read_inode(struct inode *inode)
1337 */ 1345 */
1338 inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO; 1346 inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO;
1339 } 1347 }
1340 inode->i_nlink = 1; 1348 set_nlink(inode, 1);
1341 } 1349 }
1342 inode->i_uid = sbi->s_uid; 1350 inode->i_uid = sbi->s_uid;
1343 inode->i_gid = sbi->s_gid; 1351 inode->i_gid = sbi->s_gid;
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 1fbc7de88f5..70e79d0c756 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -363,7 +363,7 @@ repeat:
363 break; 363 break;
364 case SIG('P', 'X'): 364 case SIG('P', 'X'):
365 inode->i_mode = isonum_733(rr->u.PX.mode); 365 inode->i_mode = isonum_733(rr->u.PX.mode);
366 inode->i_nlink = isonum_733(rr->u.PX.n_links); 366 set_nlink(inode, isonum_733(rr->u.PX.n_links));
367 inode->i_uid = isonum_733(rr->u.PX.uid); 367 inode->i_uid = isonum_733(rr->u.PX.uid);
368 inode->i_gid = isonum_733(rr->u.PX.gid); 368 inode->i_gid = isonum_733(rr->u.PX.gid);
369 break; 369 break;
@@ -496,7 +496,7 @@ repeat:
496 goto out; 496 goto out;
497 } 497 }
498 inode->i_mode = reloc->i_mode; 498 inode->i_mode = reloc->i_mode;
499 inode->i_nlink = reloc->i_nlink; 499 set_nlink(inode, reloc->i_nlink);
500 inode->i_uid = reloc->i_uid; 500 inode->i_uid = reloc->i_uid;
501 inode->i_gid = reloc->i_gid; 501 inode->i_gid = reloc->i_gid;
502 inode->i_rdev = reloc->i_rdev; 502 inode->i_rdev = reloc->i_rdev;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 9fe061fb877..fea8dd661d2 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1135,6 +1135,14 @@ static int journal_get_superblock(journal_t *journal)
1135 goto out; 1135 goto out;
1136 } 1136 }
1137 1137
1138 if (be32_to_cpu(sb->s_first) == 0 ||
1139 be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
1140 printk(KERN_WARNING
1141 "JBD: Invalid start block of journal: %u\n",
1142 be32_to_cpu(sb->s_first));
1143 goto out;
1144 }
1145
1138 return 0; 1146 return 0;
1139 1147
1140out: 1148out:
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index eef6979821a..68d704db787 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -352,7 +352,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
352 J_ASSERT(commit_transaction->t_state == T_RUNNING); 352 J_ASSERT(commit_transaction->t_state == T_RUNNING);
353 353
354 trace_jbd2_start_commit(journal, commit_transaction); 354 trace_jbd2_start_commit(journal, commit_transaction);
355 jbd_debug(1, "JBD: starting commit of transaction %d\n", 355 jbd_debug(1, "JBD2: starting commit of transaction %d\n",
356 commit_transaction->t_tid); 356 commit_transaction->t_tid);
357 357
358 write_lock(&journal->j_state_lock); 358 write_lock(&journal->j_state_lock);
@@ -427,7 +427,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
427 __jbd2_journal_clean_checkpoint_list(journal); 427 __jbd2_journal_clean_checkpoint_list(journal);
428 spin_unlock(&journal->j_list_lock); 428 spin_unlock(&journal->j_list_lock);
429 429
430 jbd_debug (3, "JBD: commit phase 1\n"); 430 jbd_debug(3, "JBD2: commit phase 1\n");
431 431
432 /* 432 /*
433 * Switch to a new revoke table. 433 * Switch to a new revoke table.
@@ -447,7 +447,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
447 wake_up(&journal->j_wait_transaction_locked); 447 wake_up(&journal->j_wait_transaction_locked);
448 write_unlock(&journal->j_state_lock); 448 write_unlock(&journal->j_state_lock);
449 449
450 jbd_debug (3, "JBD: commit phase 2\n"); 450 jbd_debug(3, "JBD2: commit phase 2\n");
451 451
452 /* 452 /*
453 * Now start flushing things to disk, in the order they appear 453 * Now start flushing things to disk, in the order they appear
@@ -462,7 +462,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
462 WRITE_SYNC); 462 WRITE_SYNC);
463 blk_finish_plug(&plug); 463 blk_finish_plug(&plug);
464 464
465 jbd_debug(3, "JBD: commit phase 2\n"); 465 jbd_debug(3, "JBD2: commit phase 2\n");
466 466
467 /* 467 /*
468 * Way to go: we have now written out all of the data for a 468 * Way to go: we have now written out all of the data for a
@@ -522,7 +522,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
522 522
523 J_ASSERT (bufs == 0); 523 J_ASSERT (bufs == 0);
524 524
525 jbd_debug(4, "JBD: get descriptor\n"); 525 jbd_debug(4, "JBD2: get descriptor\n");
526 526
527 descriptor = jbd2_journal_get_descriptor_buffer(journal); 527 descriptor = jbd2_journal_get_descriptor_buffer(journal);
528 if (!descriptor) { 528 if (!descriptor) {
@@ -531,7 +531,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
531 } 531 }
532 532
533 bh = jh2bh(descriptor); 533 bh = jh2bh(descriptor);
534 jbd_debug(4, "JBD: got buffer %llu (%p)\n", 534 jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
535 (unsigned long long)bh->b_blocknr, bh->b_data); 535 (unsigned long long)bh->b_blocknr, bh->b_data);
536 header = (journal_header_t *)&bh->b_data[0]; 536 header = (journal_header_t *)&bh->b_data[0];
537 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 537 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
@@ -625,7 +625,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
625 commit_transaction->t_buffers == NULL || 625 commit_transaction->t_buffers == NULL ||
626 space_left < tag_bytes + 16) { 626 space_left < tag_bytes + 16) {
627 627
628 jbd_debug(4, "JBD: Submit %d IOs\n", bufs); 628 jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
629 629
630 /* Write an end-of-descriptor marker before 630 /* Write an end-of-descriptor marker before
631 submitting the IOs. "tag" still points to 631 submitting the IOs. "tag" still points to
@@ -707,7 +707,7 @@ start_journal_io:
707 so we incur less scheduling load. 707 so we incur less scheduling load.
708 */ 708 */
709 709
710 jbd_debug(3, "JBD: commit phase 3\n"); 710 jbd_debug(3, "JBD2: commit phase 3\n");
711 711
712 /* 712 /*
713 * akpm: these are BJ_IO, and j_list_lock is not needed. 713 * akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -771,7 +771,7 @@ wait_for_iobuf:
771 771
772 J_ASSERT (commit_transaction->t_shadow_list == NULL); 772 J_ASSERT (commit_transaction->t_shadow_list == NULL);
773 773
774 jbd_debug(3, "JBD: commit phase 4\n"); 774 jbd_debug(3, "JBD2: commit phase 4\n");
775 775
776 /* Here we wait for the revoke record and descriptor record buffers */ 776 /* Here we wait for the revoke record and descriptor record buffers */
777 wait_for_ctlbuf: 777 wait_for_ctlbuf:
@@ -801,7 +801,7 @@ wait_for_iobuf:
801 if (err) 801 if (err)
802 jbd2_journal_abort(journal, err); 802 jbd2_journal_abort(journal, err);
803 803
804 jbd_debug(3, "JBD: commit phase 5\n"); 804 jbd_debug(3, "JBD2: commit phase 5\n");
805 write_lock(&journal->j_state_lock); 805 write_lock(&journal->j_state_lock);
806 J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); 806 J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
807 commit_transaction->t_state = T_COMMIT_JFLUSH; 807 commit_transaction->t_state = T_COMMIT_JFLUSH;
@@ -830,7 +830,7 @@ wait_for_iobuf:
830 transaction can be removed from any checkpoint list it was on 830 transaction can be removed from any checkpoint list it was on
831 before. */ 831 before. */
832 832
833 jbd_debug(3, "JBD: commit phase 6\n"); 833 jbd_debug(3, "JBD2: commit phase 6\n");
834 834
835 J_ASSERT(list_empty(&commit_transaction->t_inode_list)); 835 J_ASSERT(list_empty(&commit_transaction->t_inode_list));
836 J_ASSERT(commit_transaction->t_buffers == NULL); 836 J_ASSERT(commit_transaction->t_buffers == NULL);
@@ -964,7 +964,7 @@ restart_loop:
964 964
965 /* Done with this transaction! */ 965 /* Done with this transaction! */
966 966
967 jbd_debug(3, "JBD: commit phase 7\n"); 967 jbd_debug(3, "JBD2: commit phase 7\n");
968 968
969 J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); 969 J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
970 970
@@ -1039,7 +1039,7 @@ restart_loop:
1039 journal->j_commit_callback(journal, commit_transaction); 1039 journal->j_commit_callback(journal, commit_transaction);
1040 1040
1041 trace_jbd2_end_commit(journal, commit_transaction); 1041 trace_jbd2_end_commit(journal, commit_transaction);
1042 jbd_debug(1, "JBD: commit %d complete, head %d\n", 1042 jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1043 journal->j_commit_sequence, journal->j_tail_sequence); 1043 journal->j_commit_sequence, journal->j_tail_sequence);
1044 if (to_free) 1044 if (to_free)
1045 kfree(commit_transaction); 1045 kfree(commit_transaction);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index f24df13adc4..0fa0123151d 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -491,7 +491,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
491 */ 491 */
492 492
493 journal->j_commit_request = target; 493 journal->j_commit_request = target;
494 jbd_debug(1, "JBD: requesting commit %d/%d\n", 494 jbd_debug(1, "JBD2: requesting commit %d/%d\n",
495 journal->j_commit_request, 495 journal->j_commit_request,
496 journal->j_commit_sequence); 496 journal->j_commit_sequence);
497 wake_up(&journal->j_wait_commit); 497 wake_up(&journal->j_wait_commit);
@@ -500,7 +500,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
500 /* This should never happen, but if it does, preserve 500 /* This should never happen, but if it does, preserve
501 the evidence before kjournald goes into a loop and 501 the evidence before kjournald goes into a loop and
502 increments j_commit_sequence beyond all recognition. */ 502 increments j_commit_sequence beyond all recognition. */
503 WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", 503 WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n",
504 journal->j_commit_request, 504 journal->j_commit_request,
505 journal->j_commit_sequence, 505 journal->j_commit_sequence,
506 target, journal->j_running_transaction ? 506 target, journal->j_running_transaction ?
@@ -645,7 +645,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
645 } 645 }
646#endif 646#endif
647 while (tid_gt(tid, journal->j_commit_sequence)) { 647 while (tid_gt(tid, journal->j_commit_sequence)) {
648 jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", 648 jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n",
649 tid, journal->j_commit_sequence); 649 tid, journal->j_commit_sequence);
650 wake_up(&journal->j_wait_commit); 650 wake_up(&journal->j_wait_commit);
651 read_unlock(&journal->j_state_lock); 651 read_unlock(&journal->j_state_lock);
@@ -1093,7 +1093,7 @@ static int journal_reset(journal_t *journal)
1093 first = be32_to_cpu(sb->s_first); 1093 first = be32_to_cpu(sb->s_first);
1094 last = be32_to_cpu(sb->s_maxlen); 1094 last = be32_to_cpu(sb->s_maxlen);
1095 if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) { 1095 if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
1096 printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n", 1096 printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n",
1097 first, last); 1097 first, last);
1098 journal_fail_superblock(journal); 1098 journal_fail_superblock(journal);
1099 return -EINVAL; 1099 return -EINVAL;
@@ -1139,7 +1139,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1139 */ 1139 */
1140 if (sb->s_start == 0 && journal->j_tail_sequence == 1140 if (sb->s_start == 0 && journal->j_tail_sequence ==
1141 journal->j_transaction_sequence) { 1141 journal->j_transaction_sequence) {
1142 jbd_debug(1,"JBD: Skipping superblock update on recovered sb " 1142 jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
1143 "(start %ld, seq %d, errno %d)\n", 1143 "(start %ld, seq %d, errno %d)\n",
1144 journal->j_tail, journal->j_tail_sequence, 1144 journal->j_tail, journal->j_tail_sequence,
1145 journal->j_errno); 1145 journal->j_errno);
@@ -1163,7 +1163,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1163 } 1163 }
1164 1164
1165 read_lock(&journal->j_state_lock); 1165 read_lock(&journal->j_state_lock);
1166 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", 1166 jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d, errno %d)\n",
1167 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 1167 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
1168 1168
1169 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); 1169 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
@@ -1216,8 +1216,8 @@ static int journal_get_superblock(journal_t *journal)
1216 ll_rw_block(READ, 1, &bh); 1216 ll_rw_block(READ, 1, &bh);
1217 wait_on_buffer(bh); 1217 wait_on_buffer(bh);
1218 if (!buffer_uptodate(bh)) { 1218 if (!buffer_uptodate(bh)) {
1219 printk (KERN_ERR 1219 printk(KERN_ERR
1220 "JBD: IO error reading journal superblock\n"); 1220 "JBD2: IO error reading journal superblock\n");
1221 goto out; 1221 goto out;
1222 } 1222 }
1223 } 1223 }
@@ -1228,7 +1228,7 @@ static int journal_get_superblock(journal_t *journal)
1228 1228
1229 if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || 1229 if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
1230 sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { 1230 sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
1231 printk(KERN_WARNING "JBD: no valid journal superblock found\n"); 1231 printk(KERN_WARNING "JBD2: no valid journal superblock found\n");
1232 goto out; 1232 goto out;
1233 } 1233 }
1234 1234
@@ -1240,14 +1240,22 @@ static int journal_get_superblock(journal_t *journal)
1240 journal->j_format_version = 2; 1240 journal->j_format_version = 2;
1241 break; 1241 break;
1242 default: 1242 default:
1243 printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); 1243 printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n");
1244 goto out; 1244 goto out;
1245 } 1245 }
1246 1246
1247 if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) 1247 if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
1248 journal->j_maxlen = be32_to_cpu(sb->s_maxlen); 1248 journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
1249 else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { 1249 else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
1250 printk (KERN_WARNING "JBD: journal file too short\n"); 1250 printk(KERN_WARNING "JBD2: journal file too short\n");
1251 goto out;
1252 }
1253
1254 if (be32_to_cpu(sb->s_first) == 0 ||
1255 be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
1256 printk(KERN_WARNING
1257 "JBD2: Invalid start block of journal: %u\n",
1258 be32_to_cpu(sb->s_first));
1251 goto out; 1259 goto out;
1252 } 1260 }
1253 1261
@@ -1310,8 +1318,8 @@ int jbd2_journal_load(journal_t *journal)
1310 ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || 1318 ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
1311 (sb->s_feature_incompat & 1319 (sb->s_feature_incompat &
1312 ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { 1320 ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
1313 printk (KERN_WARNING 1321 printk(KERN_WARNING
1314 "JBD: Unrecognised features on journal\n"); 1322 "JBD2: Unrecognised features on journal\n");
1315 return -EINVAL; 1323 return -EINVAL;
1316 } 1324 }
1317 } 1325 }
@@ -1346,7 +1354,7 @@ int jbd2_journal_load(journal_t *journal)
1346 return 0; 1354 return 0;
1347 1355
1348recovery_error: 1356recovery_error:
1349 printk (KERN_WARNING "JBD: recovery failed\n"); 1357 printk(KERN_WARNING "JBD2: recovery failed\n");
1350 return -EIO; 1358 return -EIO;
1351} 1359}
1352 1360
@@ -1577,7 +1585,7 @@ static int journal_convert_superblock_v1(journal_t *journal,
1577 struct buffer_head *bh; 1585 struct buffer_head *bh;
1578 1586
1579 printk(KERN_WARNING 1587 printk(KERN_WARNING
1580 "JBD: Converting superblock from version 1 to 2.\n"); 1588 "JBD2: Converting superblock from version 1 to 2.\n");
1581 1589
1582 /* Pre-initialise new fields to zero */ 1590 /* Pre-initialise new fields to zero */
1583 offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); 1591 offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
@@ -1694,7 +1702,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
1694 if (!journal->j_tail) 1702 if (!journal->j_tail)
1695 goto no_recovery; 1703 goto no_recovery;
1696 1704
1697 printk (KERN_WARNING "JBD: %s recovery information on journal\n", 1705 printk(KERN_WARNING "JBD2: %s recovery information on journal\n",
1698 write ? "Clearing" : "Ignoring"); 1706 write ? "Clearing" : "Ignoring");
1699 1707
1700 err = jbd2_journal_skip_recovery(journal); 1708 err = jbd2_journal_skip_recovery(journal);
@@ -2020,7 +2028,7 @@ static int journal_init_jbd2_journal_head_cache(void)
2020 retval = 0; 2028 retval = 0;
2021 if (!jbd2_journal_head_cache) { 2029 if (!jbd2_journal_head_cache) {
2022 retval = -ENOMEM; 2030 retval = -ENOMEM;
2023 printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); 2031 printk(KERN_EMERG "JBD2: no memory for journal_head cache\n");
2024 } 2032 }
2025 return retval; 2033 return retval;
2026} 2034}
@@ -2383,7 +2391,7 @@ static void __exit journal_exit(void)
2383#ifdef CONFIG_JBD2_DEBUG 2391#ifdef CONFIG_JBD2_DEBUG
2384 int n = atomic_read(&nr_journal_heads); 2392 int n = atomic_read(&nr_journal_heads);
2385 if (n) 2393 if (n)
2386 printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); 2394 printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n);
2387#endif 2395#endif
2388 jbd2_remove_debugfs_entry(); 2396 jbd2_remove_debugfs_entry();
2389 jbd2_remove_jbd_stats_proc_entry(); 2397 jbd2_remove_jbd_stats_proc_entry();
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 1cad869494f..da6d7baf139 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -89,7 +89,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
89 err = jbd2_journal_bmap(journal, next, &blocknr); 89 err = jbd2_journal_bmap(journal, next, &blocknr);
90 90
91 if (err) { 91 if (err) {
92 printk (KERN_ERR "JBD: bad block at offset %u\n", 92 printk(KERN_ERR "JBD2: bad block at offset %u\n",
93 next); 93 next);
94 goto failed; 94 goto failed;
95 } 95 }
@@ -138,14 +138,14 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
138 *bhp = NULL; 138 *bhp = NULL;
139 139
140 if (offset >= journal->j_maxlen) { 140 if (offset >= journal->j_maxlen) {
141 printk(KERN_ERR "JBD: corrupted journal superblock\n"); 141 printk(KERN_ERR "JBD2: corrupted journal superblock\n");
142 return -EIO; 142 return -EIO;
143 } 143 }
144 144
145 err = jbd2_journal_bmap(journal, offset, &blocknr); 145 err = jbd2_journal_bmap(journal, offset, &blocknr);
146 146
147 if (err) { 147 if (err) {
148 printk (KERN_ERR "JBD: bad block at offset %u\n", 148 printk(KERN_ERR "JBD2: bad block at offset %u\n",
149 offset); 149 offset);
150 return err; 150 return err;
151 } 151 }
@@ -163,7 +163,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
163 } 163 }
164 164
165 if (!buffer_uptodate(bh)) { 165 if (!buffer_uptodate(bh)) {
166 printk (KERN_ERR "JBD: Failed to read block at offset %u\n", 166 printk(KERN_ERR "JBD2: Failed to read block at offset %u\n",
167 offset); 167 offset);
168 brelse(bh); 168 brelse(bh);
169 return -EIO; 169 return -EIO;
@@ -251,10 +251,10 @@ int jbd2_journal_recover(journal_t *journal)
251 if (!err) 251 if (!err)
252 err = do_one_pass(journal, &info, PASS_REPLAY); 252 err = do_one_pass(journal, &info, PASS_REPLAY);
253 253
254 jbd_debug(1, "JBD: recovery, exit status %d, " 254 jbd_debug(1, "JBD2: recovery, exit status %d, "
255 "recovered transactions %u to %u\n", 255 "recovered transactions %u to %u\n",
256 err, info.start_transaction, info.end_transaction); 256 err, info.start_transaction, info.end_transaction);
257 jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n", 257 jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n",
258 info.nr_replays, info.nr_revoke_hits, info.nr_revokes); 258 info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
259 259
260 /* Restart the log at the next transaction ID, thus invalidating 260 /* Restart the log at the next transaction ID, thus invalidating
@@ -293,14 +293,14 @@ int jbd2_journal_skip_recovery(journal_t *journal)
293 err = do_one_pass(journal, &info, PASS_SCAN); 293 err = do_one_pass(journal, &info, PASS_SCAN);
294 294
295 if (err) { 295 if (err) {
296 printk(KERN_ERR "JBD: error %d scanning journal\n", err); 296 printk(KERN_ERR "JBD2: error %d scanning journal\n", err);
297 ++journal->j_transaction_sequence; 297 ++journal->j_transaction_sequence;
298 } else { 298 } else {
299#ifdef CONFIG_JBD2_DEBUG 299#ifdef CONFIG_JBD2_DEBUG
300 int dropped = info.end_transaction - 300 int dropped = info.end_transaction -
301 be32_to_cpu(journal->j_superblock->s_sequence); 301 be32_to_cpu(journal->j_superblock->s_sequence);
302 jbd_debug(1, 302 jbd_debug(1,
303 "JBD: ignoring %d transaction%s from the journal.\n", 303 "JBD2: ignoring %d transaction%s from the journal.\n",
304 dropped, (dropped == 1) ? "" : "s"); 304 dropped, (dropped == 1) ? "" : "s");
305#endif 305#endif
306 journal->j_transaction_sequence = ++info.end_transaction; 306 journal->j_transaction_sequence = ++info.end_transaction;
@@ -338,7 +338,7 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh,
338 wrap(journal, *next_log_block); 338 wrap(journal, *next_log_block);
339 err = jread(&obh, journal, io_block); 339 err = jread(&obh, journal, io_block);
340 if (err) { 340 if (err) {
341 printk(KERN_ERR "JBD: IO error %d recovering block " 341 printk(KERN_ERR "JBD2: IO error %d recovering block "
342 "%lu in log\n", err, io_block); 342 "%lu in log\n", err, io_block);
343 return 1; 343 return 1;
344 } else { 344 } else {
@@ -411,7 +411,7 @@ static int do_one_pass(journal_t *journal,
411 * either the next descriptor block or the final commit 411 * either the next descriptor block or the final commit
412 * record. */ 412 * record. */
413 413
414 jbd_debug(3, "JBD: checking block %ld\n", next_log_block); 414 jbd_debug(3, "JBD2: checking block %ld\n", next_log_block);
415 err = jread(&bh, journal, next_log_block); 415 err = jread(&bh, journal, next_log_block);
416 if (err) 416 if (err)
417 goto failed; 417 goto failed;
@@ -491,8 +491,8 @@ static int do_one_pass(journal_t *journal,
491 /* Recover what we can, but 491 /* Recover what we can, but
492 * report failure at the end. */ 492 * report failure at the end. */
493 success = err; 493 success = err;
494 printk (KERN_ERR 494 printk(KERN_ERR
495 "JBD: IO error %d recovering " 495 "JBD2: IO error %d recovering "
496 "block %ld in log\n", 496 "block %ld in log\n",
497 err, io_block); 497 err, io_block);
498 } else { 498 } else {
@@ -520,7 +520,7 @@ static int do_one_pass(journal_t *journal,
520 journal->j_blocksize); 520 journal->j_blocksize);
521 if (nbh == NULL) { 521 if (nbh == NULL) {
522 printk(KERN_ERR 522 printk(KERN_ERR
523 "JBD: Out of memory " 523 "JBD2: Out of memory "
524 "during recovery.\n"); 524 "during recovery.\n");
525 err = -ENOMEM; 525 err = -ENOMEM;
526 brelse(bh); 526 brelse(bh);
@@ -689,7 +689,7 @@ static int do_one_pass(journal_t *journal,
689 /* It's really bad news if different passes end up at 689 /* It's really bad news if different passes end up at
690 * different places (but possible due to IO errors). */ 690 * different places (but possible due to IO errors). */
691 if (info->end_transaction != next_commit_ID) { 691 if (info->end_transaction != next_commit_ID) {
692 printk (KERN_ERR "JBD: recovery pass %d ended at " 692 printk(KERN_ERR "JBD2: recovery pass %d ended at "
693 "transaction %u, expected %u\n", 693 "transaction %u, expected %u\n",
694 pass, next_commit_ID, info->end_transaction); 694 pass, next_commit_ID, info->end_transaction);
695 if (!success) 695 if (!success)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 2d7109414cd..a0e41a4c080 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -27,6 +27,7 @@
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/hrtimer.h> 28#include <linux/hrtimer.h>
29#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
30#include <linux/bug.h>
30#include <linux/module.h> 31#include <linux/module.h>
31 32
32static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); 33static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
@@ -115,7 +116,7 @@ static inline void update_t_max_wait(transaction_t *transaction,
115 */ 116 */
116 117
117static int start_this_handle(journal_t *journal, handle_t *handle, 118static int start_this_handle(journal_t *journal, handle_t *handle,
118 int gfp_mask) 119 gfp_t gfp_mask)
119{ 120{
120 transaction_t *transaction, *new_transaction = NULL; 121 transaction_t *transaction, *new_transaction = NULL;
121 tid_t tid; 122 tid_t tid;
@@ -124,7 +125,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
124 unsigned long ts = jiffies; 125 unsigned long ts = jiffies;
125 126
126 if (nblocks > journal->j_max_transaction_buffers) { 127 if (nblocks > journal->j_max_transaction_buffers) {
127 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", 128 printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n",
128 current->comm, nblocks, 129 current->comm, nblocks,
129 journal->j_max_transaction_buffers); 130 journal->j_max_transaction_buffers);
130 return -ENOSPC; 131 return -ENOSPC;
@@ -320,7 +321,7 @@ static handle_t *new_handle(int nblocks)
320 * Return a pointer to a newly allocated handle, or an ERR_PTR() value 321 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
321 * on failure. 322 * on failure.
322 */ 323 */
323handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) 324handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask)
324{ 325{
325 handle_t *handle = journal_current_handle(); 326 handle_t *handle = journal_current_handle();
326 int err; 327 int err;
@@ -443,7 +444,7 @@ out:
443 * transaction capabable of guaranteeing the requested number of 444 * transaction capabable of guaranteeing the requested number of
444 * credits. 445 * credits.
445 */ 446 */
446int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) 447int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
447{ 448{
448 transaction_t *transaction = handle->h_transaction; 449 transaction_t *transaction = handle->h_transaction;
449 journal_t *journal = transaction->t_journal; 450 journal_t *journal = transaction->t_journal;
@@ -563,7 +564,7 @@ static void warn_dirty_buffer(struct buffer_head *bh)
563 char b[BDEVNAME_SIZE]; 564 char b[BDEVNAME_SIZE];
564 565
565 printk(KERN_WARNING 566 printk(KERN_WARNING
566 "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " 567 "JBD2: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
567 "There's a risk of filesystem corruption in case of system " 568 "There's a risk of filesystem corruption in case of system "
568 "crash.\n", 569 "crash.\n",
569 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); 570 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
@@ -1049,6 +1050,10 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
1049 * mark dirty metadata which needs to be journaled as part of the current 1050 * mark dirty metadata which needs to be journaled as part of the current
1050 * transaction. 1051 * transaction.
1051 * 1052 *
1053 * The buffer must have previously had jbd2_journal_get_write_access()
1054 * called so that it has a valid journal_head attached to the buffer
1055 * head.
1056 *
1052 * The buffer is placed on the transaction's metadata list and is marked 1057 * The buffer is placed on the transaction's metadata list and is marked
1053 * as belonging to the transaction. 1058 * as belonging to the transaction.
1054 * 1059 *
@@ -1065,11 +1070,16 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1065 transaction_t *transaction = handle->h_transaction; 1070 transaction_t *transaction = handle->h_transaction;
1066 journal_t *journal = transaction->t_journal; 1071 journal_t *journal = transaction->t_journal;
1067 struct journal_head *jh = bh2jh(bh); 1072 struct journal_head *jh = bh2jh(bh);
1073 int ret = 0;
1068 1074
1069 jbd_debug(5, "journal_head %p\n", jh); 1075 jbd_debug(5, "journal_head %p\n", jh);
1070 JBUFFER_TRACE(jh, "entry"); 1076 JBUFFER_TRACE(jh, "entry");
1071 if (is_handle_aborted(handle)) 1077 if (is_handle_aborted(handle))
1072 goto out; 1078 goto out;
1079 if (!buffer_jbd(bh)) {
1080 ret = -EUCLEAN;
1081 goto out;
1082 }
1073 1083
1074 jbd_lock_bh_state(bh); 1084 jbd_lock_bh_state(bh);
1075 1085
@@ -1093,8 +1103,20 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1093 */ 1103 */
1094 if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { 1104 if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
1095 JBUFFER_TRACE(jh, "fastpath"); 1105 JBUFFER_TRACE(jh, "fastpath");
1096 J_ASSERT_JH(jh, jh->b_transaction == 1106 if (unlikely(jh->b_transaction !=
1097 journal->j_running_transaction); 1107 journal->j_running_transaction)) {
1108 printk(KERN_EMERG "JBD: %s: "
1109 "jh->b_transaction (%llu, %p, %u) != "
1110 "journal->j_running_transaction (%p, %u)",
1111 journal->j_devname,
1112 (unsigned long long) bh->b_blocknr,
1113 jh->b_transaction,
1114 jh->b_transaction ? jh->b_transaction->t_tid : 0,
1115 journal->j_running_transaction,
1116 journal->j_running_transaction ?
1117 journal->j_running_transaction->t_tid : 0);
1118 ret = -EINVAL;
1119 }
1098 goto out_unlock_bh; 1120 goto out_unlock_bh;
1099 } 1121 }
1100 1122
@@ -1108,9 +1130,32 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1108 */ 1130 */
1109 if (jh->b_transaction != transaction) { 1131 if (jh->b_transaction != transaction) {
1110 JBUFFER_TRACE(jh, "already on other transaction"); 1132 JBUFFER_TRACE(jh, "already on other transaction");
1111 J_ASSERT_JH(jh, jh->b_transaction == 1133 if (unlikely(jh->b_transaction !=
1112 journal->j_committing_transaction); 1134 journal->j_committing_transaction)) {
1113 J_ASSERT_JH(jh, jh->b_next_transaction == transaction); 1135 printk(KERN_EMERG "JBD: %s: "
1136 "jh->b_transaction (%llu, %p, %u) != "
1137 "journal->j_committing_transaction (%p, %u)",
1138 journal->j_devname,
1139 (unsigned long long) bh->b_blocknr,
1140 jh->b_transaction,
1141 jh->b_transaction ? jh->b_transaction->t_tid : 0,
1142 journal->j_committing_transaction,
1143 journal->j_committing_transaction ?
1144 journal->j_committing_transaction->t_tid : 0);
1145 ret = -EINVAL;
1146 }
1147 if (unlikely(jh->b_next_transaction != transaction)) {
1148 printk(KERN_EMERG "JBD: %s: "
1149 "jh->b_next_transaction (%llu, %p, %u) != "
1150 "transaction (%p, %u)",
1151 journal->j_devname,
1152 (unsigned long long) bh->b_blocknr,
1153 jh->b_next_transaction,
1154 jh->b_next_transaction ?
1155 jh->b_next_transaction->t_tid : 0,
1156 transaction, transaction->t_tid);
1157 ret = -EINVAL;
1158 }
1114 /* And this case is illegal: we can't reuse another 1159 /* And this case is illegal: we can't reuse another
1115 * transaction's data buffer, ever. */ 1160 * transaction's data buffer, ever. */
1116 goto out_unlock_bh; 1161 goto out_unlock_bh;
@@ -1127,7 +1172,8 @@ out_unlock_bh:
1127 jbd_unlock_bh_state(bh); 1172 jbd_unlock_bh_state(bh);
1128out: 1173out:
1129 JBUFFER_TRACE(jh, "exit"); 1174 JBUFFER_TRACE(jh, "exit");
1130 return 0; 1175 WARN_ON(ret); /* All errors are bugs, so dump the stack */
1176 return ret;
1131} 1177}
1132 1178
1133/* 1179/*
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index de4247021d2..5b6c9d1a2fb 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -53,6 +53,78 @@ static int jffs2_is_best_compression(struct jffs2_compressor *this,
53 return 0; 53 return 0;
54} 54}
55 55
56/*
57 * jffs2_selected_compress:
58 * @compr: Explicit compression type to use (ie, JFFS2_COMPR_ZLIB).
59 * If 0, just take the first available compression mode.
60 * @data_in: Pointer to uncompressed data
61 * @cpage_out: Pointer to returned pointer to buffer for compressed data
62 * @datalen: On entry, holds the amount of data available for compression.
63 * On exit, expected to hold the amount of data actually compressed.
64 * @cdatalen: On entry, holds the amount of space available for compressed
65 * data. On exit, expected to hold the actual size of the compressed
66 * data.
67 *
68 * Returns: the compression type used. Zero is used to show that the data
69 * could not be compressed; probably because we couldn't find the requested
70 * compression mode.
71 */
72static int jffs2_selected_compress(u8 compr, unsigned char *data_in,
73 unsigned char **cpage_out, u32 *datalen, u32 *cdatalen)
74{
75 struct jffs2_compressor *this;
76 int err, ret = JFFS2_COMPR_NONE;
77 uint32_t orig_slen, orig_dlen;
78 char *output_buf;
79
80 output_buf = kmalloc(*cdatalen, GFP_KERNEL);
81 if (!output_buf) {
82 printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n");
83 return ret;
84 }
85 orig_slen = *datalen;
86 orig_dlen = *cdatalen;
87 spin_lock(&jffs2_compressor_list_lock);
88 list_for_each_entry(this, &jffs2_compressor_list, list) {
89 /* Skip decompress-only and disabled modules */
90 if (!this->compress || this->disabled)
91 continue;
92
93 /* Skip if not the desired compression type */
94 if (compr && (compr != this->compr))
95 continue;
96
97 /*
98 * Either compression type was unspecified, or we found our
99 * compressor; either way, we're good to go.
100 */
101 this->usecount++;
102 spin_unlock(&jffs2_compressor_list_lock);
103
104 *datalen = orig_slen;
105 *cdatalen = orig_dlen;
106 err = this->compress(data_in, output_buf, datalen, cdatalen);
107
108 spin_lock(&jffs2_compressor_list_lock);
109 this->usecount--;
110 if (!err) {
111 /* Success */
112 ret = this->compr;
113 this->stat_compr_blocks++;
114 this->stat_compr_orig_size += *datalen;
115 this->stat_compr_new_size += *cdatalen;
116 break;
117 }
118 }
119 spin_unlock(&jffs2_compressor_list_lock);
120 if (ret == JFFS2_COMPR_NONE)
121 kfree(output_buf);
122 else
123 *cpage_out = output_buf;
124
125 return ret;
126}
127
56/* jffs2_compress: 128/* jffs2_compress:
57 * @data_in: Pointer to uncompressed data 129 * @data_in: Pointer to uncompressed data
58 * @cpage_out: Pointer to returned pointer to buffer for compressed data 130 * @cpage_out: Pointer to returned pointer to buffer for compressed data
@@ -76,47 +148,23 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
76 uint32_t *datalen, uint32_t *cdatalen) 148 uint32_t *datalen, uint32_t *cdatalen)
77{ 149{
78 int ret = JFFS2_COMPR_NONE; 150 int ret = JFFS2_COMPR_NONE;
79 int compr_ret; 151 int mode, compr_ret;
80 struct jffs2_compressor *this, *best=NULL; 152 struct jffs2_compressor *this, *best=NULL;
81 unsigned char *output_buf = NULL, *tmp_buf; 153 unsigned char *output_buf = NULL, *tmp_buf;
82 uint32_t orig_slen, orig_dlen; 154 uint32_t orig_slen, orig_dlen;
83 uint32_t best_slen=0, best_dlen=0; 155 uint32_t best_slen=0, best_dlen=0;
84 156
85 switch (jffs2_compression_mode) { 157 if (c->mount_opts.override_compr)
158 mode = c->mount_opts.compr;
159 else
160 mode = jffs2_compression_mode;
161
162 switch (mode) {
86 case JFFS2_COMPR_MODE_NONE: 163 case JFFS2_COMPR_MODE_NONE:
87 break; 164 break;
88 case JFFS2_COMPR_MODE_PRIORITY: 165 case JFFS2_COMPR_MODE_PRIORITY:
89 output_buf = kmalloc(*cdatalen,GFP_KERNEL); 166 ret = jffs2_selected_compress(0, data_in, cpage_out, datalen,
90 if (!output_buf) { 167 cdatalen);
91 printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n");
92 goto out;
93 }
94 orig_slen = *datalen;
95 orig_dlen = *cdatalen;
96 spin_lock(&jffs2_compressor_list_lock);
97 list_for_each_entry(this, &jffs2_compressor_list, list) {
98 /* Skip decompress-only backwards-compatibility and disabled modules */
99 if ((!this->compress)||(this->disabled))
100 continue;
101
102 this->usecount++;
103 spin_unlock(&jffs2_compressor_list_lock);
104 *datalen = orig_slen;
105 *cdatalen = orig_dlen;
106 compr_ret = this->compress(data_in, output_buf, datalen, cdatalen);
107 spin_lock(&jffs2_compressor_list_lock);
108 this->usecount--;
109 if (!compr_ret) {
110 ret = this->compr;
111 this->stat_compr_blocks++;
112 this->stat_compr_orig_size += *datalen;
113 this->stat_compr_new_size += *cdatalen;
114 break;
115 }
116 }
117 spin_unlock(&jffs2_compressor_list_lock);
118 if (ret == JFFS2_COMPR_NONE)
119 kfree(output_buf);
120 break; 168 break;
121 case JFFS2_COMPR_MODE_SIZE: 169 case JFFS2_COMPR_MODE_SIZE:
122 case JFFS2_COMPR_MODE_FAVOURLZO: 170 case JFFS2_COMPR_MODE_FAVOURLZO:
@@ -174,22 +222,28 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
174 best->stat_compr_orig_size += best_slen; 222 best->stat_compr_orig_size += best_slen;
175 best->stat_compr_new_size += best_dlen; 223 best->stat_compr_new_size += best_dlen;
176 ret = best->compr; 224 ret = best->compr;
225 *cpage_out = output_buf;
177 } 226 }
178 spin_unlock(&jffs2_compressor_list_lock); 227 spin_unlock(&jffs2_compressor_list_lock);
179 break; 228 break;
229 case JFFS2_COMPR_MODE_FORCELZO:
230 ret = jffs2_selected_compress(JFFS2_COMPR_LZO, data_in,
231 cpage_out, datalen, cdatalen);
232 break;
233 case JFFS2_COMPR_MODE_FORCEZLIB:
234 ret = jffs2_selected_compress(JFFS2_COMPR_ZLIB, data_in,
235 cpage_out, datalen, cdatalen);
236 break;
180 default: 237 default:
181 printk(KERN_ERR "JFFS2: unknown compression mode.\n"); 238 printk(KERN_ERR "JFFS2: unknown compression mode.\n");
182 } 239 }
183 out: 240
184 if (ret == JFFS2_COMPR_NONE) { 241 if (ret == JFFS2_COMPR_NONE) {
185 *cpage_out = data_in; 242 *cpage_out = data_in;
186 *datalen = *cdatalen; 243 *datalen = *cdatalen;
187 none_stat_compr_blocks++; 244 none_stat_compr_blocks++;
188 none_stat_compr_size += *datalen; 245 none_stat_compr_size += *datalen;
189 } 246 }
190 else {
191 *cpage_out = output_buf;
192 }
193 return ret; 247 return ret;
194} 248}
195 249
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index 13bb7597ab3..5e91d578f4e 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -40,6 +40,8 @@
40#define JFFS2_COMPR_MODE_PRIORITY 1 40#define JFFS2_COMPR_MODE_PRIORITY 1
41#define JFFS2_COMPR_MODE_SIZE 2 41#define JFFS2_COMPR_MODE_SIZE 2
42#define JFFS2_COMPR_MODE_FAVOURLZO 3 42#define JFFS2_COMPR_MODE_FAVOURLZO 3
43#define JFFS2_COMPR_MODE_FORCELZO 4
44#define JFFS2_COMPR_MODE_FORCEZLIB 5
43 45
44#define FAVOUR_LZO_PERCENT 80 46#define FAVOUR_LZO_PERCENT 80
45 47
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 9659b7c0046..be6169bd8ac 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -245,7 +245,7 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry)
245 ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, 245 ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name,
246 dentry->d_name.len, dead_f, now); 246 dentry->d_name.len, dead_f, now);
247 if (dead_f->inocache) 247 if (dead_f->inocache)
248 dentry->d_inode->i_nlink = dead_f->inocache->pino_nlink; 248 set_nlink(dentry->d_inode, dead_f->inocache->pino_nlink);
249 if (!ret) 249 if (!ret)
250 dir_i->i_mtime = dir_i->i_ctime = ITIME(now); 250 dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
251 return ret; 251 return ret;
@@ -278,7 +278,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
278 278
279 if (!ret) { 279 if (!ret) {
280 mutex_lock(&f->sem); 280 mutex_lock(&f->sem);
281 old_dentry->d_inode->i_nlink = ++f->inocache->pino_nlink; 281 set_nlink(old_dentry->d_inode, ++f->inocache->pino_nlink);
282 mutex_unlock(&f->sem); 282 mutex_unlock(&f->sem);
283 d_instantiate(dentry, old_dentry->d_inode); 283 d_instantiate(dentry, old_dentry->d_inode);
284 dir_i->i_mtime = dir_i->i_ctime = ITIME(now); 284 dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
@@ -497,7 +497,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
497 f = JFFS2_INODE_INFO(inode); 497 f = JFFS2_INODE_INFO(inode);
498 498
499 /* Directories get nlink 2 at start */ 499 /* Directories get nlink 2 at start */
500 inode->i_nlink = 2; 500 set_nlink(inode, 2);
501 /* but ic->pino_nlink is the parent ino# */ 501 /* but ic->pino_nlink is the parent ino# */
502 f->inocache->pino_nlink = dir_i->i_ino; 502 f->inocache->pino_nlink = dir_i->i_ino;
503 503
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index bbcb9755dd2..4b8afe39a87 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -278,7 +278,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
278 inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime)); 278 inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime));
279 inode->i_ctime = ITIME(je32_to_cpu(latest_node.ctime)); 279 inode->i_ctime = ITIME(je32_to_cpu(latest_node.ctime));
280 280
281 inode->i_nlink = f->inocache->pino_nlink; 281 set_nlink(inode, f->inocache->pino_nlink);
282 282
283 inode->i_blocks = (inode->i_size + 511) >> 9; 283 inode->i_blocks = (inode->i_size + 511) >> 9;
284 284
@@ -291,7 +291,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
291 case S_IFDIR: 291 case S_IFDIR:
292 { 292 {
293 struct jffs2_full_dirent *fd; 293 struct jffs2_full_dirent *fd;
294 inode->i_nlink = 2; /* parent and '.' */ 294 set_nlink(inode, 2); /* parent and '.' */
295 295
296 for (fd=f->dents; fd; fd = fd->next) { 296 for (fd=f->dents; fd; fd = fd->next) {
297 if (fd->type == DT_DIR && fd->ino) 297 if (fd->type == DT_DIR && fd->ino)
@@ -379,7 +379,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags)
379 jffs2_do_setattr(inode, &iattr); 379 jffs2_do_setattr(inode, &iattr);
380} 380}
381 381
382int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) 382int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data)
383{ 383{
384 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); 384 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
385 385
@@ -453,7 +453,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
453 iput(inode); 453 iput(inode);
454 return ERR_PTR(ret); 454 return ERR_PTR(ret);
455 } 455 }
456 inode->i_nlink = 1; 456 set_nlink(inode, 1);
457 inode->i_ino = je32_to_cpu(ri->ino); 457 inode->i_ino = je32_to_cpu(ri->ino);
458 inode->i_mode = jemode_to_cpu(ri->mode); 458 inode->i_mode = jemode_to_cpu(ri->mode);
459 inode->i_gid = je16_to_cpu(ri->gid); 459 inode->i_gid = je16_to_cpu(ri->gid);
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 0bc6a6c80a5..55a0c1dcead 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -29,6 +29,11 @@
29 29
30struct jffs2_inodirty; 30struct jffs2_inodirty;
31 31
32struct jffs2_mount_opts {
33 bool override_compr;
34 unsigned int compr;
35};
36
32/* A struct for the overall file system control. Pointers to 37/* A struct for the overall file system control. Pointers to
33 jffs2_sb_info structs are named `c' in the source code. 38 jffs2_sb_info structs are named `c' in the source code.
34 Nee jffs_control 39 Nee jffs_control
@@ -126,6 +131,7 @@ struct jffs2_sb_info {
126#endif 131#endif
127 132
128 struct jffs2_summary *summary; /* Summary information */ 133 struct jffs2_summary *summary; /* Summary information */
134 struct jffs2_mount_opts mount_opts;
129 135
130#ifdef CONFIG_JFFS2_FS_XATTR 136#ifdef CONFIG_JFFS2_FS_XATTR
131#define XATTRINDEX_HASHSIZE (57) 137#define XATTRINDEX_HASHSIZE (57)
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 6c1755c59c0..ab65ee3ec85 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -176,7 +176,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags);
176struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, 176struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode,
177 struct jffs2_raw_inode *ri); 177 struct jffs2_raw_inode *ri);
178int jffs2_statfs (struct dentry *, struct kstatfs *); 178int jffs2_statfs (struct dentry *, struct kstatfs *);
179int jffs2_remount_fs (struct super_block *, int *, char *); 179int jffs2_do_remount_fs(struct super_block *, int *, char *);
180int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); 180int jffs2_do_fill_super(struct super_block *sb, void *data, int silent);
181void jffs2_gc_release_inode(struct jffs2_sb_info *c, 181void jffs2_gc_release_inode(struct jffs2_sb_info *c,
182 struct jffs2_inode_info *f); 182 struct jffs2_inode_info *f);
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 8d8cd3419d0..28107ca136e 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -275,9 +275,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
275 else 275 else
276 c->mtd->unpoint(c->mtd, 0, c->mtd->size); 276 c->mtd->unpoint(c->mtd, 0, c->mtd->size);
277#endif 277#endif
278 if (s) 278 kfree(s);
279 kfree(s);
280
281 return ret; 279 return ret;
282} 280}
283 281
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 853b8e30008..e7e97445411 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -17,11 +17,13 @@
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/err.h> 18#include <linux/err.h>
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/parser.h>
20#include <linux/jffs2.h> 21#include <linux/jffs2.h>
21#include <linux/pagemap.h> 22#include <linux/pagemap.h>
22#include <linux/mtd/super.h> 23#include <linux/mtd/super.h>
23#include <linux/ctype.h> 24#include <linux/ctype.h>
24#include <linux/namei.h> 25#include <linux/namei.h>
26#include <linux/seq_file.h>
25#include <linux/exportfs.h> 27#include <linux/exportfs.h>
26#include "compr.h" 28#include "compr.h"
27#include "nodelist.h" 29#include "nodelist.h"
@@ -75,6 +77,37 @@ static void jffs2_write_super(struct super_block *sb)
75 unlock_super(sb); 77 unlock_super(sb);
76} 78}
77 79
80static const char *jffs2_compr_name(unsigned int compr)
81{
82 switch (compr) {
83 case JFFS2_COMPR_MODE_NONE:
84 return "none";
85#ifdef CONFIG_JFFS2_LZO
86 case JFFS2_COMPR_MODE_FORCELZO:
87 return "lzo";
88#endif
89#ifdef CONFIG_JFFS2_ZLIB
90 case JFFS2_COMPR_MODE_FORCEZLIB:
91 return "zlib";
92#endif
93 default:
94 /* should never happen; programmer error */
95 WARN_ON(1);
96 return "";
97 }
98}
99
100static int jffs2_show_options(struct seq_file *s, struct vfsmount *mnt)
101{
102 struct jffs2_sb_info *c = JFFS2_SB_INFO(mnt->mnt_sb);
103 struct jffs2_mount_opts *opts = &c->mount_opts;
104
105 if (opts->override_compr)
106 seq_printf(s, ",compr=%s", jffs2_compr_name(opts->compr));
107
108 return 0;
109}
110
78static int jffs2_sync_fs(struct super_block *sb, int wait) 111static int jffs2_sync_fs(struct super_block *sb, int wait)
79{ 112{
80 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); 113 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
@@ -133,6 +166,85 @@ static const struct export_operations jffs2_export_ops = {
133 .fh_to_parent = jffs2_fh_to_parent, 166 .fh_to_parent = jffs2_fh_to_parent,
134}; 167};
135 168
169/*
170 * JFFS2 mount options.
171 *
172 * Opt_override_compr: override default compressor
173 * Opt_err: just end of array marker
174 */
175enum {
176 Opt_override_compr,
177 Opt_err,
178};
179
180static const match_table_t tokens = {
181 {Opt_override_compr, "compr=%s"},
182 {Opt_err, NULL},
183};
184
185static int jffs2_parse_options(struct jffs2_sb_info *c, char *data)
186{
187 substring_t args[MAX_OPT_ARGS];
188 char *p, *name;
189
190 if (!data)
191 return 0;
192
193 while ((p = strsep(&data, ","))) {
194 int token;
195
196 if (!*p)
197 continue;
198
199 token = match_token(p, tokens, args);
200 switch (token) {
201 case Opt_override_compr:
202 name = match_strdup(&args[0]);
203
204 if (!name)
205 return -ENOMEM;
206 if (!strcmp(name, "none"))
207 c->mount_opts.compr = JFFS2_COMPR_MODE_NONE;
208#ifdef CONFIG_JFFS2_LZO
209 else if (!strcmp(name, "lzo"))
210 c->mount_opts.compr = JFFS2_COMPR_MODE_FORCELZO;
211#endif
212#ifdef CONFIG_JFFS2_ZLIB
213 else if (!strcmp(name, "zlib"))
214 c->mount_opts.compr =
215 JFFS2_COMPR_MODE_FORCEZLIB;
216#endif
217 else {
218 printk(KERN_ERR "JFFS2 Error: unknown compressor \"%s\"",
219 name);
220 kfree(name);
221 return -EINVAL;
222 }
223 kfree(name);
224 c->mount_opts.override_compr = true;
225 break;
226 default:
227 printk(KERN_ERR "JFFS2 Error: unrecognized mount option '%s' or missing value\n",
228 p);
229 return -EINVAL;
230 }
231 }
232
233 return 0;
234}
235
236static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data)
237{
238 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
239 int err;
240
241 err = jffs2_parse_options(c, data);
242 if (err)
243 return -EINVAL;
244
245 return jffs2_do_remount_fs(sb, flags, data);
246}
247
136static const struct super_operations jffs2_super_operations = 248static const struct super_operations jffs2_super_operations =
137{ 249{
138 .alloc_inode = jffs2_alloc_inode, 250 .alloc_inode = jffs2_alloc_inode,
@@ -143,6 +255,7 @@ static const struct super_operations jffs2_super_operations =
143 .remount_fs = jffs2_remount_fs, 255 .remount_fs = jffs2_remount_fs,
144 .evict_inode = jffs2_evict_inode, 256 .evict_inode = jffs2_evict_inode,
145 .dirty_inode = jffs2_dirty_inode, 257 .dirty_inode = jffs2_dirty_inode,
258 .show_options = jffs2_show_options,
146 .sync_fs = jffs2_sync_fs, 259 .sync_fs = jffs2_sync_fs,
147}; 260};
148 261
@@ -166,6 +279,12 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
166 c->os_priv = sb; 279 c->os_priv = sb;
167 sb->s_fs_info = c; 280 sb->s_fs_info = c;
168 281
282 ret = jffs2_parse_options(c, data);
283 if (ret) {
284 kfree(c);
285 return -EINVAL;
286 }
287
169 /* Initialize JFFS2 superblock locks, the further initialization will 288 /* Initialize JFFS2 superblock locks, the further initialization will
170 * be done later */ 289 * be done later */
171 mutex_init(&c->alloc_sem); 290 mutex_init(&c->alloc_sem);
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 4515bea0268..b09e51d2f81 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -578,8 +578,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
578 if (!jffs2_is_writebuffered(c)) 578 if (!jffs2_is_writebuffered(c))
579 return 0; 579 return 0;
580 580
581 if (mutex_trylock(&c->alloc_sem)) { 581 if (!mutex_is_locked(&c->alloc_sem)) {
582 mutex_unlock(&c->alloc_sem);
583 printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n"); 582 printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n");
584 BUG(); 583 BUG();
585 } 584 }
@@ -1026,7 +1025,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
1026 int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); 1025 int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
1027 struct mtd_oob_ops ops; 1026 struct mtd_oob_ops ops;
1028 1027
1029 ops.mode = MTD_OOB_AUTO; 1028 ops.mode = MTD_OPS_AUTO_OOB;
1030 ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail; 1029 ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail;
1031 ops.oobbuf = c->oobbuf; 1030 ops.oobbuf = c->oobbuf;
1032 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; 1031 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
@@ -1069,7 +1068,7 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
1069 struct mtd_oob_ops ops; 1068 struct mtd_oob_ops ops;
1070 int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); 1069 int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
1071 1070
1072 ops.mode = MTD_OOB_AUTO; 1071 ops.mode = MTD_OPS_AUTO_OOB;
1073 ops.ooblen = cmlen; 1072 ops.ooblen = cmlen;
1074 ops.oobbuf = c->oobbuf; 1073 ops.oobbuf = c->oobbuf;
1075 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; 1074 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
@@ -1095,7 +1094,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c,
1095 struct mtd_oob_ops ops; 1094 struct mtd_oob_ops ops;
1096 int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); 1095 int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
1097 1096
1098 ops.mode = MTD_OOB_AUTO; 1097 ops.mode = MTD_OPS_AUTO_OOB;
1099 ops.ooblen = cmlen; 1098 ops.ooblen = cmlen;
1100 ops.oobbuf = (uint8_t *)&oob_cleanmarker; 1099 ops.oobbuf = (uint8_t *)&oob_cleanmarker;
1101 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; 1100 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index b78b2f978f0..1b6f15f191b 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -457,7 +457,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
457 /* read the page of fixed disk inode (AIT) in raw mode */ 457 /* read the page of fixed disk inode (AIT) in raw mode */
458 mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1); 458 mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1);
459 if (mp == NULL) { 459 if (mp == NULL) {
460 ip->i_nlink = 1; /* Don't want iput() deleting it */ 460 set_nlink(ip, 1); /* Don't want iput() deleting it */
461 iput(ip); 461 iput(ip);
462 return (NULL); 462 return (NULL);
463 } 463 }
@@ -469,7 +469,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
469 /* copy on-disk inode to in-memory inode */ 469 /* copy on-disk inode to in-memory inode */
470 if ((copy_from_dinode(dp, ip)) != 0) { 470 if ((copy_from_dinode(dp, ip)) != 0) {
471 /* handle bad return by returning NULL for ip */ 471 /* handle bad return by returning NULL for ip */
472 ip->i_nlink = 1; /* Don't want iput() deleting it */ 472 set_nlink(ip, 1); /* Don't want iput() deleting it */
473 iput(ip); 473 iput(ip);
474 /* release the page */ 474 /* release the page */
475 release_metapage(mp); 475 release_metapage(mp);
@@ -3076,7 +3076,7 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip)
3076 ip->i_mode |= 0001; 3076 ip->i_mode |= 0001;
3077 } 3077 }
3078 } 3078 }
3079 ip->i_nlink = le32_to_cpu(dip->di_nlink); 3079 set_nlink(ip, le32_to_cpu(dip->di_nlink));
3080 3080
3081 jfs_ip->saved_uid = le32_to_cpu(dip->di_uid); 3081 jfs_ip->saved_uid = le32_to_cpu(dip->di_uid);
3082 if (sbi->uid == -1) 3082 if (sbi->uid == -1)
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 2686531e235..c1a3e603279 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -157,7 +157,7 @@ fail_drop:
157 dquot_drop(inode); 157 dquot_drop(inode);
158 inode->i_flags |= S_NOQUOTA; 158 inode->i_flags |= S_NOQUOTA;
159fail_unlock: 159fail_unlock:
160 inode->i_nlink = 0; 160 clear_nlink(inode);
161 unlock_new_inode(inode); 161 unlock_new_inode(inode);
162fail_put: 162fail_put:
163 iput(inode); 163 iput(inode);
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 583636f745e..cc5f811ed38 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -67,6 +67,7 @@
67#include <linux/buffer_head.h> /* for sync_blockdev() */ 67#include <linux/buffer_head.h> /* for sync_blockdev() */
68#include <linux/bio.h> 68#include <linux/bio.h>
69#include <linux/freezer.h> 69#include <linux/freezer.h>
70#include <linux/export.h>
70#include <linux/delay.h> 71#include <linux/delay.h>
71#include <linux/mutex.h> 72#include <linux/mutex.h>
72#include <linux/seq_file.h> 73#include <linux/seq_file.h>
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index e17545e1566..a112ad96e47 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -172,7 +172,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
172 mutex_unlock(&JFS_IP(dip)->commit_mutex); 172 mutex_unlock(&JFS_IP(dip)->commit_mutex);
173 if (rc) { 173 if (rc) {
174 free_ea_wmap(ip); 174 free_ea_wmap(ip);
175 ip->i_nlink = 0; 175 clear_nlink(ip);
176 unlock_new_inode(ip); 176 unlock_new_inode(ip);
177 iput(ip); 177 iput(ip);
178 } else { 178 } else {
@@ -292,7 +292,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
292 goto out3; 292 goto out3;
293 } 293 }
294 294
295 ip->i_nlink = 2; /* for '.' */ 295 set_nlink(ip, 2); /* for '.' */
296 ip->i_op = &jfs_dir_inode_operations; 296 ip->i_op = &jfs_dir_inode_operations;
297 ip->i_fop = &jfs_dir_operations; 297 ip->i_fop = &jfs_dir_operations;
298 298
@@ -311,7 +311,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
311 mutex_unlock(&JFS_IP(dip)->commit_mutex); 311 mutex_unlock(&JFS_IP(dip)->commit_mutex);
312 if (rc) { 312 if (rc) {
313 free_ea_wmap(ip); 313 free_ea_wmap(ip);
314 ip->i_nlink = 0; 314 clear_nlink(ip);
315 unlock_new_inode(ip); 315 unlock_new_inode(ip);
316 iput(ip); 316 iput(ip);
317 } else { 317 } else {
@@ -844,7 +844,7 @@ static int jfs_link(struct dentry *old_dentry,
844 rc = txCommit(tid, 2, &iplist[0], 0); 844 rc = txCommit(tid, 2, &iplist[0], 0);
845 845
846 if (rc) { 846 if (rc) {
847 ip->i_nlink--; /* never instantiated */ 847 drop_nlink(ip); /* never instantiated */
848 iput(ip); 848 iput(ip);
849 } else 849 } else
850 d_instantiate(dentry, ip); 850 d_instantiate(dentry, ip);
@@ -1048,7 +1048,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
1048 mutex_unlock(&JFS_IP(dip)->commit_mutex); 1048 mutex_unlock(&JFS_IP(dip)->commit_mutex);
1049 if (rc) { 1049 if (rc) {
1050 free_ea_wmap(ip); 1050 free_ea_wmap(ip);
1051 ip->i_nlink = 0; 1051 clear_nlink(ip);
1052 unlock_new_inode(ip); 1052 unlock_new_inode(ip);
1053 iput(ip); 1053 iput(ip);
1054 } else { 1054 } else {
@@ -1433,7 +1433,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1433 mutex_unlock(&JFS_IP(dir)->commit_mutex); 1433 mutex_unlock(&JFS_IP(dir)->commit_mutex);
1434 if (rc) { 1434 if (rc) {
1435 free_ea_wmap(ip); 1435 free_ea_wmap(ip);
1436 ip->i_nlink = 0; 1436 clear_nlink(ip);
1437 unlock_new_inode(ip); 1437 unlock_new_inode(ip);
1438 iput(ip); 1438 iput(ip);
1439 } else { 1439 } else {
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 06c8a67cbe7..a44eff076c1 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -485,7 +485,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
485 goto out_unload; 485 goto out_unload;
486 } 486 }
487 inode->i_ino = 0; 487 inode->i_ino = 0;
488 inode->i_nlink = 1;
489 inode->i_size = sb->s_bdev->bd_inode->i_size; 488 inode->i_size = sb->s_bdev->bd_inode->i_size;
490 inode->i_mapping->a_ops = &jfs_metapage_aops; 489 inode->i_mapping->a_ops = &jfs_metapage_aops;
491 insert_inode_hash(inode); 490 insert_inode_hash(inode);
diff --git a/fs/libfs.c b/fs/libfs.c
index c18e9a1235b..f6d411eef1e 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -490,7 +490,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
490 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 490 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
491 inode->i_op = &simple_dir_inode_operations; 491 inode->i_op = &simple_dir_inode_operations;
492 inode->i_fop = &simple_dir_operations; 492 inode->i_fop = &simple_dir_operations;
493 inode->i_nlink = 2; 493 set_nlink(inode, 2);
494 root = d_alloc_root(inode); 494 root = d_alloc_root(inode);
495 if (!root) { 495 if (!root) {
496 iput(inode); 496 iput(inode);
@@ -510,8 +510,10 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
510 if (!dentry) 510 if (!dentry)
511 goto out; 511 goto out;
512 inode = new_inode(s); 512 inode = new_inode(s);
513 if (!inode) 513 if (!inode) {
514 dput(dentry);
514 goto out; 515 goto out;
516 }
515 inode->i_mode = S_IFREG | files->mode; 517 inode->i_mode = S_IFREG | files->mode;
516 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 518 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
517 inode->i_fop = files->ops; 519 inode->i_fop = files->ops;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index b7c99bfb3da..6f29836ec0c 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -316,14 +316,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
316 struct hlist_node *pos; 316 struct hlist_node *pos;
317 struct nlm_host *host = NULL; 317 struct nlm_host *host = NULL;
318 struct nsm_handle *nsm = NULL; 318 struct nsm_handle *nsm = NULL;
319 struct sockaddr_in sin = { 319 struct sockaddr *src_sap = svc_daddr(rqstp);
320 .sin_family = AF_INET, 320 size_t src_len = rqstp->rq_daddrlen;
321 };
322 struct sockaddr_in6 sin6 = {
323 .sin6_family = AF_INET6,
324 };
325 struct sockaddr *src_sap;
326 size_t src_len = rqstp->rq_addrlen;
327 struct nlm_lookup_host_info ni = { 321 struct nlm_lookup_host_info ni = {
328 .server = 1, 322 .server = 1,
329 .sap = svc_addr(rqstp), 323 .sap = svc_addr(rqstp),
@@ -340,21 +334,6 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
340 334
341 mutex_lock(&nlm_host_mutex); 335 mutex_lock(&nlm_host_mutex);
342 336
343 switch (ni.sap->sa_family) {
344 case AF_INET:
345 sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
346 src_sap = (struct sockaddr *)&sin;
347 break;
348 case AF_INET6:
349 ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
350 src_sap = (struct sockaddr *)&sin6;
351 break;
352 default:
353 dprintk("lockd: %s failed; unrecognized address family\n",
354 __func__);
355 goto out;
356 }
357
358 if (time_after_eq(jiffies, next_gc)) 337 if (time_after_eq(jiffies, next_gc))
359 nlm_gc_hosts(); 338 nlm_gc_hosts();
360 339
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index abfff9d7979..c061b9aa7dd 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -282,7 +282,7 @@ int lockd_up(void)
282 /* 282 /*
283 * Create the kernel thread and wait for it to start. 283 * Create the kernel thread and wait for it to start.
284 */ 284 */
285 nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); 285 nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
286 if (IS_ERR(nlmsvc_rqst)) { 286 if (IS_ERR(nlmsvc_rqst)) {
287 error = PTR_ERR(nlmsvc_rqst); 287 error = PTR_ERR(nlmsvc_rqst);
288 nlmsvc_rqst = NULL; 288 nlmsvc_rqst = NULL;
diff --git a/fs/locks.c b/fs/locks.c
index 96b33989147..3b0d05dcd7c 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -133,6 +133,20 @@
133#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) 133#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
134#define IS_LEASE(fl) (fl->fl_flags & FL_LEASE) 134#define IS_LEASE(fl) (fl->fl_flags & FL_LEASE)
135 135
136static bool lease_breaking(struct file_lock *fl)
137{
138 return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING);
139}
140
141static int target_leasetype(struct file_lock *fl)
142{
143 if (fl->fl_flags & FL_UNLOCK_PENDING)
144 return F_UNLCK;
145 if (fl->fl_flags & FL_DOWNGRADE_PENDING)
146 return F_RDLCK;
147 return fl->fl_type;
148}
149
136int leases_enable = 1; 150int leases_enable = 1;
137int lease_break_time = 45; 151int lease_break_time = 45;
138 152
@@ -1119,6 +1133,17 @@ int locks_mandatory_area(int read_write, struct inode *inode,
1119 1133
1120EXPORT_SYMBOL(locks_mandatory_area); 1134EXPORT_SYMBOL(locks_mandatory_area);
1121 1135
1136static void lease_clear_pending(struct file_lock *fl, int arg)
1137{
1138 switch (arg) {
1139 case F_UNLCK:
1140 fl->fl_flags &= ~FL_UNLOCK_PENDING;
1141 /* fall through: */
1142 case F_RDLCK:
1143 fl->fl_flags &= ~FL_DOWNGRADE_PENDING;
1144 }
1145}
1146
1122/* We already had a lease on this file; just change its type */ 1147/* We already had a lease on this file; just change its type */
1123int lease_modify(struct file_lock **before, int arg) 1148int lease_modify(struct file_lock **before, int arg)
1124{ 1149{
@@ -1127,6 +1152,7 @@ int lease_modify(struct file_lock **before, int arg)
1127 1152
1128 if (error) 1153 if (error)
1129 return error; 1154 return error;
1155 lease_clear_pending(fl, arg);
1130 locks_wake_up_blocks(fl); 1156 locks_wake_up_blocks(fl);
1131 if (arg == F_UNLCK) 1157 if (arg == F_UNLCK)
1132 locks_delete_lock(before); 1158 locks_delete_lock(before);
@@ -1135,19 +1161,25 @@ int lease_modify(struct file_lock **before, int arg)
1135 1161
1136EXPORT_SYMBOL(lease_modify); 1162EXPORT_SYMBOL(lease_modify);
1137 1163
1164static bool past_time(unsigned long then)
1165{
1166 if (!then)
1167 /* 0 is a special value meaning "this never expires": */
1168 return false;
1169 return time_after(jiffies, then);
1170}
1171
1138static void time_out_leases(struct inode *inode) 1172static void time_out_leases(struct inode *inode)
1139{ 1173{
1140 struct file_lock **before; 1174 struct file_lock **before;
1141 struct file_lock *fl; 1175 struct file_lock *fl;
1142 1176
1143 before = &inode->i_flock; 1177 before = &inode->i_flock;
1144 while ((fl = *before) && IS_LEASE(fl) && (fl->fl_type & F_INPROGRESS)) { 1178 while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) {
1145 if ((fl->fl_break_time == 0) 1179 if (past_time(fl->fl_downgrade_time))
1146 || time_before(jiffies, fl->fl_break_time)) { 1180 lease_modify(before, F_RDLCK);
1147 before = &fl->fl_next; 1181 if (past_time(fl->fl_break_time))
1148 continue; 1182 lease_modify(before, F_UNLCK);
1149 }
1150 lease_modify(before, fl->fl_type & ~F_INPROGRESS);
1151 if (fl == *before) /* lease_modify may have freed fl */ 1183 if (fl == *before) /* lease_modify may have freed fl */
1152 before = &fl->fl_next; 1184 before = &fl->fl_next;
1153 } 1185 }
@@ -1165,7 +1197,7 @@ static void time_out_leases(struct inode *inode)
1165 */ 1197 */
1166int __break_lease(struct inode *inode, unsigned int mode) 1198int __break_lease(struct inode *inode, unsigned int mode)
1167{ 1199{
1168 int error = 0, future; 1200 int error = 0;
1169 struct file_lock *new_fl, *flock; 1201 struct file_lock *new_fl, *flock;
1170 struct file_lock *fl; 1202 struct file_lock *fl;
1171 unsigned long break_time; 1203 unsigned long break_time;
@@ -1182,24 +1214,13 @@ int __break_lease(struct inode *inode, unsigned int mode)
1182 if ((flock == NULL) || !IS_LEASE(flock)) 1214 if ((flock == NULL) || !IS_LEASE(flock))
1183 goto out; 1215 goto out;
1184 1216
1217 if (!locks_conflict(flock, new_fl))
1218 goto out;
1219
1185 for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) 1220 for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next)
1186 if (fl->fl_owner == current->files) 1221 if (fl->fl_owner == current->files)
1187 i_have_this_lease = 1; 1222 i_have_this_lease = 1;
1188 1223
1189 if (want_write) {
1190 /* If we want write access, we have to revoke any lease. */
1191 future = F_UNLCK | F_INPROGRESS;
1192 } else if (flock->fl_type & F_INPROGRESS) {
1193 /* If the lease is already being broken, we just leave it */
1194 future = flock->fl_type;
1195 } else if (flock->fl_type & F_WRLCK) {
1196 /* Downgrade the exclusive lease to a read-only lease. */
1197 future = F_RDLCK | F_INPROGRESS;
1198 } else {
1199 /* the existing lease was read-only, so we can read too. */
1200 goto out;
1201 }
1202
1203 if (IS_ERR(new_fl) && !i_have_this_lease 1224 if (IS_ERR(new_fl) && !i_have_this_lease
1204 && ((mode & O_NONBLOCK) == 0)) { 1225 && ((mode & O_NONBLOCK) == 0)) {
1205 error = PTR_ERR(new_fl); 1226 error = PTR_ERR(new_fl);
@@ -1214,12 +1235,18 @@ int __break_lease(struct inode *inode, unsigned int mode)
1214 } 1235 }
1215 1236
1216 for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) { 1237 for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) {
1217 if (fl->fl_type != future) { 1238 if (want_write) {
1218 fl->fl_type = future; 1239 if (fl->fl_flags & FL_UNLOCK_PENDING)
1240 continue;
1241 fl->fl_flags |= FL_UNLOCK_PENDING;
1219 fl->fl_break_time = break_time; 1242 fl->fl_break_time = break_time;
1220 /* lease must have lmops break callback */ 1243 } else {
1221 fl->fl_lmops->lm_break(fl); 1244 if (lease_breaking(flock))
1245 continue;
1246 fl->fl_flags |= FL_DOWNGRADE_PENDING;
1247 fl->fl_downgrade_time = break_time;
1222 } 1248 }
1249 fl->fl_lmops->lm_break(fl);
1223 } 1250 }
1224 1251
1225 if (i_have_this_lease || (mode & O_NONBLOCK)) { 1252 if (i_have_this_lease || (mode & O_NONBLOCK)) {
@@ -1243,10 +1270,13 @@ restart:
1243 if (error >= 0) { 1270 if (error >= 0) {
1244 if (error == 0) 1271 if (error == 0)
1245 time_out_leases(inode); 1272 time_out_leases(inode);
1246 /* Wait for the next lease that has not been broken yet */ 1273 /*
1274 * Wait for the next conflicting lease that has not been
1275 * broken yet
1276 */
1247 for (flock = inode->i_flock; flock && IS_LEASE(flock); 1277 for (flock = inode->i_flock; flock && IS_LEASE(flock);
1248 flock = flock->fl_next) { 1278 flock = flock->fl_next) {
1249 if (flock->fl_type & F_INPROGRESS) 1279 if (locks_conflict(new_fl, flock))
1250 goto restart; 1280 goto restart;
1251 } 1281 }
1252 error = 0; 1282 error = 0;
@@ -1314,7 +1344,7 @@ int fcntl_getlease(struct file *filp)
1314 for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl); 1344 for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl);
1315 fl = fl->fl_next) { 1345 fl = fl->fl_next) {
1316 if (fl->fl_file == filp) { 1346 if (fl->fl_file == filp) {
1317 type = fl->fl_type & ~F_INPROGRESS; 1347 type = target_leasetype(fl);
1318 break; 1348 break;
1319 } 1349 }
1320 } 1350 }
@@ -1322,50 +1352,23 @@ int fcntl_getlease(struct file *filp)
1322 return type; 1352 return type;
1323} 1353}
1324 1354
1325/** 1355int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
1326 * generic_setlease - sets a lease on an open file
1327 * @filp: file pointer
1328 * @arg: type of lease to obtain
1329 * @flp: input - file_lock to use, output - file_lock inserted
1330 *
1331 * The (input) flp->fl_lmops->lm_break function is required
1332 * by break_lease().
1333 *
1334 * Called with file_lock_lock held.
1335 */
1336int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1337{ 1356{
1338 struct file_lock *fl, **before, **my_before = NULL, *lease; 1357 struct file_lock *fl, **before, **my_before = NULL, *lease;
1339 struct dentry *dentry = filp->f_path.dentry; 1358 struct dentry *dentry = filp->f_path.dentry;
1340 struct inode *inode = dentry->d_inode; 1359 struct inode *inode = dentry->d_inode;
1341 int error, rdlease_count = 0, wrlease_count = 0; 1360 int error;
1342 1361
1343 lease = *flp; 1362 lease = *flp;
1344 1363
1345 error = -EACCES; 1364 error = -EAGAIN;
1346 if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) 1365 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
1347 goto out;
1348 error = -EINVAL;
1349 if (!S_ISREG(inode->i_mode))
1350 goto out; 1366 goto out;
1351 error = security_file_lock(filp, arg); 1367 if ((arg == F_WRLCK)
1352 if (error) 1368 && ((dentry->d_count > 1)
1369 || (atomic_read(&inode->i_count) > 1)))
1353 goto out; 1370 goto out;
1354 1371
1355 time_out_leases(inode);
1356
1357 BUG_ON(!(*flp)->fl_lmops->lm_break);
1358
1359 if (arg != F_UNLCK) {
1360 error = -EAGAIN;
1361 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
1362 goto out;
1363 if ((arg == F_WRLCK)
1364 && ((dentry->d_count > 1)
1365 || (atomic_read(&inode->i_count) > 1)))
1366 goto out;
1367 }
1368
1369 /* 1372 /*
1370 * At this point, we know that if there is an exclusive 1373 * At this point, we know that if there is an exclusive
1371 * lease on this file, then we hold it on this filp 1374 * lease on this file, then we hold it on this filp
@@ -1374,27 +1377,28 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1374 * then the file is not open by anyone (including us) 1377 * then the file is not open by anyone (including us)
1375 * except for this filp. 1378 * except for this filp.
1376 */ 1379 */
1380 error = -EAGAIN;
1377 for (before = &inode->i_flock; 1381 for (before = &inode->i_flock;
1378 ((fl = *before) != NULL) && IS_LEASE(fl); 1382 ((fl = *before) != NULL) && IS_LEASE(fl);
1379 before = &fl->fl_next) { 1383 before = &fl->fl_next) {
1380 if (fl->fl_file == filp) 1384 if (fl->fl_file == filp) {
1381 my_before = before; 1385 my_before = before;
1382 else if (fl->fl_type == (F_INPROGRESS | F_UNLCK)) 1386 continue;
1383 /* 1387 }
1384 * Someone is in the process of opening this 1388 /*
1385 * file for writing so we may not take an 1389 * No exclusive leases if someone else has a lease on
1386 * exclusive lease on it. 1390 * this file:
1387 */ 1391 */
1388 wrlease_count++; 1392 if (arg == F_WRLCK)
1389 else 1393 goto out;
1390 rdlease_count++; 1394 /*
1395 * Modifying our existing lease is OK, but no getting a
1396 * new lease if someone else is opening for write:
1397 */
1398 if (fl->fl_flags & FL_UNLOCK_PENDING)
1399 goto out;
1391 } 1400 }
1392 1401
1393 error = -EAGAIN;
1394 if ((arg == F_RDLCK && (wrlease_count > 0)) ||
1395 (arg == F_WRLCK && ((rdlease_count + wrlease_count) > 0)))
1396 goto out;
1397
1398 if (my_before != NULL) { 1402 if (my_before != NULL) {
1399 error = lease->fl_lmops->lm_change(my_before, arg); 1403 error = lease->fl_lmops->lm_change(my_before, arg);
1400 if (!error) 1404 if (!error)
@@ -1402,9 +1406,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1402 goto out; 1406 goto out;
1403 } 1407 }
1404 1408
1405 if (arg == F_UNLCK)
1406 goto out;
1407
1408 error = -EINVAL; 1409 error = -EINVAL;
1409 if (!leases_enable) 1410 if (!leases_enable)
1410 goto out; 1411 goto out;
@@ -1415,6 +1416,62 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1415out: 1416out:
1416 return error; 1417 return error;
1417} 1418}
1419
1420int generic_delete_lease(struct file *filp, struct file_lock **flp)
1421{
1422 struct file_lock *fl, **before;
1423 struct dentry *dentry = filp->f_path.dentry;
1424 struct inode *inode = dentry->d_inode;
1425
1426 for (before = &inode->i_flock;
1427 ((fl = *before) != NULL) && IS_LEASE(fl);
1428 before = &fl->fl_next) {
1429 if (fl->fl_file != filp)
1430 continue;
1431 return (*flp)->fl_lmops->lm_change(before, F_UNLCK);
1432 }
1433 return -EAGAIN;
1434}
1435
1436/**
1437 * generic_setlease - sets a lease on an open file
1438 * @filp: file pointer
1439 * @arg: type of lease to obtain
1440 * @flp: input - file_lock to use, output - file_lock inserted
1441 *
1442 * The (input) flp->fl_lmops->lm_break function is required
1443 * by break_lease().
1444 *
1445 * Called with file_lock_lock held.
1446 */
1447int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1448{
1449 struct dentry *dentry = filp->f_path.dentry;
1450 struct inode *inode = dentry->d_inode;
1451 int error;
1452
1453 if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE))
1454 return -EACCES;
1455 if (!S_ISREG(inode->i_mode))
1456 return -EINVAL;
1457 error = security_file_lock(filp, arg);
1458 if (error)
1459 return error;
1460
1461 time_out_leases(inode);
1462
1463 BUG_ON(!(*flp)->fl_lmops->lm_break);
1464
1465 switch (arg) {
1466 case F_UNLCK:
1467 return generic_delete_lease(filp, flp);
1468 case F_RDLCK:
1469 case F_WRLCK:
1470 return generic_add_lease(filp, arg, flp);
1471 default:
1472 BUG();
1473 }
1474}
1418EXPORT_SYMBOL(generic_setlease); 1475EXPORT_SYMBOL(generic_setlease);
1419 1476
1420static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease) 1477static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
@@ -2126,7 +2183,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
2126 } 2183 }
2127 } else if (IS_LEASE(fl)) { 2184 } else if (IS_LEASE(fl)) {
2128 seq_printf(f, "LEASE "); 2185 seq_printf(f, "LEASE ");
2129 if (fl->fl_type & F_INPROGRESS) 2186 if (lease_breaking(fl))
2130 seq_printf(f, "BREAKING "); 2187 seq_printf(f, "BREAKING ");
2131 else if (fl->fl_file) 2188 else if (fl->fl_file)
2132 seq_printf(f, "ACTIVE "); 2189 seq_printf(f, "ACTIVE ");
@@ -2142,7 +2199,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
2142 : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE "); 2199 : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE ");
2143 } else { 2200 } else {
2144 seq_printf(f, "%s ", 2201 seq_printf(f, "%s ",
2145 (fl->fl_type & F_INPROGRESS) 2202 (lease_breaking(fl))
2146 ? (fl->fl_type & F_UNLCK) ? "UNLCK" : "READ " 2203 ? (fl->fl_type & F_UNLCK) ? "UNLCK" : "READ "
2147 : (fl->fl_type & F_WRLCK) ? "WRITE" : "READ "); 2204 : (fl->fl_type & F_WRLCK) ? "WRITE" : "READ ");
2148 } 2205 }
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index b3ff3d89416..b7d7f67cee5 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -197,7 +197,7 @@ static int logfs_remove_inode(struct inode *inode)
197{ 197{
198 int ret; 198 int ret;
199 199
200 inode->i_nlink--; 200 drop_nlink(inode);
201 ret = write_inode(inode); 201 ret = write_inode(inode);
202 LOGFS_BUG_ON(ret, inode->i_sb); 202 LOGFS_BUG_ON(ret, inode->i_sb);
203 return ret; 203 return ret;
@@ -433,7 +433,7 @@ static int __logfs_create(struct inode *dir, struct dentry *dentry,
433 433
434 ta = kzalloc(sizeof(*ta), GFP_KERNEL); 434 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
435 if (!ta) { 435 if (!ta) {
436 inode->i_nlink--; 436 drop_nlink(inode);
437 iput(inode); 437 iput(inode);
438 return -ENOMEM; 438 return -ENOMEM;
439 } 439 }
@@ -456,7 +456,7 @@ static int __logfs_create(struct inode *dir, struct dentry *dentry,
456 abort_transaction(inode, ta); 456 abort_transaction(inode, ta);
457 li->li_flags |= LOGFS_IF_STILLBORN; 457 li->li_flags |= LOGFS_IF_STILLBORN;
458 /* FIXME: truncate symlink */ 458 /* FIXME: truncate symlink */
459 inode->i_nlink--; 459 drop_nlink(inode);
460 iput(inode); 460 iput(inode);
461 goto out; 461 goto out;
462 } 462 }
@@ -563,7 +563,7 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir,
563 563
564 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 564 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
565 ihold(inode); 565 ihold(inode);
566 inode->i_nlink++; 566 inc_nlink(inode);
567 mark_inode_dirty_sync(inode); 567 mark_inode_dirty_sync(inode);
568 568
569 return __logfs_create(dir, dentry, inode, NULL, 0); 569 return __logfs_create(dir, dentry, inode, NULL, 0);
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index edfea7a3a74..7e441ad5f79 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -93,7 +93,7 @@ static struct inode *__logfs_iget(struct super_block *sb, ino_t ino)
93 /* inode->i_nlink == 0 can be true when called from 93 /* inode->i_nlink == 0 can be true when called from
94 * block validator */ 94 * block validator */
95 /* set i_nlink to 0 to prevent caching */ 95 /* set i_nlink to 0 to prevent caching */
96 inode->i_nlink = 0; 96 clear_nlink(inode);
97 logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE; 97 logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE;
98 iget_failed(inode); 98 iget_failed(inode);
99 if (!err) 99 if (!err)
@@ -199,7 +199,6 @@ static void logfs_init_inode(struct super_block *sb, struct inode *inode)
199 inode->i_blocks = 0; 199 inode->i_blocks = 0;
200 inode->i_ctime = CURRENT_TIME; 200 inode->i_ctime = CURRENT_TIME;
201 inode->i_mtime = CURRENT_TIME; 201 inode->i_mtime = CURRENT_TIME;
202 inode->i_nlink = 1;
203 li->li_refcount = 1; 202 li->li_refcount = 1;
204 INIT_LIST_HEAD(&li->li_freeing_list); 203 INIT_LIST_HEAD(&li->li_freeing_list);
205 204
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index f22d108bfa5..398ecff6e54 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -618,7 +618,6 @@ static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs,
618struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index); 618struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index);
619void emergency_read_end(struct page *page); 619void emergency_read_end(struct page *page);
620void logfs_crash_dump(struct super_block *sb); 620void logfs_crash_dump(struct super_block *sb);
621void *memchr_inv(const void *s, int c, size_t n);
622int logfs_statfs(struct dentry *dentry, struct kstatfs *stats); 621int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
623int logfs_check_ds(struct logfs_disk_super *ds); 622int logfs_check_ds(struct logfs_disk_super *ds);
624int logfs_write_sb(struct super_block *sb); 623int logfs_write_sb(struct super_block *sb);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index d8d09380c7d..2ac4217b790 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -126,7 +126,7 @@ static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode)
126 inode->i_atime = be64_to_timespec(di->di_atime); 126 inode->i_atime = be64_to_timespec(di->di_atime);
127 inode->i_ctime = be64_to_timespec(di->di_ctime); 127 inode->i_ctime = be64_to_timespec(di->di_ctime);
128 inode->i_mtime = be64_to_timespec(di->di_mtime); 128 inode->i_mtime = be64_to_timespec(di->di_mtime);
129 inode->i_nlink = be32_to_cpu(di->di_refcount); 129 set_nlink(inode, be32_to_cpu(di->di_refcount));
130 inode->i_generation = be32_to_cpu(di->di_generation); 130 inode->i_generation = be32_to_cpu(di->di_generation);
131 131
132 switch (inode->i_mode & S_IFMT) { 132 switch (inode->i_mode & S_IFMT) {
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index ce03a182c77..e795c234ea3 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -13,6 +13,7 @@
13#include <linux/bio.h> 13#include <linux/bio.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/module.h>
16#include <linux/mtd/mtd.h> 17#include <linux/mtd/mtd.h>
17#include <linux/statfs.h> 18#include <linux/statfs.h>
18#include <linux/buffer_head.h> 19#include <linux/buffer_head.h>
@@ -91,28 +92,6 @@ void logfs_crash_dump(struct super_block *sb)
91} 92}
92 93
93/* 94/*
94 * TODO: move to lib/string.c
95 */
96/**
97 * memchr_inv - Find a character in an area of memory.
98 * @s: The memory area
99 * @c: The byte to search for
100 * @n: The size of the area.
101 *
102 * returns the address of the first character other than @c, or %NULL
103 * if the whole buffer contains just @c.
104 */
105void *memchr_inv(const void *s, int c, size_t n)
106{
107 const unsigned char *p = s;
108 while (n-- != 0)
109 if ((unsigned char)c != *p++)
110 return (void *)(p - 1);
111
112 return NULL;
113}
114
115/*
116 * FIXME: There should be a reserve for root, similar to ext2. 95 * FIXME: There should be a reserve for root, similar to ext2.
117 */ 96 */
118int logfs_statfs(struct dentry *dentry, struct kstatfs *stats) 97int logfs_statfs(struct dentry *dentry, struct kstatfs *stats)
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index e7d23e25bf1..64cdcd662ff 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -446,7 +446,7 @@ static struct inode *V1_minix_iget(struct inode *inode)
446 inode->i_mode = raw_inode->i_mode; 446 inode->i_mode = raw_inode->i_mode;
447 inode->i_uid = (uid_t)raw_inode->i_uid; 447 inode->i_uid = (uid_t)raw_inode->i_uid;
448 inode->i_gid = (gid_t)raw_inode->i_gid; 448 inode->i_gid = (gid_t)raw_inode->i_gid;
449 inode->i_nlink = raw_inode->i_nlinks; 449 set_nlink(inode, raw_inode->i_nlinks);
450 inode->i_size = raw_inode->i_size; 450 inode->i_size = raw_inode->i_size;
451 inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time; 451 inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time;
452 inode->i_mtime.tv_nsec = 0; 452 inode->i_mtime.tv_nsec = 0;
@@ -479,7 +479,7 @@ static struct inode *V2_minix_iget(struct inode *inode)
479 inode->i_mode = raw_inode->i_mode; 479 inode->i_mode = raw_inode->i_mode;
480 inode->i_uid = (uid_t)raw_inode->i_uid; 480 inode->i_uid = (uid_t)raw_inode->i_uid;
481 inode->i_gid = (gid_t)raw_inode->i_gid; 481 inode->i_gid = (gid_t)raw_inode->i_gid;
482 inode->i_nlink = raw_inode->i_nlinks; 482 set_nlink(inode, raw_inode->i_nlinks);
483 inode->i_size = raw_inode->i_size; 483 inode->i_size = raw_inode->i_size;
484 inode->i_mtime.tv_sec = raw_inode->i_mtime; 484 inode->i_mtime.tv_sec = raw_inode->i_mtime;
485 inode->i_atime.tv_sec = raw_inode->i_atime; 485 inode->i_atime.tv_sec = raw_inode->i_atime;
diff --git a/fs/namei.c b/fs/namei.c
index 0b3138de2a3..5008f01787f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -137,7 +137,7 @@ static int do_getname(const char __user *filename, char *page)
137 return retval; 137 return retval;
138} 138}
139 139
140static char *getname_flags(const char __user * filename, int flags) 140static char *getname_flags(const char __user *filename, int flags, int *empty)
141{ 141{
142 char *tmp, *result; 142 char *tmp, *result;
143 143
@@ -148,6 +148,8 @@ static char *getname_flags(const char __user * filename, int flags)
148 148
149 result = tmp; 149 result = tmp;
150 if (retval < 0) { 150 if (retval < 0) {
151 if (retval == -ENOENT && empty)
152 *empty = 1;
151 if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { 153 if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
152 __putname(tmp); 154 __putname(tmp);
153 result = ERR_PTR(retval); 155 result = ERR_PTR(retval);
@@ -160,7 +162,7 @@ static char *getname_flags(const char __user * filename, int flags)
160 162
161char *getname(const char __user * filename) 163char *getname(const char __user * filename)
162{ 164{
163 return getname_flags(filename, 0); 165 return getname_flags(filename, 0, 0);
164} 166}
165 167
166#ifdef CONFIG_AUDITSYSCALL 168#ifdef CONFIG_AUDITSYSCALL
@@ -221,14 +223,12 @@ static int check_acl(struct inode *inode, int mask)
221} 223}
222 224
223/* 225/*
224 * This does basic POSIX ACL permission checking 226 * This does the basic permission checking
225 */ 227 */
226static int acl_permission_check(struct inode *inode, int mask) 228static int acl_permission_check(struct inode *inode, int mask)
227{ 229{
228 unsigned int mode = inode->i_mode; 230 unsigned int mode = inode->i_mode;
229 231
230 mask &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK;
231
232 if (current_user_ns() != inode_userns(inode)) 232 if (current_user_ns() != inode_userns(inode))
233 goto other_perms; 233 goto other_perms;
234 234
@@ -257,7 +257,7 @@ other_perms:
257/** 257/**
258 * generic_permission - check for access rights on a Posix-like filesystem 258 * generic_permission - check for access rights on a Posix-like filesystem
259 * @inode: inode to check access rights for 259 * @inode: inode to check access rights for
260 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) 260 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
261 * 261 *
262 * Used to check for read/write/execute permissions on a file. 262 * Used to check for read/write/execute permissions on a file.
263 * We use "fsuid" for this, letting us set arbitrary permissions 263 * We use "fsuid" for this, letting us set arbitrary permissions
@@ -273,7 +273,7 @@ int generic_permission(struct inode *inode, int mask)
273 int ret; 273 int ret;
274 274
275 /* 275 /*
276 * Do the basic POSIX ACL permission checks. 276 * Do the basic permission checks.
277 */ 277 */
278 ret = acl_permission_check(inode, mask); 278 ret = acl_permission_check(inode, mask);
279 if (ret != -EACCES) 279 if (ret != -EACCES)
@@ -331,12 +331,14 @@ static inline int do_inode_permission(struct inode *inode, int mask)
331/** 331/**
332 * inode_permission - check for access rights to a given inode 332 * inode_permission - check for access rights to a given inode
333 * @inode: inode to check permission on 333 * @inode: inode to check permission on
334 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) 334 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
335 * 335 *
336 * Used to check for read/write/execute permissions on an inode. 336 * Used to check for read/write/execute permissions on an inode.
337 * We use "fsuid" for this, letting us set arbitrary permissions 337 * We use "fsuid" for this, letting us set arbitrary permissions
338 * for filesystem access without changing the "normal" uids which 338 * for filesystem access without changing the "normal" uids which
339 * are used for other things. 339 * are used for other things.
340 *
341 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
340 */ 342 */
341int inode_permission(struct inode *inode, int mask) 343int inode_permission(struct inode *inode, int mask)
342{ 344{
@@ -850,7 +852,7 @@ static int follow_managed(struct path *path, unsigned flags)
850 mntput(path->mnt); 852 mntput(path->mnt);
851 if (ret == -EISDIR) 853 if (ret == -EISDIR)
852 ret = 0; 854 ret = 0;
853 return ret; 855 return ret < 0 ? ret : need_mntput;
854} 856}
855 857
856int follow_down_one(struct path *path) 858int follow_down_one(struct path *path)
@@ -898,6 +900,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
898 break; 900 break;
899 path->mnt = mounted; 901 path->mnt = mounted;
900 path->dentry = mounted->mnt_root; 902 path->dentry = mounted->mnt_root;
903 nd->flags |= LOOKUP_JUMPED;
901 nd->seq = read_seqcount_begin(&path->dentry->d_seq); 904 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
902 /* 905 /*
903 * Update the inode too. We don't need to re-check the 906 * Update the inode too. We don't need to re-check the
@@ -1211,6 +1214,8 @@ retry:
1211 path_put_conditional(path, nd); 1214 path_put_conditional(path, nd);
1212 return err; 1215 return err;
1213 } 1216 }
1217 if (err)
1218 nd->flags |= LOOKUP_JUMPED;
1214 *inode = path->dentry->d_inode; 1219 *inode = path->dentry->d_inode;
1215 return 0; 1220 return 0;
1216} 1221}
@@ -1798,11 +1803,11 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1798 return __lookup_hash(&this, base, NULL); 1803 return __lookup_hash(&this, base, NULL);
1799} 1804}
1800 1805
1801int user_path_at(int dfd, const char __user *name, unsigned flags, 1806int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
1802 struct path *path) 1807 struct path *path, int *empty)
1803{ 1808{
1804 struct nameidata nd; 1809 struct nameidata nd;
1805 char *tmp = getname_flags(name, flags); 1810 char *tmp = getname_flags(name, flags, empty);
1806 int err = PTR_ERR(tmp); 1811 int err = PTR_ERR(tmp);
1807 if (!IS_ERR(tmp)) { 1812 if (!IS_ERR(tmp)) {
1808 1813
@@ -1816,6 +1821,12 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
1816 return err; 1821 return err;
1817} 1822}
1818 1823
1824int user_path_at(int dfd, const char __user *name, unsigned flags,
1825 struct path *path)
1826{
1827 return user_path_at_empty(dfd, name, flags, path, 0);
1828}
1829
1819static int user_path_parent(int dfd, const char __user *path, 1830static int user_path_parent(int dfd, const char __user *path,
1820 struct nameidata *nd, char **name) 1831 struct nameidata *nd, char **name)
1821{ 1832{
@@ -2035,10 +2046,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
2035 if (flag & O_NOATIME && !inode_owner_or_capable(inode)) 2046 if (flag & O_NOATIME && !inode_owner_or_capable(inode))
2036 return -EPERM; 2047 return -EPERM;
2037 2048
2038 /* 2049 return 0;
2039 * Ensure there are no outstanding leases on the file.
2040 */
2041 return break_lease(inode, flag);
2042} 2050}
2043 2051
2044static int handle_truncate(struct file *filp) 2052static int handle_truncate(struct file *filp)
@@ -2141,6 +2149,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2141 } 2149 }
2142 2150
2143 /* create side of things */ 2151 /* create side of things */
2152 /*
2153 * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED has been
2154 * cleared when we got to the last component we are about to look up
2155 */
2144 error = complete_walk(nd); 2156 error = complete_walk(nd);
2145 if (error) 2157 if (error)
2146 return ERR_PTR(error); 2158 return ERR_PTR(error);
@@ -2209,6 +2221,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2209 if (error < 0) 2221 if (error < 0)
2210 goto exit_dput; 2222 goto exit_dput;
2211 2223
2224 if (error)
2225 nd->flags |= LOOKUP_JUMPED;
2226
2212 error = -ENOENT; 2227 error = -ENOENT;
2213 if (!path->dentry->d_inode) 2228 if (!path->dentry->d_inode)
2214 goto exit_dput; 2229 goto exit_dput;
@@ -2218,6 +2233,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2218 2233
2219 path_to_nameidata(path, nd); 2234 path_to_nameidata(path, nd);
2220 nd->inode = path->dentry->d_inode; 2235 nd->inode = path->dentry->d_inode;
2236 /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
2237 error = complete_walk(nd);
2238 if (error)
2239 goto exit;
2221 error = -EISDIR; 2240 error = -EISDIR;
2222 if (S_ISDIR(nd->inode->i_mode)) 2241 if (S_ISDIR(nd->inode->i_mode))
2223 goto exit; 2242 goto exit;
diff --git a/fs/namespace.c b/fs/namespace.c
index 9a1ddcda655..10a426c6a70 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1107,6 +1107,7 @@ static int show_vfsstat(struct seq_file *m, void *v)
1107 1107
1108 /* device */ 1108 /* device */
1109 if (mnt->mnt_sb->s_op->show_devname) { 1109 if (mnt->mnt_sb->s_op->show_devname) {
1110 seq_puts(m, "device ");
1110 err = mnt->mnt_sb->s_op->show_devname(m, mnt); 1111 err = mnt->mnt_sb->s_op->show_devname(m, mnt);
1111 } else { 1112 } else {
1112 if (mnt->mnt_devname) { 1113 if (mnt->mnt_devname) {
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 202f370526a..5b5fa33b6b9 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -228,7 +228,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
228 228
229 DDPRINTK("ncp_read_inode: inode->i_mode = %u\n", inode->i_mode); 229 DDPRINTK("ncp_read_inode: inode->i_mode = %u\n", inode->i_mode);
230 230
231 inode->i_nlink = 1; 231 set_nlink(inode, 1);
232 inode->i_uid = server->m.uid; 232 inode->i_uid = server->m.uid;
233 inode->i_gid = server->m.gid; 233 inode->i_gid = server->m.gid;
234 234
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 9561c8fc8bd..281ae95932c 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -176,17 +176,6 @@ retry:
176 return bio; 176 return bio;
177} 177}
178 178
179static void bl_set_lo_fail(struct pnfs_layout_segment *lseg)
180{
181 if (lseg->pls_range.iomode == IOMODE_RW) {
182 dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
183 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
184 } else {
185 dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
186 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
187 }
188}
189
190/* This is basically copied from mpage_end_io_read */ 179/* This is basically copied from mpage_end_io_read */
191static void bl_end_io_read(struct bio *bio, int err) 180static void bl_end_io_read(struct bio *bio, int err)
192{ 181{
@@ -206,7 +195,7 @@ static void bl_end_io_read(struct bio *bio, int err)
206 if (!uptodate) { 195 if (!uptodate) {
207 if (!rdata->pnfs_error) 196 if (!rdata->pnfs_error)
208 rdata->pnfs_error = -EIO; 197 rdata->pnfs_error = -EIO;
209 bl_set_lo_fail(rdata->lseg); 198 pnfs_set_lo_fail(rdata->lseg);
210 } 199 }
211 bio_put(bio); 200 bio_put(bio);
212 put_parallel(par); 201 put_parallel(par);
@@ -303,6 +292,7 @@ bl_read_pagelist(struct nfs_read_data *rdata)
303 bl_end_io_read, par); 292 bl_end_io_read, par);
304 if (IS_ERR(bio)) { 293 if (IS_ERR(bio)) {
305 rdata->pnfs_error = PTR_ERR(bio); 294 rdata->pnfs_error = PTR_ERR(bio);
295 bio = NULL;
306 goto out; 296 goto out;
307 } 297 }
308 } 298 }
@@ -370,7 +360,7 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
370 if (!uptodate) { 360 if (!uptodate) {
371 if (!wdata->pnfs_error) 361 if (!wdata->pnfs_error)
372 wdata->pnfs_error = -EIO; 362 wdata->pnfs_error = -EIO;
373 bl_set_lo_fail(wdata->lseg); 363 pnfs_set_lo_fail(wdata->lseg);
374 } 364 }
375 bio_put(bio); 365 bio_put(bio);
376 put_parallel(par); 366 put_parallel(par);
@@ -386,7 +376,7 @@ static void bl_end_io_write(struct bio *bio, int err)
386 if (!uptodate) { 376 if (!uptodate) {
387 if (!wdata->pnfs_error) 377 if (!wdata->pnfs_error)
388 wdata->pnfs_error = -EIO; 378 wdata->pnfs_error = -EIO;
389 bl_set_lo_fail(wdata->lseg); 379 pnfs_set_lo_fail(wdata->lseg);
390 } 380 }
391 bio_put(bio); 381 bio_put(bio);
392 put_parallel(par); 382 put_parallel(par);
@@ -543,6 +533,11 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
543fill_invalid_ext: 533fill_invalid_ext:
544 dprintk("%s need to zero %d pages\n", __func__, npg_zero); 534 dprintk("%s need to zero %d pages\n", __func__, npg_zero);
545 for (;npg_zero > 0; npg_zero--) { 535 for (;npg_zero > 0; npg_zero--) {
536 if (bl_is_sector_init(be->be_inval, isect)) {
537 dprintk("isect %llu already init\n",
538 (unsigned long long)isect);
539 goto next_page;
540 }
546 /* page ref released in bl_end_io_write_zero */ 541 /* page ref released in bl_end_io_write_zero */
547 index = isect >> PAGE_CACHE_SECTOR_SHIFT; 542 index = isect >> PAGE_CACHE_SECTOR_SHIFT;
548 dprintk("%s zero %dth page: index %lu isect %llu\n", 543 dprintk("%s zero %dth page: index %lu isect %llu\n",
@@ -562,8 +557,7 @@ fill_invalid_ext:
562 * PageUptodate: It was read before 557 * PageUptodate: It was read before
563 * sector_initialized: already written out 558 * sector_initialized: already written out
564 */ 559 */
565 if (PageDirty(page) || PageWriteback(page) || 560 if (PageDirty(page) || PageWriteback(page)) {
566 bl_is_sector_init(be->be_inval, isect)) {
567 print_page(page); 561 print_page(page);
568 unlock_page(page); 562 unlock_page(page);
569 page_cache_release(page); 563 page_cache_release(page);
@@ -592,6 +586,7 @@ fill_invalid_ext:
592 bl_end_io_write_zero, par); 586 bl_end_io_write_zero, par);
593 if (IS_ERR(bio)) { 587 if (IS_ERR(bio)) {
594 wdata->pnfs_error = PTR_ERR(bio); 588 wdata->pnfs_error = PTR_ERR(bio);
589 bio = NULL;
595 goto out; 590 goto out;
596 } 591 }
597 /* FIXME: This should be done in bi_end_io */ 592 /* FIXME: This should be done in bi_end_io */
@@ -640,6 +635,7 @@ next_page:
640 bl_end_io_write, par); 635 bl_end_io_write, par);
641 if (IS_ERR(bio)) { 636 if (IS_ERR(bio)) {
642 wdata->pnfs_error = PTR_ERR(bio); 637 wdata->pnfs_error = PTR_ERR(bio);
638 bio = NULL;
643 goto out; 639 goto out;
644 } 640 }
645 isect += PAGE_CACHE_SECTORS; 641 isect += PAGE_CACHE_SECTORS;
@@ -805,7 +801,7 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
805 struct nfs4_deviceid *d_id) 801 struct nfs4_deviceid *d_id)
806{ 802{
807 struct pnfs_device *dev; 803 struct pnfs_device *dev;
808 struct pnfs_block_dev *rv = NULL; 804 struct pnfs_block_dev *rv;
809 u32 max_resp_sz; 805 u32 max_resp_sz;
810 int max_pages; 806 int max_pages;
811 struct page **pages = NULL; 807 struct page **pages = NULL;
@@ -823,18 +819,20 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
823 dev = kmalloc(sizeof(*dev), GFP_NOFS); 819 dev = kmalloc(sizeof(*dev), GFP_NOFS);
824 if (!dev) { 820 if (!dev) {
825 dprintk("%s kmalloc failed\n", __func__); 821 dprintk("%s kmalloc failed\n", __func__);
826 return NULL; 822 return ERR_PTR(-ENOMEM);
827 } 823 }
828 824
829 pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS); 825 pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
830 if (pages == NULL) { 826 if (pages == NULL) {
831 kfree(dev); 827 kfree(dev);
832 return NULL; 828 return ERR_PTR(-ENOMEM);
833 } 829 }
834 for (i = 0; i < max_pages; i++) { 830 for (i = 0; i < max_pages; i++) {
835 pages[i] = alloc_page(GFP_NOFS); 831 pages[i] = alloc_page(GFP_NOFS);
836 if (!pages[i]) 832 if (!pages[i]) {
833 rv = ERR_PTR(-ENOMEM);
837 goto out_free; 834 goto out_free;
835 }
838 } 836 }
839 837
840 memcpy(&dev->dev_id, d_id, sizeof(*d_id)); 838 memcpy(&dev->dev_id, d_id, sizeof(*d_id));
@@ -847,8 +845,10 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
847 dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); 845 dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
848 rc = nfs4_proc_getdeviceinfo(server, dev); 846 rc = nfs4_proc_getdeviceinfo(server, dev);
849 dprintk("%s getdevice info returns %d\n", __func__, rc); 847 dprintk("%s getdevice info returns %d\n", __func__, rc);
850 if (rc) 848 if (rc) {
849 rv = ERR_PTR(rc);
851 goto out_free; 850 goto out_free;
851 }
852 852
853 rv = nfs4_blk_decode_device(server, dev); 853 rv = nfs4_blk_decode_device(server, dev);
854 out_free: 854 out_free:
@@ -866,7 +866,7 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
866 struct pnfs_devicelist *dlist = NULL; 866 struct pnfs_devicelist *dlist = NULL;
867 struct pnfs_block_dev *bdev; 867 struct pnfs_block_dev *bdev;
868 LIST_HEAD(block_disklist); 868 LIST_HEAD(block_disklist);
869 int status = 0, i; 869 int status, i;
870 870
871 dprintk("%s enter\n", __func__); 871 dprintk("%s enter\n", __func__);
872 872
@@ -898,8 +898,8 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
898 for (i = 0; i < dlist->num_devs; i++) { 898 for (i = 0; i < dlist->num_devs; i++) {
899 bdev = nfs4_blk_get_deviceinfo(server, fh, 899 bdev = nfs4_blk_get_deviceinfo(server, fh,
900 &dlist->dev_id[i]); 900 &dlist->dev_id[i]);
901 if (!bdev) { 901 if (IS_ERR(bdev)) {
902 status = -ENODEV; 902 status = PTR_ERR(bdev);
903 goto out_error; 903 goto out_error;
904 } 904 }
905 spin_lock(&b_mt_id->bm_lock); 905 spin_lock(&b_mt_id->bm_lock);
@@ -960,7 +960,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
960}; 960};
961 961
962static const struct rpc_pipe_ops bl_upcall_ops = { 962static const struct rpc_pipe_ops bl_upcall_ops = {
963 .upcall = bl_pipe_upcall, 963 .upcall = rpc_pipe_generic_upcall,
964 .downcall = bl_pipe_downcall, 964 .downcall = bl_pipe_downcall,
965 .destroy_msg = bl_pipe_destroy_msg, 965 .destroy_msg = bl_pipe_destroy_msg,
966}; 966};
@@ -989,17 +989,20 @@ static int __init nfs4blocklayout_init(void)
989 mnt, 989 mnt,
990 NFS_PIPE_DIRNAME, 0, &path); 990 NFS_PIPE_DIRNAME, 0, &path);
991 if (ret) 991 if (ret)
992 goto out_remove; 992 goto out_putrpc;
993 993
994 bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL, 994 bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
995 &bl_upcall_ops, 0); 995 &bl_upcall_ops, 0);
996 path_put(&path);
996 if (IS_ERR(bl_device_pipe)) { 997 if (IS_ERR(bl_device_pipe)) {
997 ret = PTR_ERR(bl_device_pipe); 998 ret = PTR_ERR(bl_device_pipe);
998 goto out_remove; 999 goto out_putrpc;
999 } 1000 }
1000out: 1001out:
1001 return ret; 1002 return ret;
1002 1003
1004out_putrpc:
1005 rpc_put_mount();
1003out_remove: 1006out_remove:
1004 pnfs_unregister_layoutdriver(&blocklayout_type); 1007 pnfs_unregister_layoutdriver(&blocklayout_type);
1005 return ret; 1008 return ret;
@@ -1012,6 +1015,7 @@ static void __exit nfs4blocklayout_exit(void)
1012 1015
1013 pnfs_unregister_layoutdriver(&blocklayout_type); 1016 pnfs_unregister_layoutdriver(&blocklayout_type);
1014 rpc_unlink(bl_device_pipe); 1017 rpc_unlink(bl_device_pipe);
1018 rpc_put_mount();
1015} 1019}
1016 1020
1017MODULE_ALIAS("nfs-layouttype4-3"); 1021MODULE_ALIAS("nfs-layouttype4-3");
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index f27d827960a..42acf7ef599 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -150,7 +150,7 @@ BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
150} 150}
151 151
152struct bl_dev_msg { 152struct bl_dev_msg {
153 int status; 153 int32_t status;
154 uint32_t major, minor; 154 uint32_t major, minor;
155}; 155};
156 156
@@ -169,8 +169,6 @@ extern wait_queue_head_t bl_wq;
169#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ 169#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
170 170
171/* blocklayoutdev.c */ 171/* blocklayoutdev.c */
172ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *,
173 char __user *, size_t);
174ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); 172ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
175void bl_pipe_destroy_msg(struct rpc_pipe_msg *); 173void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
176struct block_device *nfs4_blkdev_get(dev_t dev); 174struct block_device *nfs4_blkdev_get(dev_t dev);
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index a83b393fb01..d08ba9107fd 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -79,28 +79,6 @@ int nfs4_blkdev_put(struct block_device *bdev)
79 return blkdev_put(bdev, FMODE_READ); 79 return blkdev_put(bdev, FMODE_READ);
80} 80}
81 81
82/*
83 * Shouldn't there be a rpc_generic_upcall() to do this for us?
84 */
85ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
86 char __user *dst, size_t buflen)
87{
88 char *data = (char *)msg->data + msg->copied;
89 size_t mlen = min(msg->len - msg->copied, buflen);
90 unsigned long left;
91
92 left = copy_to_user(dst, data, mlen);
93 if (left == mlen) {
94 msg->errno = -EFAULT;
95 return -EFAULT;
96 }
97
98 mlen -= left;
99 msg->copied += mlen;
100 msg->errno = 0;
101 return mlen;
102}
103
104static struct bl_dev_msg bl_mount_reply; 82static struct bl_dev_msg bl_mount_reply;
105 83
106ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, 84ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
@@ -131,7 +109,7 @@ struct pnfs_block_dev *
131nfs4_blk_decode_device(struct nfs_server *server, 109nfs4_blk_decode_device(struct nfs_server *server,
132 struct pnfs_device *dev) 110 struct pnfs_device *dev)
133{ 111{
134 struct pnfs_block_dev *rv = NULL; 112 struct pnfs_block_dev *rv;
135 struct block_device *bd = NULL; 113 struct block_device *bd = NULL;
136 struct rpc_pipe_msg msg; 114 struct rpc_pipe_msg msg;
137 struct bl_msg_hdr bl_msg = { 115 struct bl_msg_hdr bl_msg = {
@@ -141,7 +119,7 @@ nfs4_blk_decode_device(struct nfs_server *server,
141 uint8_t *dataptr; 119 uint8_t *dataptr;
142 DECLARE_WAITQUEUE(wq, current); 120 DECLARE_WAITQUEUE(wq, current);
143 struct bl_dev_msg *reply = &bl_mount_reply; 121 struct bl_dev_msg *reply = &bl_mount_reply;
144 int offset, len, i; 122 int offset, len, i, rc;
145 123
146 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); 124 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
147 dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, 125 dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
@@ -168,8 +146,10 @@ nfs4_blk_decode_device(struct nfs_server *server,
168 146
169 dprintk("%s CALLING USERSPACE DAEMON\n", __func__); 147 dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
170 add_wait_queue(&bl_wq, &wq); 148 add_wait_queue(&bl_wq, &wq);
171 if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { 149 rc = rpc_queue_upcall(bl_device_pipe->d_inode, &msg);
150 if (rc < 0) {
172 remove_wait_queue(&bl_wq, &wq); 151 remove_wait_queue(&bl_wq, &wq);
152 rv = ERR_PTR(rc);
173 goto out; 153 goto out;
174 } 154 }
175 155
@@ -187,8 +167,9 @@ nfs4_blk_decode_device(struct nfs_server *server,
187 167
188 bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); 168 bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor));
189 if (IS_ERR(bd)) { 169 if (IS_ERR(bd)) {
190 dprintk("%s failed to open device : %ld\n", 170 rc = PTR_ERR(bd);
191 __func__, PTR_ERR(bd)); 171 dprintk("%s failed to open device : %d\n", __func__, rc);
172 rv = ERR_PTR(rc);
192 goto out; 173 goto out;
193 } 174 }
194 175
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index e3d29426905..516f3375e06 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -125,7 +125,7 @@ nfs4_callback_up(struct svc_serv *serv)
125 else 125 else
126 goto out_err; 126 goto out_err;
127 127
128 return svc_prepare_thread(serv, &serv->sv_pools[0]); 128 return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
129 129
130out_err: 130out_err:
131 if (ret == 0) 131 if (ret == 0)
@@ -199,7 +199,7 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
199 INIT_LIST_HEAD(&serv->sv_cb_list); 199 INIT_LIST_HEAD(&serv->sv_cb_list);
200 spin_lock_init(&serv->sv_cb_lock); 200 spin_lock_init(&serv->sv_cb_lock);
201 init_waitqueue_head(&serv->sv_cb_waitq); 201 init_waitqueue_head(&serv->sv_cb_waitq);
202 rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); 202 rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
203 if (IS_ERR(rqstp)) { 203 if (IS_ERR(rqstp)) {
204 svc_xprt_put(serv->sv_bc_xprt); 204 svc_xprt_put(serv->sv_bc_xprt);
205 serv->sv_bc_xprt = NULL; 205 serv->sv_bc_xprt = NULL;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 918ad647afe..726e59a9e50 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -488,17 +488,18 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp,
488 struct xdr_stream *xdr, 488 struct xdr_stream *xdr,
489 struct cb_recallanyargs *args) 489 struct cb_recallanyargs *args)
490{ 490{
491 __be32 *p; 491 uint32_t bitmap[2];
492 __be32 *p, status;
492 493
493 args->craa_addr = svc_addr(rqstp); 494 args->craa_addr = svc_addr(rqstp);
494 p = read_buf(xdr, 4); 495 p = read_buf(xdr, 4);
495 if (unlikely(p == NULL)) 496 if (unlikely(p == NULL))
496 return htonl(NFS4ERR_BADXDR); 497 return htonl(NFS4ERR_BADXDR);
497 args->craa_objs_to_keep = ntohl(*p++); 498 args->craa_objs_to_keep = ntohl(*p++);
498 p = read_buf(xdr, 4); 499 status = decode_bitmap(xdr, bitmap);
499 if (unlikely(p == NULL)) 500 if (unlikely(status))
500 return htonl(NFS4ERR_BADXDR); 501 return status;
501 args->craa_type_mask = ntohl(*p); 502 args->craa_type_mask = bitmap[0];
502 503
503 return 0; 504 return 0;
504} 505}
@@ -986,4 +987,5 @@ struct svc_version nfs4_callback_version4 = {
986 .vs_proc = nfs4_callback_procedures1, 987 .vs_proc = nfs4_callback_procedures1,
987 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, 988 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
988 .vs_dispatch = NULL, 989 .vs_dispatch = NULL,
990 .vs_hidden = 1,
989}; 991};
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 5833fbbf59b..873bf00d51a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -336,11 +336,12 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
336 const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; 336 const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
337 const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; 337 const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
338 338
339 if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && 339 if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr))
340 sin1->sin6_scope_id != sin2->sin6_scope_id)
341 return 0; 340 return 0;
341 else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL)
342 return sin1->sin6_scope_id == sin2->sin6_scope_id;
342 343
343 return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); 344 return 1;
344} 345}
345#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ 346#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
346static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, 347static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
@@ -1867,6 +1868,10 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
1867 /* display one transport per line on subsequent lines */ 1868 /* display one transport per line on subsequent lines */
1868 clp = list_entry(v, struct nfs_client, cl_share_link); 1869 clp = list_entry(v, struct nfs_client, cl_share_link);
1869 1870
1871 /* Check if the client is initialized */
1872 if (clp->cl_cons_state != NFS_CS_READY)
1873 return 0;
1874
1870 seq_printf(m, "v%u %s %s %3d %s\n", 1875 seq_printf(m, "v%u %s %s %3d %s\n",
1871 clp->rpc_ops->version, 1876 clp->rpc_ops->version,
1872 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), 1877 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 321a66bc384..7f265406980 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -240,7 +240,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
240 sizeof(delegation->stateid.data)); 240 sizeof(delegation->stateid.data));
241 delegation->type = res->delegation_type; 241 delegation->type = res->delegation_type;
242 delegation->maxsize = res->maxsize; 242 delegation->maxsize = res->maxsize;
243 delegation->change_attr = nfsi->change_attr; 243 delegation->change_attr = inode->i_version;
244 delegation->cred = get_rpccred(cred); 244 delegation->cred = get_rpccred(cred);
245 delegation->inode = inode; 245 delegation->inode = inode;
246 delegation->flags = 1<<NFS_DELEGATION_REFERENCED; 246 delegation->flags = 1<<NFS_DELEGATION_REFERENCED;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 28b8c3f3cda..0a1f8312b4d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -137,11 +137,9 @@ nfs_file_open(struct inode *inode, struct file *filp)
137static int 137static int
138nfs_file_release(struct inode *inode, struct file *filp) 138nfs_file_release(struct inode *inode, struct file *filp)
139{ 139{
140 struct dentry *dentry = filp->f_path.dentry;
141
142 dprintk("NFS: release(%s/%s)\n", 140 dprintk("NFS: release(%s/%s)\n",
143 dentry->d_parent->d_name.name, 141 filp->f_path.dentry->d_parent->d_name.name,
144 dentry->d_name.name); 142 filp->f_path.dentry->d_name.name);
145 143
146 nfs_inc_stats(inode, NFSIOS_VFSRELEASE); 144 nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
147 return nfs_release(inode, filp); 145 return nfs_release(inode, filp);
@@ -180,8 +178,6 @@ force_reval:
180 178
181static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) 179static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
182{ 180{
183 loff_t loff;
184
185 dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", 181 dprintk("NFS: llseek file(%s/%s, %lld, %d)\n",
186 filp->f_path.dentry->d_parent->d_name.name, 182 filp->f_path.dentry->d_parent->d_name.name,
187 filp->f_path.dentry->d_name.name, 183 filp->f_path.dentry->d_name.name,
@@ -197,13 +193,9 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
197 int retval = nfs_revalidate_file_size(inode, filp); 193 int retval = nfs_revalidate_file_size(inode, filp);
198 if (retval < 0) 194 if (retval < 0)
199 return (loff_t)retval; 195 return (loff_t)retval;
196 }
200 197
201 spin_lock(&inode->i_lock); 198 return generic_file_llseek(filp, offset, origin);
202 loff = generic_file_llseek_unlocked(filp, offset, origin);
203 spin_unlock(&inode->i_lock);
204 } else
205 loff = generic_file_llseek_unlocked(filp, offset, origin);
206 return loff;
207} 199}
208 200
209/* 201/*
@@ -234,14 +226,13 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
234 struct dentry * dentry = iocb->ki_filp->f_path.dentry; 226 struct dentry * dentry = iocb->ki_filp->f_path.dentry;
235 struct inode * inode = dentry->d_inode; 227 struct inode * inode = dentry->d_inode;
236 ssize_t result; 228 ssize_t result;
237 size_t count = iov_length(iov, nr_segs);
238 229
239 if (iocb->ki_filp->f_flags & O_DIRECT) 230 if (iocb->ki_filp->f_flags & O_DIRECT)
240 return nfs_file_direct_read(iocb, iov, nr_segs, pos); 231 return nfs_file_direct_read(iocb, iov, nr_segs, pos);
241 232
242 dprintk("NFS: read(%s/%s, %lu@%lu)\n", 233 dprintk("NFS: read(%s/%s, %lu@%lu)\n",
243 dentry->d_parent->d_name.name, dentry->d_name.name, 234 dentry->d_parent->d_name.name, dentry->d_name.name,
244 (unsigned long) count, (unsigned long) pos); 235 (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos);
245 236
246 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); 237 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
247 if (!result) { 238 if (!result) {
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 5b1006480bc..7cf2c4699b0 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -212,7 +212,7 @@ static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data,
212 auxdata.ctime = nfsi->vfs_inode.i_ctime; 212 auxdata.ctime = nfsi->vfs_inode.i_ctime;
213 213
214 if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) 214 if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
215 auxdata.change_attr = nfsi->change_attr; 215 auxdata.change_attr = nfsi->vfs_inode.i_version;
216 216
217 if (bufmax > sizeof(auxdata)) 217 if (bufmax > sizeof(auxdata))
218 bufmax = sizeof(auxdata); 218 bufmax = sizeof(auxdata);
@@ -244,7 +244,7 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
244 auxdata.ctime = nfsi->vfs_inode.i_ctime; 244 auxdata.ctime = nfsi->vfs_inode.i_ctime;
245 245
246 if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) 246 if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
247 auxdata.change_attr = nfsi->change_attr; 247 auxdata.change_attr = nfsi->vfs_inode.i_version;
248 248
249 if (memcmp(data, &auxdata, datalen) != 0) 249 if (memcmp(data, &auxdata, datalen) != 0)
250 return FSCACHE_CHECKAUX_OBSOLETE; 250 return FSCACHE_CHECKAUX_OBSOLETE;
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index f20801ae0a1..47d1c6ff2d8 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -336,8 +336,6 @@ struct idmap {
336 struct idmap_hashtable idmap_group_hash; 336 struct idmap_hashtable idmap_group_hash;
337}; 337};
338 338
339static ssize_t idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *,
340 char __user *, size_t);
341static ssize_t idmap_pipe_downcall(struct file *, const char __user *, 339static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
342 size_t); 340 size_t);
343static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); 341static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
@@ -345,7 +343,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
345static unsigned int fnvhash32(const void *, size_t); 343static unsigned int fnvhash32(const void *, size_t);
346 344
347static const struct rpc_pipe_ops idmap_upcall_ops = { 345static const struct rpc_pipe_ops idmap_upcall_ops = {
348 .upcall = idmap_pipe_upcall, 346 .upcall = rpc_pipe_generic_upcall,
349 .downcall = idmap_pipe_downcall, 347 .downcall = idmap_pipe_downcall,
350 .destroy_msg = idmap_pipe_destroy_msg, 348 .destroy_msg = idmap_pipe_destroy_msg,
351}; 349};
@@ -595,27 +593,6 @@ nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h,
595 return ret; 593 return ret;
596} 594}
597 595
598/* RPC pipefs upcall/downcall routines */
599static ssize_t
600idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
601 char __user *dst, size_t buflen)
602{
603 char *data = (char *)msg->data + msg->copied;
604 size_t mlen = min(msg->len, buflen);
605 unsigned long left;
606
607 left = copy_to_user(dst, data, mlen);
608 if (left == mlen) {
609 msg->errno = -EFAULT;
610 return -EFAULT;
611 }
612
613 mlen -= left;
614 msg->copied += mlen;
615 msg->errno = 0;
616 return mlen;
617}
618
619static ssize_t 596static ssize_t
620idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) 597idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
621{ 598{
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index fe1203797b2..c07a55aec83 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -318,9 +318,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
318 memset(&inode->i_atime, 0, sizeof(inode->i_atime)); 318 memset(&inode->i_atime, 0, sizeof(inode->i_atime));
319 memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); 319 memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
320 memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); 320 memset(&inode->i_ctime, 0, sizeof(inode->i_ctime));
321 nfsi->change_attr = 0; 321 inode->i_version = 0;
322 inode->i_size = 0; 322 inode->i_size = 0;
323 inode->i_nlink = 0; 323 clear_nlink(inode);
324 inode->i_uid = -2; 324 inode->i_uid = -2;
325 inode->i_gid = -2; 325 inode->i_gid = -2;
326 inode->i_blocks = 0; 326 inode->i_blocks = 0;
@@ -344,7 +344,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
344 | NFS_INO_INVALID_ACCESS 344 | NFS_INO_INVALID_ACCESS
345 | NFS_INO_INVALID_ACL; 345 | NFS_INO_INVALID_ACL;
346 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) 346 if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
347 nfsi->change_attr = fattr->change_attr; 347 inode->i_version = fattr->change_attr;
348 else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) 348 else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
349 nfsi->cache_validity |= NFS_INO_INVALID_ATTR 349 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
350 | NFS_INO_INVALID_DATA; 350 | NFS_INO_INVALID_DATA;
@@ -355,7 +355,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
355 | NFS_INO_INVALID_DATA 355 | NFS_INO_INVALID_DATA
356 | NFS_INO_REVAL_PAGECACHE; 356 | NFS_INO_REVAL_PAGECACHE;
357 if (fattr->valid & NFS_ATTR_FATTR_NLINK) 357 if (fattr->valid & NFS_ATTR_FATTR_NLINK)
358 inode->i_nlink = fattr->nlink; 358 set_nlink(inode, fattr->nlink);
359 else if (nfs_server_capable(inode, NFS_CAP_NLINK)) 359 else if (nfs_server_capable(inode, NFS_CAP_NLINK))
360 nfsi->cache_validity |= NFS_INO_INVALID_ATTR; 360 nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
361 if (fattr->valid & NFS_ATTR_FATTR_OWNER) 361 if (fattr->valid & NFS_ATTR_FATTR_OWNER)
@@ -897,8 +897,8 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
897 897
898 if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) 898 if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
899 && (fattr->valid & NFS_ATTR_FATTR_CHANGE) 899 && (fattr->valid & NFS_ATTR_FATTR_CHANGE)
900 && nfsi->change_attr == fattr->pre_change_attr) { 900 && inode->i_version == fattr->pre_change_attr) {
901 nfsi->change_attr = fattr->change_attr; 901 inode->i_version = fattr->change_attr;
902 if (S_ISDIR(inode->i_mode)) 902 if (S_ISDIR(inode->i_mode))
903 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 903 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
904 ret |= NFS_INO_INVALID_ATTR; 904 ret |= NFS_INO_INVALID_ATTR;
@@ -952,7 +952,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
952 return -EIO; 952 return -EIO;
953 953
954 if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && 954 if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
955 nfsi->change_attr != fattr->change_attr) 955 inode->i_version != fattr->change_attr)
956 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 956 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
957 957
958 /* Verify a few of the more important attributes */ 958 /* Verify a few of the more important attributes */
@@ -1163,7 +1163,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
1163 } 1163 }
1164 if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && 1164 if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
1165 (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { 1165 (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) {
1166 fattr->pre_change_attr = NFS_I(inode)->change_attr; 1166 fattr->pre_change_attr = inode->i_version;
1167 fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; 1167 fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
1168 } 1168 }
1169 if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && 1169 if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 &&
@@ -1244,13 +1244,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1244 1244
1245 /* More cache consistency checks */ 1245 /* More cache consistency checks */
1246 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { 1246 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
1247 if (nfsi->change_attr != fattr->change_attr) { 1247 if (inode->i_version != fattr->change_attr) {
1248 dprintk("NFS: change_attr change on server for file %s/%ld\n", 1248 dprintk("NFS: change_attr change on server for file %s/%ld\n",
1249 inode->i_sb->s_id, inode->i_ino); 1249 inode->i_sb->s_id, inode->i_ino);
1250 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1250 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1251 if (S_ISDIR(inode->i_mode)) 1251 if (S_ISDIR(inode->i_mode))
1252 nfs_force_lookup_revalidate(inode); 1252 nfs_force_lookup_revalidate(inode);
1253 nfsi->change_attr = fattr->change_attr; 1253 inode->i_version = fattr->change_attr;
1254 } 1254 }
1255 } else if (server->caps & NFS_CAP_CHANGE_ATTR) 1255 } else if (server->caps & NFS_CAP_CHANGE_ATTR)
1256 invalid |= save_cache_validity; 1256 invalid |= save_cache_validity;
@@ -1361,7 +1361,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1361 invalid |= NFS_INO_INVALID_ATTR; 1361 invalid |= NFS_INO_INVALID_ATTR;
1362 if (S_ISDIR(inode->i_mode)) 1362 if (S_ISDIR(inode->i_mode))
1363 invalid |= NFS_INO_INVALID_DATA; 1363 invalid |= NFS_INO_INVALID_DATA;
1364 inode->i_nlink = fattr->nlink; 1364 set_nlink(inode, fattr->nlink);
1365 } 1365 }
1366 } else if (server->caps & NFS_CAP_NLINK) 1366 } else if (server->caps & NFS_CAP_NLINK)
1367 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR 1367 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ab12913dd47..c1a1bd8ddf1 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -457,13 +457,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
457 PAGE_SIZE - 1) >> PAGE_SHIFT; 457 PAGE_SIZE - 1) >> PAGE_SHIFT;
458} 458}
459 459
460/*
461 * Helper for restarting RPC calls in the possible presence of NFSv4.1
462 * sessions.
463 */
464static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp)
465{
466 if (nfs4_has_session(clp))
467 return rpc_restart_call_prepare(task);
468 return rpc_restart_call(task);
469}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 3e93e9a1bee..693ae22f873 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -13,30 +13,6 @@
13 13
14struct idmap; 14struct idmap;
15 15
16/*
17 * In a seqid-mutating op, this macro controls which error return
18 * values trigger incrementation of the seqid.
19 *
20 * from rfc 3010:
21 * The client MUST monotonically increment the sequence number for the
22 * CLOSE, LOCK, LOCKU, OPEN, OPEN_CONFIRM, and OPEN_DOWNGRADE
23 * operations. This is true even in the event that the previous
24 * operation that used the sequence number received an error. The only
25 * exception to this rule is if the previous operation received one of
26 * the following errors: NFSERR_STALE_CLIENTID, NFSERR_STALE_STATEID,
27 * NFSERR_BAD_STATEID, NFSERR_BAD_SEQID, NFSERR_BADXDR,
28 * NFSERR_RESOURCE, NFSERR_NOFILEHANDLE.
29 *
30 */
31#define seqid_mutating_err(err) \
32(((err) != NFSERR_STALE_CLIENTID) && \
33 ((err) != NFSERR_STALE_STATEID) && \
34 ((err) != NFSERR_BAD_STATEID) && \
35 ((err) != NFSERR_BAD_SEQID) && \
36 ((err) != NFSERR_BAD_XDR) && \
37 ((err) != NFSERR_RESOURCE) && \
38 ((err) != NFSERR_NOFILEHANDLE))
39
40enum nfs4_client_state { 16enum nfs4_client_state {
41 NFS4CLNT_MANAGER_RUNNING = 0, 17 NFS4CLNT_MANAGER_RUNNING = 0,
42 NFS4CLNT_CHECK_LEASE, 18 NFS4CLNT_CHECK_LEASE,
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index e8915d4840a..a62d36b9a99 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -31,6 +31,7 @@
31 31
32#include <linux/nfs_fs.h> 32#include <linux/nfs_fs.h>
33#include <linux/nfs_page.h> 33#include <linux/nfs_page.h>
34#include <linux/module.h>
34 35
35#include "internal.h" 36#include "internal.h"
36#include "nfs4filelayout.h" 37#include "nfs4filelayout.h"
@@ -77,19 +78,6 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
77 BUG(); 78 BUG();
78} 79}
79 80
80/* For data server errors we don't recover from */
81static void
82filelayout_set_lo_fail(struct pnfs_layout_segment *lseg)
83{
84 if (lseg->pls_range.iomode == IOMODE_RW) {
85 dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
86 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
87 } else {
88 dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
89 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
90 }
91}
92
93static int filelayout_async_handle_error(struct rpc_task *task, 81static int filelayout_async_handle_error(struct rpc_task *task,
94 struct nfs4_state *state, 82 struct nfs4_state *state,
95 struct nfs_client *clp, 83 struct nfs_client *clp,
@@ -135,7 +123,6 @@ static int filelayout_async_handle_error(struct rpc_task *task,
135static int filelayout_read_done_cb(struct rpc_task *task, 123static int filelayout_read_done_cb(struct rpc_task *task,
136 struct nfs_read_data *data) 124 struct nfs_read_data *data)
137{ 125{
138 struct nfs_client *clp = data->ds_clp;
139 int reset = 0; 126 int reset = 0;
140 127
141 dprintk("%s DS read\n", __func__); 128 dprintk("%s DS read\n", __func__);
@@ -145,11 +132,10 @@ static int filelayout_read_done_cb(struct rpc_task *task,
145 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", 132 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
146 __func__, data->ds_clp, data->ds_clp->cl_session); 133 __func__, data->ds_clp, data->ds_clp->cl_session);
147 if (reset) { 134 if (reset) {
148 filelayout_set_lo_fail(data->lseg); 135 pnfs_set_lo_fail(data->lseg);
149 nfs4_reset_read(task, data); 136 nfs4_reset_read(task, data);
150 clp = NFS_SERVER(data->inode)->nfs_client;
151 } 137 }
152 nfs_restart_rpc(task, clp); 138 rpc_restart_call_prepare(task);
153 return -EAGAIN; 139 return -EAGAIN;
154 } 140 }
155 141
@@ -216,17 +202,13 @@ static int filelayout_write_done_cb(struct rpc_task *task,
216 202
217 if (filelayout_async_handle_error(task, data->args.context->state, 203 if (filelayout_async_handle_error(task, data->args.context->state,
218 data->ds_clp, &reset) == -EAGAIN) { 204 data->ds_clp, &reset) == -EAGAIN) {
219 struct nfs_client *clp;
220
221 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", 205 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
222 __func__, data->ds_clp, data->ds_clp->cl_session); 206 __func__, data->ds_clp, data->ds_clp->cl_session);
223 if (reset) { 207 if (reset) {
224 filelayout_set_lo_fail(data->lseg); 208 pnfs_set_lo_fail(data->lseg);
225 nfs4_reset_write(task, data); 209 nfs4_reset_write(task, data);
226 clp = NFS_SERVER(data->inode)->nfs_client; 210 }
227 } else 211 rpc_restart_call_prepare(task);
228 clp = data->ds_clp;
229 nfs_restart_rpc(task, clp);
230 return -EAGAIN; 212 return -EAGAIN;
231 } 213 }
232 214
@@ -256,9 +238,9 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
256 __func__, data->ds_clp, data->ds_clp->cl_session); 238 __func__, data->ds_clp, data->ds_clp->cl_session);
257 if (reset) { 239 if (reset) {
258 prepare_to_resend_writes(data); 240 prepare_to_resend_writes(data);
259 filelayout_set_lo_fail(data->lseg); 241 pnfs_set_lo_fail(data->lseg);
260 } else 242 } else
261 nfs_restart_rpc(task, data->ds_clp); 243 rpc_restart_call_prepare(task);
262 return -EAGAIN; 244 return -EAGAIN;
263 } 245 }
264 246
@@ -468,9 +450,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
468 450
469 fl->dsaddr = dsaddr; 451 fl->dsaddr = dsaddr;
470 452
471 if (fl->first_stripe_index < 0 || 453 if (fl->first_stripe_index >= dsaddr->stripe_count) {
472 fl->first_stripe_index >= dsaddr->stripe_count) { 454 dprintk("%s Bad first_stripe_index %u\n",
473 dprintk("%s Bad first_stripe_index %d\n",
474 __func__, fl->first_stripe_index); 455 __func__, fl->first_stripe_index);
475 goto out_put; 456 goto out_put;
476 } 457 }
@@ -571,7 +552,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
571 552
572 /* Note that a zero value for num_fh is legal for STRIPE_SPARSE. 553 /* Note that a zero value for num_fh is legal for STRIPE_SPARSE.
573 * Futher checking is done in filelayout_check_layout */ 554 * Futher checking is done in filelayout_check_layout */
574 if (fl->num_fh < 0 || fl->num_fh > 555 if (fl->num_fh >
575 max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) 556 max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT))
576 goto out_err; 557 goto out_err;
577 558
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4700fae1ada..b60fddf606f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -73,9 +73,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data);
73static int _nfs4_recover_proc_open(struct nfs4_opendata *data); 73static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
74static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 74static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
75static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); 75static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
76static int _nfs4_proc_lookup(struct rpc_clnt *client, struct inode *dir,
77 const struct qstr *name, struct nfs_fh *fhandle,
78 struct nfs_fattr *fattr);
79static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 76static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
80static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, 77static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
81 struct nfs_fattr *fattr, struct iattr *sattr, 78 struct nfs_fattr *fattr, struct iattr *sattr,
@@ -753,9 +750,9 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
753 750
754 spin_lock(&dir->i_lock); 751 spin_lock(&dir->i_lock);
755 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA; 752 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
756 if (!cinfo->atomic || cinfo->before != nfsi->change_attr) 753 if (!cinfo->atomic || cinfo->before != dir->i_version)
757 nfs_force_lookup_revalidate(dir); 754 nfs_force_lookup_revalidate(dir);
758 nfsi->change_attr = cinfo->after; 755 dir->i_version = cinfo->after;
759 spin_unlock(&dir->i_lock); 756 spin_unlock(&dir->i_lock);
760} 757}
761 758
@@ -1596,8 +1593,14 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1596 int status; 1593 int status;
1597 1594
1598 status = nfs4_run_open_task(data, 0); 1595 status = nfs4_run_open_task(data, 0);
1599 if (status != 0 || !data->rpc_done) 1596 if (!data->rpc_done)
1597 return status;
1598 if (status != 0) {
1599 if (status == -NFS4ERR_BADNAME &&
1600 !(o_arg->open_flags & O_CREAT))
1601 return -ENOENT;
1600 return status; 1602 return status;
1603 }
1601 1604
1602 if (o_arg->open_flags & O_CREAT) { 1605 if (o_arg->open_flags & O_CREAT) {
1603 update_changeattr(dir, &o_res->cinfo); 1606 update_changeattr(dir, &o_res->cinfo);
@@ -2408,14 +2411,15 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
2408 return status; 2411 return status;
2409} 2412}
2410 2413
2411static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server, 2414static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
2412 const struct nfs_fh *dirfh, const struct qstr *name, 2415 const struct qstr *name, struct nfs_fh *fhandle,
2413 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 2416 struct nfs_fattr *fattr)
2414{ 2417{
2418 struct nfs_server *server = NFS_SERVER(dir);
2415 int status; 2419 int status;
2416 struct nfs4_lookup_arg args = { 2420 struct nfs4_lookup_arg args = {
2417 .bitmask = server->attr_bitmask, 2421 .bitmask = server->attr_bitmask,
2418 .dir_fh = dirfh, 2422 .dir_fh = NFS_FH(dir),
2419 .name = name, 2423 .name = name,
2420 }; 2424 };
2421 struct nfs4_lookup_res res = { 2425 struct nfs4_lookup_res res = {
@@ -2431,40 +2435,8 @@ static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server,
2431 2435
2432 nfs_fattr_init(fattr); 2436 nfs_fattr_init(fattr);
2433 2437
2434 dprintk("NFS call lookupfh %s\n", name->name);
2435 status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0);
2436 dprintk("NFS reply lookupfh: %d\n", status);
2437 return status;
2438}
2439
2440static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh,
2441 struct qstr *name, struct nfs_fh *fhandle,
2442 struct nfs_fattr *fattr)
2443{
2444 struct nfs4_exception exception = { };
2445 int err;
2446 do {
2447 err = _nfs4_proc_lookupfh(server->client, server, dirfh, name, fhandle, fattr);
2448 /* FIXME: !!!! */
2449 if (err == -NFS4ERR_MOVED) {
2450 err = -EREMOTE;
2451 break;
2452 }
2453 err = nfs4_handle_exception(server, err, &exception);
2454 } while (exception.retry);
2455 return err;
2456}
2457
2458static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
2459 const struct qstr *name, struct nfs_fh *fhandle,
2460 struct nfs_fattr *fattr)
2461{
2462 int status;
2463
2464 dprintk("NFS call lookup %s\n", name->name); 2438 dprintk("NFS call lookup %s\n", name->name);
2465 status = _nfs4_proc_lookupfh(clnt, NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr); 2439 status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0);
2466 if (status == -NFS4ERR_MOVED)
2467 status = nfs4_get_referral(dir, name, fattr, fhandle);
2468 dprintk("NFS reply lookup: %d\n", status); 2440 dprintk("NFS reply lookup: %d\n", status);
2469 return status; 2441 return status;
2470} 2442}
@@ -2485,11 +2457,20 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst
2485 struct nfs4_exception exception = { }; 2457 struct nfs4_exception exception = { };
2486 int err; 2458 int err;
2487 do { 2459 do {
2488 err = nfs4_handle_exception(NFS_SERVER(dir), 2460 int status;
2489 _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr), 2461
2490 &exception); 2462 status = _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr);
2491 if (err == -EPERM) 2463 switch (status) {
2464 case -NFS4ERR_BADNAME:
2465 return -ENOENT;
2466 case -NFS4ERR_MOVED:
2467 err = nfs4_get_referral(dir, name, fattr, fhandle);
2468 break;
2469 case -NFS4ERR_WRONGSEC:
2492 nfs_fixup_secinfo_attributes(fattr, fhandle); 2470 nfs_fixup_secinfo_attributes(fattr, fhandle);
2471 }
2472 err = nfs4_handle_exception(NFS_SERVER(dir),
2473 status, &exception);
2493 } while (exception.retry); 2474 } while (exception.retry);
2494 return err; 2475 return err;
2495} 2476}
@@ -3210,7 +3191,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
3210 struct nfs_server *server = NFS_SERVER(data->inode); 3191 struct nfs_server *server = NFS_SERVER(data->inode);
3211 3192
3212 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { 3193 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
3213 nfs_restart_rpc(task, server->nfs_client); 3194 rpc_restart_call_prepare(task);
3214 return -EAGAIN; 3195 return -EAGAIN;
3215 } 3196 }
3216 3197
@@ -3260,7 +3241,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data
3260 struct inode *inode = data->inode; 3241 struct inode *inode = data->inode;
3261 3242
3262 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { 3243 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
3263 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); 3244 rpc_restart_call_prepare(task);
3264 return -EAGAIN; 3245 return -EAGAIN;
3265 } 3246 }
3266 if (task->tk_status >= 0) { 3247 if (task->tk_status >= 0) {
@@ -3317,7 +3298,7 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *dat
3317 struct inode *inode = data->inode; 3298 struct inode *inode = data->inode;
3318 3299
3319 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { 3300 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
3320 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); 3301 rpc_restart_call_prepare(task);
3321 return -EAGAIN; 3302 return -EAGAIN;
3322 } 3303 }
3323 nfs_refresh_inode(inode, data->res.fattr); 3304 nfs_refresh_inode(inode, data->res.fattr);
@@ -3857,7 +3838,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
3857 default: 3838 default:
3858 if (nfs4_async_handle_error(task, data->res.server, NULL) == 3839 if (nfs4_async_handle_error(task, data->res.server, NULL) ==
3859 -EAGAIN) { 3840 -EAGAIN) {
3860 nfs_restart_rpc(task, data->res.server->nfs_client); 3841 rpc_restart_call_prepare(task);
3861 return; 3842 return;
3862 } 3843 }
3863 } 3844 }
@@ -4111,8 +4092,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
4111 break; 4092 break;
4112 default: 4093 default:
4113 if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) 4094 if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
4114 nfs_restart_rpc(task, 4095 rpc_restart_call_prepare(task);
4115 calldata->server->nfs_client);
4116 } 4096 }
4117} 4097}
4118 4098
@@ -4945,7 +4925,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
4945 task->tk_status = 0; 4925 task->tk_status = 0;
4946 /* fall through */ 4926 /* fall through */
4947 case -NFS4ERR_RETRY_UNCACHED_REP: 4927 case -NFS4ERR_RETRY_UNCACHED_REP:
4948 nfs_restart_rpc(task, data->clp); 4928 rpc_restart_call_prepare(task);
4949 return; 4929 return;
4950 } 4930 }
4951 dprintk("<-- %s\n", __func__); 4931 dprintk("<-- %s\n", __func__);
@@ -5786,7 +5766,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
5786 5766
5787 server = NFS_SERVER(lrp->args.inode); 5767 server = NFS_SERVER(lrp->args.inode);
5788 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { 5768 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
5789 nfs_restart_rpc(task, lrp->clp); 5769 rpc_restart_call_prepare(task);
5790 return; 5770 return;
5791 } 5771 }
5792 spin_lock(&lo->plh_inode->i_lock); 5772 spin_lock(&lo->plh_inode->i_lock);
@@ -5957,7 +5937,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
5957 } 5937 }
5958 5938
5959 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { 5939 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
5960 nfs_restart_rpc(task, server->nfs_client); 5940 rpc_restart_call_prepare(task);
5961 return; 5941 return;
5962 } 5942 }
5963 5943
@@ -5970,6 +5950,7 @@ static void nfs4_layoutcommit_release(void *calldata)
5970{ 5950{
5971 struct nfs4_layoutcommit_data *data = calldata; 5951 struct nfs4_layoutcommit_data *data = calldata;
5972 struct pnfs_layout_segment *lseg, *tmp; 5952 struct pnfs_layout_segment *lseg, *tmp;
5953 unsigned long *bitlock = &NFS_I(data->args.inode)->flags;
5973 5954
5974 pnfs_cleanup_layoutcommit(data); 5955 pnfs_cleanup_layoutcommit(data);
5975 /* Matched by references in pnfs_set_layoutcommit */ 5956 /* Matched by references in pnfs_set_layoutcommit */
@@ -5979,6 +5960,11 @@ static void nfs4_layoutcommit_release(void *calldata)
5979 &lseg->pls_flags)) 5960 &lseg->pls_flags))
5980 put_lseg(lseg); 5961 put_lseg(lseg);
5981 } 5962 }
5963
5964 clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
5965 smp_mb__after_clear_bit();
5966 wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
5967
5982 put_rpccred(data->cred); 5968 put_rpccred(data->cred);
5983 kfree(data); 5969 kfree(data);
5984} 5970}
@@ -6270,7 +6256,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
6270 .getroot = nfs4_proc_get_root, 6256 .getroot = nfs4_proc_get_root,
6271 .getattr = nfs4_proc_getattr, 6257 .getattr = nfs4_proc_getattr,
6272 .setattr = nfs4_proc_setattr, 6258 .setattr = nfs4_proc_setattr,
6273 .lookupfh = nfs4_proc_lookupfh,
6274 .lookup = nfs4_proc_lookup, 6259 .lookup = nfs4_proc_lookup,
6275 .access = nfs4_proc_access, 6260 .access = nfs4_proc_access,
6276 .readlink = nfs4_proc_readlink, 6261 .readlink = nfs4_proc_readlink,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1dce12f41a4..e6161b213ed 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -6602,8 +6602,6 @@ static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp,
6602 if (status) 6602 if (status)
6603 goto out; 6603 goto out;
6604 status = decode_secinfo(xdr, res); 6604 status = decode_secinfo(xdr, res);
6605 if (status)
6606 goto out;
6607out: 6605out:
6608 return status; 6606 return status;
6609} 6607}
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index d0cda12fddc..c807ab93140 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -38,21 +38,15 @@
38 */ 38 */
39 39
40#include <linux/module.h> 40#include <linux/module.h>
41#include <scsi/osd_initiator.h> 41#include <scsi/osd_ore.h>
42 42
43#include "objlayout.h" 43#include "objlayout.h"
44 44
45#define NFSDBG_FACILITY NFSDBG_PNFS_LD 45#define NFSDBG_FACILITY NFSDBG_PNFS_LD
46 46
47#define _LLU(x) ((unsigned long long)x)
48
49enum { BIO_MAX_PAGES_KMALLOC =
50 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
51};
52
53struct objio_dev_ent { 47struct objio_dev_ent {
54 struct nfs4_deviceid_node id_node; 48 struct nfs4_deviceid_node id_node;
55 struct osd_dev *od; 49 struct ore_dev od;
56}; 50};
57 51
58static void 52static void
@@ -60,8 +54,8 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
60{ 54{
61 struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); 55 struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
62 56
63 dprintk("%s: free od=%p\n", __func__, de->od); 57 dprintk("%s: free od=%p\n", __func__, de->od.od);
64 osduld_put_device(de->od); 58 osduld_put_device(de->od.od);
65 kfree(de); 59 kfree(de);
66} 60}
67 61
@@ -98,12 +92,12 @@ _dev_list_add(const struct nfs_server *nfss,
98 nfss->pnfs_curr_ld, 92 nfss->pnfs_curr_ld,
99 nfss->nfs_client, 93 nfss->nfs_client,
100 d_id); 94 d_id);
101 de->od = od; 95 de->od.od = od;
102 96
103 d = nfs4_insert_deviceid_node(&de->id_node); 97 d = nfs4_insert_deviceid_node(&de->id_node);
104 n = container_of(d, struct objio_dev_ent, id_node); 98 n = container_of(d, struct objio_dev_ent, id_node);
105 if (n != de) { 99 if (n != de) {
106 dprintk("%s: Race with other n->od=%p\n", __func__, n->od); 100 dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
107 objio_free_deviceid_node(&de->id_node); 101 objio_free_deviceid_node(&de->id_node);
108 de = n; 102 de = n;
109 } 103 }
@@ -111,28 +105,11 @@ _dev_list_add(const struct nfs_server *nfss,
111 return de; 105 return de;
112} 106}
113 107
114struct caps_buffers {
115 u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
116 u8 creds[OSD_CAP_LEN];
117};
118
119struct objio_segment { 108struct objio_segment {
120 struct pnfs_layout_segment lseg; 109 struct pnfs_layout_segment lseg;
121 110
122 struct pnfs_osd_object_cred *comps; 111 struct ore_layout layout;
123 112 struct ore_components oc;
124 unsigned mirrors_p1;
125 unsigned stripe_unit;
126 unsigned group_width; /* Data stripe_units without integrity comps */
127 u64 group_depth;
128 unsigned group_count;
129
130 unsigned max_io_size;
131
132 unsigned comps_index;
133 unsigned num_comps;
134 /* variable length */
135 struct objio_dev_ent *ods[];
136}; 113};
137 114
138static inline struct objio_segment * 115static inline struct objio_segment *
@@ -141,59 +118,44 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg)
141 return container_of(lseg, struct objio_segment, lseg); 118 return container_of(lseg, struct objio_segment, lseg);
142} 119}
143 120
144struct objio_state;
145typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
146
147struct objio_state { 121struct objio_state {
148 /* Generic layer */ 122 /* Generic layer */
149 struct objlayout_io_state ol_state; 123 struct objlayout_io_res oir;
150 124
151 struct objio_segment *layout; 125 bool sync;
152 126 /*FIXME: Support for extra_bytes at ore_get_rw_state() */
153 struct kref kref; 127 struct ore_io_state *ios;
154 objio_done_fn done;
155 void *private;
156
157 unsigned long length;
158 unsigned numdevs; /* Actually used devs in this IO */
159 /* A per-device variable array of size numdevs */
160 struct _objio_per_comp {
161 struct bio *bio;
162 struct osd_request *or;
163 unsigned long length;
164 u64 offset;
165 unsigned dev;
166 } per_dev[];
167}; 128};
168 129
169/* Send and wait for a get_device_info of devices in the layout, 130/* Send and wait for a get_device_info of devices in the layout,
170 then look them up with the osd_initiator library */ 131 then look them up with the osd_initiator library */
171static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, 132static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
172 struct objio_segment *objio_seg, unsigned comp, 133 struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id,
173 gfp_t gfp_flags) 134 gfp_t gfp_flags)
174{ 135{
175 struct pnfs_osd_deviceaddr *deviceaddr; 136 struct pnfs_osd_deviceaddr *deviceaddr;
176 struct nfs4_deviceid *d_id;
177 struct objio_dev_ent *ode; 137 struct objio_dev_ent *ode;
178 struct osd_dev *od; 138 struct osd_dev *od;
179 struct osd_dev_info odi; 139 struct osd_dev_info odi;
180 int err; 140 int err;
181 141
182 d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
183
184 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); 142 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
185 if (ode) 143 if (ode) {
186 return ode; 144 objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
145 return 0;
146 }
187 147
188 err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); 148 err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
189 if (unlikely(err)) { 149 if (unlikely(err)) {
190 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", 150 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
191 __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); 151 __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
192 return ERR_PTR(err); 152 return err;
193 } 153 }
194 154
195 odi.systemid_len = deviceaddr->oda_systemid.len; 155 odi.systemid_len = deviceaddr->oda_systemid.len;
196 if (odi.systemid_len > sizeof(odi.systemid)) { 156 if (odi.systemid_len > sizeof(odi.systemid)) {
157 dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n",
158 __func__, sizeof(odi.systemid));
197 err = -EINVAL; 159 err = -EINVAL;
198 goto out; 160 goto out;
199 } else if (odi.systemid_len) 161 } else if (odi.systemid_len)
@@ -218,96 +180,53 @@ static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
218 180
219 ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, 181 ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
220 gfp_flags); 182 gfp_flags);
221 183 objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
184 dprintk("Adding new dev_id(%llx:%llx)\n",
185 _DEVID_LO(d_id), _DEVID_HI(d_id));
222out: 186out:
223 dprintk("%s: return=%d\n", __func__, err);
224 objlayout_put_deviceinfo(deviceaddr); 187 objlayout_put_deviceinfo(deviceaddr);
225 return err ? ERR_PTR(err) : ode; 188 return err;
226} 189}
227 190
228static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, 191static void copy_single_comp(struct ore_components *oc, unsigned c,
229 struct objio_segment *objio_seg, 192 struct pnfs_osd_object_cred *src_comp)
230 gfp_t gfp_flags)
231{ 193{
232 unsigned i; 194 struct ore_comp *ocomp = &oc->comps[c];
233 int err;
234 195
235 /* lookup all devices */ 196 WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */
236 for (i = 0; i < objio_seg->num_comps; i++) { 197 WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred));
237 struct objio_dev_ent *ode;
238 198
239 ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags); 199 ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id;
240 if (unlikely(IS_ERR(ode))) { 200 ocomp->obj.id = src_comp->oc_object_id.oid_object_id;
241 err = PTR_ERR(ode);
242 goto out;
243 }
244 objio_seg->ods[i] = ode;
245 }
246 err = 0;
247 201
248out: 202 memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred));
249 dprintk("%s: return=%d\n", __func__, err);
250 return err;
251} 203}
252 204
253static int _verify_data_map(struct pnfs_osd_layout *layout) 205int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
206 struct objio_segment **pseg)
254{ 207{
255 struct pnfs_osd_data_map *data_map = &layout->olo_map; 208 struct __alloc_objio_segment {
256 u64 stripe_length; 209 struct objio_segment olseg;
257 u32 group_width; 210 struct ore_dev *ods[numdevs];
258 211 struct ore_comp comps[numdevs];
259/* FIXME: Only raid0 for now. if not go through MDS */ 212 } *aolseg;
260 if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
261 printk(KERN_ERR "Only RAID_0 for now\n");
262 return -ENOTSUPP;
263 }
264 if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
265 printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
266 data_map->odm_num_comps, data_map->odm_mirror_cnt);
267 return -EINVAL;
268 }
269 213
270 if (data_map->odm_group_width) 214 aolseg = kzalloc(sizeof(*aolseg), gfp_flags);
271 group_width = data_map->odm_group_width; 215 if (unlikely(!aolseg)) {
272 else 216 dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__,
273 group_width = data_map->odm_num_comps / 217 numdevs, sizeof(*aolseg));
274 (data_map->odm_mirror_cnt + 1); 218 return -ENOMEM;
275
276 stripe_length = (u64)data_map->odm_stripe_unit * group_width;
277 if (stripe_length >= (1ULL << 32)) {
278 printk(KERN_ERR "Total Stripe length(0x%llx)"
279 " >= 32bit is not supported\n", _LLU(stripe_length));
280 return -ENOTSUPP;
281 } 219 }
282 220
283 if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { 221 aolseg->olseg.oc.numdevs = numdevs;
284 printk(KERN_ERR "Stripe Unit(0x%llx)" 222 aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS;
285 " must be Multples of PAGE_SIZE(0x%lx)\n", 223 aolseg->olseg.oc.comps = aolseg->comps;
286 _LLU(data_map->odm_stripe_unit), PAGE_SIZE); 224 aolseg->olseg.oc.ods = aolseg->ods;
287 return -ENOTSUPP;
288 }
289 225
226 *pseg = &aolseg->olseg;
290 return 0; 227 return 0;
291} 228}
292 229
293static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
294 struct pnfs_osd_object_cred *src_comp,
295 struct caps_buffers *caps_p)
296{
297 WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
298 WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));
299
300 *cur_comp = *src_comp;
301
302 memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred,
303 sizeof(caps_p->caps_key));
304 cur_comp->oc_cap_key.cred = caps_p->caps_key;
305
306 memcpy(caps_p->creds, src_comp->oc_cap.cred,
307 sizeof(caps_p->creds));
308 cur_comp->oc_cap.cred = caps_p->creds;
309}
310
311int objio_alloc_lseg(struct pnfs_layout_segment **outp, 230int objio_alloc_lseg(struct pnfs_layout_segment **outp,
312 struct pnfs_layout_hdr *pnfslay, 231 struct pnfs_layout_hdr *pnfslay,
313 struct pnfs_layout_range *range, 232 struct pnfs_layout_range *range,
@@ -317,59 +236,43 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
317 struct objio_segment *objio_seg; 236 struct objio_segment *objio_seg;
318 struct pnfs_osd_xdr_decode_layout_iter iter; 237 struct pnfs_osd_xdr_decode_layout_iter iter;
319 struct pnfs_osd_layout layout; 238 struct pnfs_osd_layout layout;
320 struct pnfs_osd_object_cred *cur_comp, src_comp; 239 struct pnfs_osd_object_cred src_comp;
321 struct caps_buffers *caps_p; 240 unsigned cur_comp;
322 int err; 241 int err;
323 242
324 err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); 243 err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
325 if (unlikely(err)) 244 if (unlikely(err))
326 return err; 245 return err;
327 246
328 err = _verify_data_map(&layout); 247 err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg);
329 if (unlikely(err)) 248 if (unlikely(err))
330 return err; 249 return err;
331 250
332 objio_seg = kzalloc(sizeof(*objio_seg) + 251 objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit;
333 sizeof(objio_seg->ods[0]) * layout.olo_num_comps + 252 objio_seg->layout.group_width = layout.olo_map.odm_group_width;
334 sizeof(*objio_seg->comps) * layout.olo_num_comps + 253 objio_seg->layout.group_depth = layout.olo_map.odm_group_depth;
335 sizeof(struct caps_buffers) * layout.olo_num_comps, 254 objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
336 gfp_flags); 255 objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm;
337 if (!objio_seg)
338 return -ENOMEM;
339 256
340 objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps); 257 err = ore_verify_layout(layout.olo_map.odm_num_comps,
341 cur_comp = objio_seg->comps; 258 &objio_seg->layout);
342 caps_p = (void *)(cur_comp + layout.olo_num_comps);
343 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
344 copy_single_comp(cur_comp++, &src_comp, caps_p++);
345 if (unlikely(err)) 259 if (unlikely(err))
346 goto err; 260 goto err;
347 261
348 objio_seg->num_comps = layout.olo_num_comps; 262 objio_seg->oc.first_dev = layout.olo_comps_index;
349 objio_seg->comps_index = layout.olo_comps_index; 263 cur_comp = 0;
350 err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags); 264 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
351 if (err) 265 copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
352 goto err; 266 err = objio_devices_lookup(pnfslay, objio_seg, cur_comp,
353 267 &src_comp.oc_object_id.oid_device_id,
354 objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; 268 gfp_flags);
355 objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit; 269 if (err)
356 if (layout.olo_map.odm_group_width) { 270 goto err;
357 objio_seg->group_width = layout.olo_map.odm_group_width; 271 ++cur_comp;
358 objio_seg->group_depth = layout.olo_map.odm_group_depth;
359 objio_seg->group_count = layout.olo_map.odm_num_comps /
360 objio_seg->mirrors_p1 /
361 objio_seg->group_width;
362 } else {
363 objio_seg->group_width = layout.olo_map.odm_num_comps /
364 objio_seg->mirrors_p1;
365 objio_seg->group_depth = -1;
366 objio_seg->group_count = 1;
367 } 272 }
368 273 /* pnfs_osd_xdr_decode_layout_comp returns false on error */
369 /* Cache this calculation it will hit for every page */ 274 if (unlikely(err))
370 objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - 275 goto err;
371 objio_seg->stripe_unit) *
372 objio_seg->group_width;
373 276
374 *outp = &objio_seg->lseg; 277 *outp = &objio_seg->lseg;
375 return 0; 278 return 0;
@@ -386,43 +289,63 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg)
386 int i; 289 int i;
387 struct objio_segment *objio_seg = OBJIO_LSEG(lseg); 290 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
388 291
389 for (i = 0; i < objio_seg->num_comps; i++) { 292 for (i = 0; i < objio_seg->oc.numdevs; i++) {
390 if (!objio_seg->ods[i]) 293 struct ore_dev *od = objio_seg->oc.ods[i];
294 struct objio_dev_ent *ode;
295
296 if (!od)
391 break; 297 break;
392 nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node); 298 ode = container_of(od, typeof(*ode), od);
299 nfs4_put_deviceid_node(&ode->id_node);
393 } 300 }
394 kfree(objio_seg); 301 kfree(objio_seg);
395} 302}
396 303
397int objio_alloc_io_state(struct pnfs_layout_segment *lseg, 304static int
398 struct objlayout_io_state **outp, 305objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading,
399 gfp_t gfp_flags) 306 struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase,
307 loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags,
308 struct objio_state **outp)
400{ 309{
401 struct objio_segment *objio_seg = OBJIO_LSEG(lseg); 310 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
402 struct objio_state *ios; 311 struct ore_io_state *ios;
403 const unsigned first_size = sizeof(*ios) + 312 int ret;
404 objio_seg->num_comps * sizeof(ios->per_dev[0]); 313 struct __alloc_objio_state {
405 const unsigned sec_size = objio_seg->num_comps * 314 struct objio_state objios;
406 sizeof(ios->ol_state.ioerrs[0]); 315 struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs];
407 316 } *aos;
408 ios = kzalloc(first_size + sec_size, gfp_flags); 317
409 if (unlikely(!ios)) 318 aos = kzalloc(sizeof(*aos), gfp_flags);
319 if (unlikely(!aos))
410 return -ENOMEM; 320 return -ENOMEM;
411 321
412 ios->layout = objio_seg; 322 objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs,
413 ios->ol_state.ioerrs = ((void *)ios) + first_size; 323 aos->ioerrs, rpcdata, pnfs_layout_type);
414 ios->ol_state.num_comps = objio_seg->num_comps;
415 324
416 *outp = &ios->ol_state; 325 ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading,
326 offset, count, &ios);
327 if (unlikely(ret)) {
328 kfree(aos);
329 return ret;
330 }
331
332 ios->pages = pages;
333 ios->pgbase = pgbase;
334 ios->private = aos;
335 BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT);
336
337 aos->objios.sync = 0;
338 aos->objios.ios = ios;
339 *outp = &aos->objios;
417 return 0; 340 return 0;
418} 341}
419 342
420void objio_free_io_state(struct objlayout_io_state *ol_state) 343void objio_free_result(struct objlayout_io_res *oir)
421{ 344{
422 struct objio_state *ios = container_of(ol_state, struct objio_state, 345 struct objio_state *objios = container_of(oir, struct objio_state, oir);
423 ol_state);
424 346
425 kfree(ios); 347 ore_put_io_state(objios->ios);
348 kfree(objios);
426} 349}
427 350
428enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) 351enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
@@ -455,539 +378,152 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
455 } 378 }
456} 379}
457 380
458static void _clear_bio(struct bio *bio) 381static void __on_dev_error(struct ore_io_state *ios,
382 struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep,
383 u64 dev_offset, u64 dev_len)
459{ 384{
460 struct bio_vec *bv; 385 struct objio_state *objios = ios->private;
461 unsigned i; 386 struct pnfs_osd_objid pooid;
462 387 struct objio_dev_ent *ode = container_of(od, typeof(*ode), od);
463 __bio_for_each_segment(bv, bio, i, 0) { 388 /* FIXME: what to do with more-then-one-group layouts. We need to
464 unsigned this_count = bv->bv_len; 389 * translate from ore_io_state index to oc->comps index
465 390 */
466 if (likely(PAGE_SIZE == this_count)) 391 unsigned comp = dev_index;
467 clear_highpage(bv->bv_page);
468 else
469 zero_user(bv->bv_page, bv->bv_offset, this_count);
470 }
471}
472
473static int _io_check(struct objio_state *ios, bool is_write)
474{
475 enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
476 int lin_ret = 0;
477 int i;
478
479 for (i = 0; i < ios->numdevs; i++) {
480 struct osd_sense_info osi;
481 struct osd_request *or = ios->per_dev[i].or;
482 int ret;
483
484 if (!or)
485 continue;
486 392
487 ret = osd_req_decode_sense(or, &osi); 393 pooid.oid_device_id = ode->id_node.deviceid;
488 if (likely(!ret)) 394 pooid.oid_partition_id = ios->oc->comps[comp].obj.partition;
489 continue; 395 pooid.oid_object_id = ios->oc->comps[comp].obj.id;
490 396
491 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { 397 objlayout_io_set_result(&objios->oir, comp,
492 /* start read offset passed endof file */ 398 &pooid, osd_pri_2_pnfs_err(oep),
493 BUG_ON(is_write); 399 dev_offset, dev_len, !ios->reading);
494 _clear_bio(ios->per_dev[i].bio);
495 dprintk("%s: start read offset passed end of file "
496 "offset=0x%llx, length=0x%lx\n", __func__,
497 _LLU(ios->per_dev[i].offset),
498 ios->per_dev[i].length);
499
500 continue; /* we recovered */
501 }
502 objlayout_io_set_result(&ios->ol_state, i,
503 &ios->layout->comps[i].oc_object_id,
504 osd_pri_2_pnfs_err(osi.osd_err_pri),
505 ios->per_dev[i].offset,
506 ios->per_dev[i].length,
507 is_write);
508
509 if (osi.osd_err_pri >= oep) {
510 oep = osi.osd_err_pri;
511 lin_ret = ret;
512 }
513 }
514
515 return lin_ret;
516}
517
518/*
519 * Common IO state helpers.
520 */
521static void _io_free(struct objio_state *ios)
522{
523 unsigned i;
524
525 for (i = 0; i < ios->numdevs; i++) {
526 struct _objio_per_comp *per_dev = &ios->per_dev[i];
527
528 if (per_dev->or) {
529 osd_end_request(per_dev->or);
530 per_dev->or = NULL;
531 }
532
533 if (per_dev->bio) {
534 bio_put(per_dev->bio);
535 per_dev->bio = NULL;
536 }
537 }
538}
539
540struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
541{
542 unsigned min_dev = ios->layout->comps_index;
543 unsigned max_dev = min_dev + ios->layout->num_comps;
544
545 BUG_ON(dev < min_dev || max_dev <= dev);
546 return ios->layout->ods[dev - min_dev]->od;
547}
548
549struct _striping_info {
550 u64 obj_offset;
551 u64 group_length;
552 unsigned dev;
553 unsigned unit_off;
554};
555
556static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
557 struct _striping_info *si)
558{
559 u32 stripe_unit = ios->layout->stripe_unit;
560 u32 group_width = ios->layout->group_width;
561 u64 group_depth = ios->layout->group_depth;
562 u32 U = stripe_unit * group_width;
563
564 u64 T = U * group_depth;
565 u64 S = T * ios->layout->group_count;
566 u64 M = div64_u64(file_offset, S);
567
568 /*
569 G = (L - (M * S)) / T
570 H = (L - (M * S)) % T
571 */
572 u64 LmodU = file_offset - M * S;
573 u32 G = div64_u64(LmodU, T);
574 u64 H = LmodU - G * T;
575
576 u32 N = div_u64(H, U);
577
578 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
579 si->obj_offset = si->unit_off + (N * stripe_unit) +
580 (M * group_depth * stripe_unit);
581
582 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
583 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
584 si->dev *= ios->layout->mirrors_p1;
585
586 si->group_length = T - H;
587}
588
589static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
590 unsigned pgbase, struct _objio_per_comp *per_dev, int len,
591 gfp_t gfp_flags)
592{
593 unsigned pg = *cur_pg;
594 int cur_len = len;
595 struct request_queue *q =
596 osd_request_queue(_io_od(ios, per_dev->dev));
597
598 if (per_dev->bio == NULL) {
599 unsigned pages_in_stripe = ios->layout->group_width *
600 (ios->layout->stripe_unit / PAGE_SIZE);
601 unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
602 ios->layout->group_width;
603
604 if (BIO_MAX_PAGES_KMALLOC < bio_size)
605 bio_size = BIO_MAX_PAGES_KMALLOC;
606
607 per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
608 if (unlikely(!per_dev->bio)) {
609 dprintk("Faild to allocate BIO size=%u\n", bio_size);
610 return -ENOMEM;
611 }
612 }
613
614 while (cur_len > 0) {
615 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
616 unsigned added_len;
617
618 BUG_ON(ios->ol_state.nr_pages <= pg);
619 cur_len -= pglen;
620
621 added_len = bio_add_pc_page(q, per_dev->bio,
622 ios->ol_state.pages[pg], pglen, pgbase);
623 if (unlikely(pglen != added_len))
624 return -ENOMEM;
625 pgbase = 0;
626 ++pg;
627 }
628 BUG_ON(cur_len);
629
630 per_dev->length += len;
631 *cur_pg = pg;
632 return 0;
633}
634
635static int _prepare_one_group(struct objio_state *ios, u64 length,
636 struct _striping_info *si, unsigned *last_pg,
637 gfp_t gfp_flags)
638{
639 unsigned stripe_unit = ios->layout->stripe_unit;
640 unsigned mirrors_p1 = ios->layout->mirrors_p1;
641 unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
642 unsigned dev = si->dev;
643 unsigned first_dev = dev - (dev % devs_in_group);
644 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
645 unsigned cur_pg = *last_pg;
646 int ret = 0;
647
648 while (length) {
649 struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev];
650 unsigned cur_len, page_off = 0;
651
652 if (!per_dev->length) {
653 per_dev->dev = dev;
654 if (dev < si->dev) {
655 per_dev->offset = si->obj_offset + stripe_unit -
656 si->unit_off;
657 cur_len = stripe_unit;
658 } else if (dev == si->dev) {
659 per_dev->offset = si->obj_offset;
660 cur_len = stripe_unit - si->unit_off;
661 page_off = si->unit_off & ~PAGE_MASK;
662 BUG_ON(page_off &&
663 (page_off != ios->ol_state.pgbase));
664 } else { /* dev > si->dev */
665 per_dev->offset = si->obj_offset - si->unit_off;
666 cur_len = stripe_unit;
667 }
668
669 if (max_comp < dev - first_dev)
670 max_comp = dev - first_dev;
671 } else {
672 cur_len = stripe_unit;
673 }
674 if (cur_len >= length)
675 cur_len = length;
676
677 ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
678 cur_len, gfp_flags);
679 if (unlikely(ret))
680 goto out;
681
682 dev += mirrors_p1;
683 dev = (dev % devs_in_group) + first_dev;
684
685 length -= cur_len;
686 ios->length += cur_len;
687 }
688out:
689 ios->numdevs = max_comp + mirrors_p1;
690 *last_pg = cur_pg;
691 return ret;
692}
693
694static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
695{
696 u64 length = ios->ol_state.count;
697 u64 offset = ios->ol_state.offset;
698 struct _striping_info si;
699 unsigned last_pg = 0;
700 int ret = 0;
701
702 while (length) {
703 _calc_stripe_info(ios, offset, &si);
704
705 if (length < si.group_length)
706 si.group_length = length;
707
708 ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
709 if (unlikely(ret))
710 goto out;
711
712 offset += si.group_length;
713 length -= si.group_length;
714 }
715
716out:
717 if (!ios->length)
718 return ret;
719
720 return 0;
721}
722
723static ssize_t _sync_done(struct objio_state *ios)
724{
725 struct completion *waiting = ios->private;
726
727 complete(waiting);
728 return 0;
729}
730
731static void _last_io(struct kref *kref)
732{
733 struct objio_state *ios = container_of(kref, struct objio_state, kref);
734
735 ios->done(ios);
736}
737
738static void _done_io(struct osd_request *or, void *p)
739{
740 struct objio_state *ios = p;
741
742 kref_put(&ios->kref, _last_io);
743}
744
745static ssize_t _io_exec(struct objio_state *ios)
746{
747 DECLARE_COMPLETION_ONSTACK(wait);
748 ssize_t status = 0; /* sync status */
749 unsigned i;
750 objio_done_fn saved_done_fn = ios->done;
751 bool sync = ios->ol_state.sync;
752
753 if (sync) {
754 ios->done = _sync_done;
755 ios->private = &wait;
756 }
757
758 kref_init(&ios->kref);
759
760 for (i = 0; i < ios->numdevs; i++) {
761 struct osd_request *or = ios->per_dev[i].or;
762
763 if (!or)
764 continue;
765
766 kref_get(&ios->kref);
767 osd_execute_request_async(or, _done_io, ios);
768 }
769
770 kref_put(&ios->kref, _last_io);
771
772 if (sync) {
773 wait_for_completion(&wait);
774 status = saved_done_fn(ios);
775 }
776
777 return status;
778} 400}
779 401
780/* 402/*
781 * read 403 * read
782 */ 404 */
783static ssize_t _read_done(struct objio_state *ios) 405static void _read_done(struct ore_io_state *ios, void *private)
784{ 406{
407 struct objio_state *objios = private;
785 ssize_t status; 408 ssize_t status;
786 int ret = _io_check(ios, false); 409 int ret = ore_check_io(ios, &__on_dev_error);
787 410
788 _io_free(ios); 411 /* FIXME: _io_free(ios) can we dealocate the libosd resources; */
789 412
790 if (likely(!ret)) 413 if (likely(!ret))
791 status = ios->length; 414 status = ios->length;
792 else 415 else
793 status = ret; 416 status = ret;
794 417
795 objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); 418 objlayout_read_done(&objios->oir, status, objios->sync);
796 return status;
797} 419}
798 420
799static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) 421int objio_read_pagelist(struct nfs_read_data *rdata)
800{ 422{
801 struct osd_request *or = NULL; 423 struct objio_state *objios;
802 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
803 unsigned dev = per_dev->dev;
804 struct pnfs_osd_object_cred *cred =
805 &ios->layout->comps[cur_comp];
806 struct osd_obj_id obj = {
807 .partition = cred->oc_object_id.oid_partition_id,
808 .id = cred->oc_object_id.oid_object_id,
809 };
810 int ret; 424 int ret;
811 425
812 or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); 426 ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true,
813 if (unlikely(!or)) { 427 rdata->lseg, rdata->args.pages, rdata->args.pgbase,
814 ret = -ENOMEM; 428 rdata->args.offset, rdata->args.count, rdata,
815 goto err; 429 GFP_KERNEL, &objios);
816 }
817 per_dev->or = or;
818
819 osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
820
821 ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
822 if (ret) {
823 dprintk("%s: Faild to osd_finalize_request() => %d\n",
824 __func__, ret);
825 goto err;
826 }
827
828 dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
829 __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
830 per_dev->length);
831
832err:
833 return ret;
834}
835
836static ssize_t _read_exec(struct objio_state *ios)
837{
838 unsigned i;
839 int ret;
840
841 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
842 if (!ios->per_dev[i].length)
843 continue;
844 ret = _read_mirrors(ios, i);
845 if (unlikely(ret))
846 goto err;
847 }
848
849 ios->done = _read_done;
850 return _io_exec(ios); /* In sync mode exec returns the io status */
851
852err:
853 _io_free(ios);
854 return ret;
855}
856
857ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
858{
859 struct objio_state *ios = container_of(ol_state, struct objio_state,
860 ol_state);
861 int ret;
862
863 ret = _io_rw_pagelist(ios, GFP_KERNEL);
864 if (unlikely(ret)) 430 if (unlikely(ret))
865 return ret; 431 return ret;
866 432
867 return _read_exec(ios); 433 objios->ios->done = _read_done;
434 dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
435 rdata->args.offset, rdata->args.count);
436 return ore_read(objios->ios);
868} 437}
869 438
870/* 439/*
871 * write 440 * write
872 */ 441 */
873static ssize_t _write_done(struct objio_state *ios) 442static void _write_done(struct ore_io_state *ios, void *private)
874{ 443{
444 struct objio_state *objios = private;
875 ssize_t status; 445 ssize_t status;
876 int ret = _io_check(ios, true); 446 int ret = ore_check_io(ios, &__on_dev_error);
877 447
878 _io_free(ios); 448 /* FIXME: _io_free(ios) can we dealocate the libosd resources; */
879 449
880 if (likely(!ret)) { 450 if (likely(!ret)) {
881 /* FIXME: should be based on the OSD's persistence model 451 /* FIXME: should be based on the OSD's persistence model
882 * See OSD2r05 Section 4.13 Data persistence model */ 452 * See OSD2r05 Section 4.13 Data persistence model */
883 ios->ol_state.committed = NFS_FILE_SYNC; 453 objios->oir.committed = NFS_FILE_SYNC;
884 status = ios->length; 454 status = ios->length;
885 } else { 455 } else {
886 status = ret; 456 status = ret;
887 } 457 }
888 458
889 objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); 459 objlayout_write_done(&objios->oir, status, objios->sync);
890 return status;
891} 460}
892 461
893static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) 462static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
894{ 463{
895 struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; 464 struct objio_state *objios = priv;
896 unsigned dev = ios->per_dev[cur_comp].dev; 465 struct nfs_write_data *wdata = objios->oir.rpcdata;
897 unsigned last_comp = cur_comp + ios->layout->mirrors_p1; 466 pgoff_t index = offset / PAGE_SIZE;
898 int ret; 467 struct page *page = find_get_page(wdata->inode->i_mapping, index);
899
900 for (; cur_comp < last_comp; ++cur_comp, ++dev) {
901 struct osd_request *or = NULL;
902 struct pnfs_osd_object_cred *cred =
903 &ios->layout->comps[cur_comp];
904 struct osd_obj_id obj = {
905 .partition = cred->oc_object_id.oid_partition_id,
906 .id = cred->oc_object_id.oid_object_id,
907 };
908 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
909 struct bio *bio;
910
911 or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
912 if (unlikely(!or)) {
913 ret = -ENOMEM;
914 goto err;
915 }
916 per_dev->or = or;
917
918 if (per_dev != master_dev) {
919 bio = bio_kmalloc(GFP_NOFS,
920 master_dev->bio->bi_max_vecs);
921 if (unlikely(!bio)) {
922 dprintk("Faild to allocate BIO size=%u\n",
923 master_dev->bio->bi_max_vecs);
924 ret = -ENOMEM;
925 goto err;
926 }
927
928 __bio_clone(bio, master_dev->bio);
929 bio->bi_bdev = NULL;
930 bio->bi_next = NULL;
931 per_dev->bio = bio;
932 per_dev->dev = dev;
933 per_dev->length = master_dev->length;
934 per_dev->offset = master_dev->offset;
935 } else {
936 bio = master_dev->bio;
937 bio->bi_rw |= REQ_WRITE;
938 }
939
940 osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
941 468
942 ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); 469 if (!page) {
943 if (ret) { 470 page = find_or_create_page(wdata->inode->i_mapping,
944 dprintk("%s: Faild to osd_finalize_request() => %d\n", 471 index, GFP_NOFS);
945 __func__, ret); 472 if (unlikely(!page)) {
946 goto err; 473 dprintk("%s: grab_cache_page Failed index=0x%lx\n",
474 __func__, index);
475 return NULL;
947 } 476 }
948 477 unlock_page(page);
949 dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
950 __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
951 per_dev->length);
952 } 478 }
479 if (PageDirty(page) || PageWriteback(page))
480 *uptodate = true;
481 else
482 *uptodate = PageUptodate(page);
483 dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate);
484 return page;
485}
953 486
954err: 487static void __r4w_put_page(void *priv, struct page *page)
955 return ret; 488{
489 dprintk("%s: index=0x%lx\n", __func__, page->index);
490 page_cache_release(page);
491 return;
956} 492}
957 493
958static ssize_t _write_exec(struct objio_state *ios) 494static const struct _ore_r4w_op _r4w_op = {
495 .get_page = &__r4w_get_page,
496 .put_page = &__r4w_put_page,
497};
498
499int objio_write_pagelist(struct nfs_write_data *wdata, int how)
959{ 500{
960 unsigned i; 501 struct objio_state *objios;
961 int ret; 502 int ret;
962 503
963 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { 504 ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, false,
964 if (!ios->per_dev[i].length) 505 wdata->lseg, wdata->args.pages, wdata->args.pgbase,
965 continue; 506 wdata->args.offset, wdata->args.count, wdata, GFP_NOFS,
966 ret = _write_mirrors(ios, i); 507 &objios);
967 if (unlikely(ret)) 508 if (unlikely(ret))
968 goto err; 509 return ret;
969 }
970
971 ios->done = _write_done;
972 return _io_exec(ios); /* In sync mode exec returns the io->status */
973 510
974err: 511 objios->sync = 0 != (how & FLUSH_SYNC);
975 _io_free(ios); 512 objios->ios->r4w = &_r4w_op;
976 return ret;
977}
978 513
979ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) 514 if (!objios->sync)
980{ 515 objios->ios->done = _write_done;
981 struct objio_state *ios = container_of(ol_state, struct objio_state,
982 ol_state);
983 int ret;
984 516
985 /* TODO: ios->stable = stable; */ 517 dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
986 ret = _io_rw_pagelist(ios, GFP_NOFS); 518 wdata->args.offset, wdata->args.count);
519 ret = ore_write(objios->ios);
987 if (unlikely(ret)) 520 if (unlikely(ret))
988 return ret; 521 return ret;
989 522
990 return _write_exec(ios); 523 if (objios->sync)
524 _write_done(objios->ios, objios);
525
526 return 0;
991} 527}
992 528
993static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, 529static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
@@ -997,7 +533,7 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
997 return false; 533 return false;
998 534
999 return pgio->pg_count + req->wb_bytes <= 535 return pgio->pg_count + req->wb_bytes <=
1000 OBJIO_LSEG(pgio->pg_lseg)->max_io_size; 536 OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
1001} 537}
1002 538
1003static const struct nfs_pageio_ops objio_pg_read_ops = { 539static const struct nfs_pageio_ops objio_pg_read_ops = {
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 1d06f8e2ade..72074e3a04f 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -156,77 +156,39 @@ last_byte_offset(u64 start, u64 len)
156 return end > start ? end - 1 : NFS4_MAX_UINT64; 156 return end > start ? end - 1 : NFS4_MAX_UINT64;
157} 157}
158 158
159static struct objlayout_io_state * 159void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
160objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, 160 struct page ***p_pages, unsigned *p_pgbase,
161 struct page **pages, 161 u64 offset, unsigned long count)
162 unsigned pgbase,
163 loff_t offset,
164 size_t count,
165 struct pnfs_layout_segment *lseg,
166 void *rpcdata,
167 gfp_t gfp_flags)
168{ 162{
169 struct objlayout_io_state *state;
170 u64 lseg_end_offset; 163 u64 lseg_end_offset;
171 164
172 dprintk("%s: allocating io_state\n", __func__);
173 if (objio_alloc_io_state(lseg, &state, gfp_flags))
174 return NULL;
175
176 BUG_ON(offset < lseg->pls_range.offset); 165 BUG_ON(offset < lseg->pls_range.offset);
177 lseg_end_offset = end_offset(lseg->pls_range.offset, 166 lseg_end_offset = end_offset(lseg->pls_range.offset,
178 lseg->pls_range.length); 167 lseg->pls_range.length);
179 BUG_ON(offset >= lseg_end_offset); 168 BUG_ON(offset >= lseg_end_offset);
180 if (offset + count > lseg_end_offset) { 169 WARN_ON(offset + count > lseg_end_offset);
181 count = lseg->pls_range.length -
182 (offset - lseg->pls_range.offset);
183 dprintk("%s: truncated count %Zd\n", __func__, count);
184 }
185 170
186 if (pgbase > PAGE_SIZE) { 171 if (*p_pgbase > PAGE_SIZE) {
187 pages += pgbase >> PAGE_SHIFT; 172 dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase);
188 pgbase &= ~PAGE_MASK; 173 *p_pages += *p_pgbase >> PAGE_SHIFT;
174 *p_pgbase &= ~PAGE_MASK;
189 } 175 }
190
191 INIT_LIST_HEAD(&state->err_list);
192 state->lseg = lseg;
193 state->rpcdata = rpcdata;
194 state->pages = pages;
195 state->pgbase = pgbase;
196 state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
197 state->offset = offset;
198 state->count = count;
199 state->sync = 0;
200
201 return state;
202}
203
204static void
205objlayout_free_io_state(struct objlayout_io_state *state)
206{
207 dprintk("%s: freeing io_state\n", __func__);
208 if (unlikely(!state))
209 return;
210
211 objio_free_io_state(state);
212} 176}
213 177
214/* 178/*
215 * I/O done common code 179 * I/O done common code
216 */ 180 */
217static void 181static void
218objlayout_iodone(struct objlayout_io_state *state) 182objlayout_iodone(struct objlayout_io_res *oir)
219{ 183{
220 dprintk("%s: state %p status\n", __func__, state); 184 if (likely(oir->status >= 0)) {
221 185 objio_free_result(oir);
222 if (likely(state->status >= 0)) {
223 objlayout_free_io_state(state);
224 } else { 186 } else {
225 struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); 187 struct objlayout *objlay = oir->objlay;
226 188
227 spin_lock(&objlay->lock); 189 spin_lock(&objlay->lock);
228 objlay->delta_space_valid = OBJ_DSU_INVALID; 190 objlay->delta_space_valid = OBJ_DSU_INVALID;
229 list_add(&objlay->err_list, &state->err_list); 191 list_add(&objlay->err_list, &oir->err_list);
230 spin_unlock(&objlay->lock); 192 spin_unlock(&objlay->lock);
231 } 193 }
232} 194}
@@ -238,13 +200,13 @@ objlayout_iodone(struct objlayout_io_state *state)
238 * the error for later reporting at layout-return. 200 * the error for later reporting at layout-return.
239 */ 201 */
240void 202void
241objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, 203objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
242 struct pnfs_osd_objid *pooid, int osd_error, 204 struct pnfs_osd_objid *pooid, int osd_error,
243 u64 offset, u64 length, bool is_write) 205 u64 offset, u64 length, bool is_write)
244{ 206{
245 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; 207 struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index];
246 208
247 BUG_ON(index >= state->num_comps); 209 BUG_ON(index >= oir->num_comps);
248 if (osd_error) { 210 if (osd_error) {
249 ioerr->oer_component = *pooid; 211 ioerr->oer_component = *pooid;
250 ioerr->oer_comp_offset = offset; 212 ioerr->oer_comp_offset = offset;
@@ -285,21 +247,18 @@ static void _rpc_read_complete(struct work_struct *work)
285} 247}
286 248
287void 249void
288objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) 250objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
289{ 251{
290 int eof = state->eof; 252 struct nfs_read_data *rdata = oir->rpcdata;
291 struct nfs_read_data *rdata;
292 253
293 state->status = status; 254 oir->status = rdata->task.tk_status = status;
294 dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof); 255 if (status >= 0)
295 rdata = state->rpcdata;
296 rdata->task.tk_status = status;
297 if (status >= 0) {
298 rdata->res.count = status; 256 rdata->res.count = status;
299 rdata->res.eof = eof; 257 objlayout_iodone(oir);
300 } 258 /* must not use oir after this point */
301 objlayout_iodone(state); 259
302 /* must not use state after this point */ 260 dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
261 status, rdata->res.eof, sync);
303 262
304 if (sync) 263 if (sync)
305 pnfs_ld_read_done(rdata); 264 pnfs_ld_read_done(rdata);
@@ -317,40 +276,36 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
317{ 276{
318 loff_t offset = rdata->args.offset; 277 loff_t offset = rdata->args.offset;
319 size_t count = rdata->args.count; 278 size_t count = rdata->args.count;
320 struct objlayout_io_state *state; 279 int err;
321 ssize_t status = 0;
322 loff_t eof; 280 loff_t eof;
323 281
324 dprintk("%s: Begin inode %p offset %llu count %d\n",
325 __func__, rdata->inode, offset, (int)count);
326
327 eof = i_size_read(rdata->inode); 282 eof = i_size_read(rdata->inode);
328 if (unlikely(offset + count > eof)) { 283 if (unlikely(offset + count > eof)) {
329 if (offset >= eof) { 284 if (offset >= eof) {
330 status = 0; 285 err = 0;
331 rdata->res.count = 0; 286 rdata->res.count = 0;
332 rdata->res.eof = 1; 287 rdata->res.eof = 1;
288 /*FIXME: do we need to call pnfs_ld_read_done() */
333 goto out; 289 goto out;
334 } 290 }
335 count = eof - offset; 291 count = eof - offset;
336 } 292 }
337 293
338 state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, 294 rdata->res.eof = (offset + count) >= eof;
339 rdata->args.pages, rdata->args.pgbase, 295 _fix_verify_io_params(rdata->lseg, &rdata->args.pages,
340 offset, count, 296 &rdata->args.pgbase,
341 rdata->lseg, rdata, 297 rdata->args.offset, rdata->args.count);
342 GFP_KERNEL);
343 if (unlikely(!state)) {
344 status = -ENOMEM;
345 goto out;
346 }
347 298
348 state->eof = state->offset + state->count >= eof; 299 dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
300 __func__, rdata->inode->i_ino, offset, count, rdata->res.eof);
349 301
350 status = objio_read_pagelist(state); 302 err = objio_read_pagelist(rdata);
351 out: 303 out:
352 dprintk("%s: Return status %Zd\n", __func__, status); 304 if (unlikely(err)) {
353 rdata->pnfs_error = status; 305 rdata->pnfs_error = err;
306 dprintk("%s: Returned Error %d\n", __func__, err);
307 return PNFS_NOT_ATTEMPTED;
308 }
354 return PNFS_ATTEMPTED; 309 return PNFS_ATTEMPTED;
355} 310}
356 311
@@ -371,26 +326,20 @@ static void _rpc_write_complete(struct work_struct *work)
371} 326}
372 327
373void 328void
374objlayout_write_done(struct objlayout_io_state *state, ssize_t status, 329objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
375 bool sync)
376{ 330{
377 struct nfs_write_data *wdata; 331 struct nfs_write_data *wdata = oir->rpcdata;
378 332
379 dprintk("%s: Begin\n", __func__); 333 oir->status = wdata->task.tk_status = status;
380 wdata = state->rpcdata;
381 state->status = status;
382 wdata->task.tk_status = status;
383 if (status >= 0) { 334 if (status >= 0) {
384 wdata->res.count = status; 335 wdata->res.count = status;
385 wdata->verf.committed = state->committed; 336 wdata->verf.committed = oir->committed;
386 dprintk("%s: Return status %d committed %d\n", 337 }
387 __func__, wdata->task.tk_status, 338 objlayout_iodone(oir);
388 wdata->verf.committed); 339 /* must not use oir after this point */
389 } else 340
390 dprintk("%s: Return status %d\n", 341 dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
391 __func__, wdata->task.tk_status); 342 status, wdata->verf.committed, sync);
392 objlayout_iodone(state);
393 /* must not use state after this point */
394 343
395 if (sync) 344 if (sync)
396 pnfs_ld_write_done(wdata); 345 pnfs_ld_write_done(wdata);
@@ -407,30 +356,18 @@ enum pnfs_try_status
407objlayout_write_pagelist(struct nfs_write_data *wdata, 356objlayout_write_pagelist(struct nfs_write_data *wdata,
408 int how) 357 int how)
409{ 358{
410 struct objlayout_io_state *state; 359 int err;
411 ssize_t status;
412
413 dprintk("%s: Begin inode %p offset %llu count %u\n",
414 __func__, wdata->inode, wdata->args.offset, wdata->args.count);
415
416 state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
417 wdata->args.pages,
418 wdata->args.pgbase,
419 wdata->args.offset,
420 wdata->args.count,
421 wdata->lseg, wdata,
422 GFP_NOFS);
423 if (unlikely(!state)) {
424 status = -ENOMEM;
425 goto out;
426 }
427 360
428 state->sync = how & FLUSH_SYNC; 361 _fix_verify_io_params(wdata->lseg, &wdata->args.pages,
362 &wdata->args.pgbase,
363 wdata->args.offset, wdata->args.count);
429 364
430 status = objio_write_pagelist(state, how & FLUSH_STABLE); 365 err = objio_write_pagelist(wdata, how);
431 out: 366 if (unlikely(err)) {
432 dprintk("%s: Return status %Zd\n", __func__, status); 367 wdata->pnfs_error = err;
433 wdata->pnfs_error = status; 368 dprintk("%s: Returned Error %d\n", __func__, err);
369 return PNFS_NOT_ATTEMPTED;
370 }
434 return PNFS_ATTEMPTED; 371 return PNFS_ATTEMPTED;
435} 372}
436 373
@@ -537,14 +474,14 @@ merge_ioerr(struct pnfs_osd_ioerr *dest_err,
537static void 474static void
538encode_accumulated_error(struct objlayout *objlay, __be32 *p) 475encode_accumulated_error(struct objlayout *objlay, __be32 *p)
539{ 476{
540 struct objlayout_io_state *state, *tmp; 477 struct objlayout_io_res *oir, *tmp;
541 struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; 478 struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
542 479
543 list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { 480 list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
544 unsigned i; 481 unsigned i;
545 482
546 for (i = 0; i < state->num_comps; i++) { 483 for (i = 0; i < oir->num_comps; i++) {
547 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; 484 struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
548 485
549 if (!ioerr->oer_errno) 486 if (!ioerr->oer_errno)
550 continue; 487 continue;
@@ -563,8 +500,8 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p)
563 500
564 merge_ioerr(&accumulated_err, ioerr); 501 merge_ioerr(&accumulated_err, ioerr);
565 } 502 }
566 list_del(&state->err_list); 503 list_del(&oir->err_list);
567 objlayout_free_io_state(state); 504 objio_free_result(oir);
568 } 505 }
569 506
570 pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); 507 pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
@@ -576,7 +513,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
576 const struct nfs4_layoutreturn_args *args) 513 const struct nfs4_layoutreturn_args *args)
577{ 514{
578 struct objlayout *objlay = OBJLAYOUT(pnfslay); 515 struct objlayout *objlay = OBJLAYOUT(pnfslay);
579 struct objlayout_io_state *state, *tmp; 516 struct objlayout_io_res *oir, *tmp;
580 __be32 *start; 517 __be32 *start;
581 518
582 dprintk("%s: Begin\n", __func__); 519 dprintk("%s: Begin\n", __func__);
@@ -585,13 +522,13 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
585 522
586 spin_lock(&objlay->lock); 523 spin_lock(&objlay->lock);
587 524
588 list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { 525 list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
589 __be32 *last_xdr = NULL, *p; 526 __be32 *last_xdr = NULL, *p;
590 unsigned i; 527 unsigned i;
591 int res = 0; 528 int res = 0;
592 529
593 for (i = 0; i < state->num_comps; i++) { 530 for (i = 0; i < oir->num_comps; i++) {
594 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; 531 struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
595 532
596 if (!ioerr->oer_errno) 533 if (!ioerr->oer_errno)
597 continue; 534 continue;
@@ -615,7 +552,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
615 } 552 }
616 553
617 last_xdr = p; 554 last_xdr = p;
618 pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]); 555 pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]);
619 } 556 }
620 557
621 /* TODO: use xdr_write_pages */ 558 /* TODO: use xdr_write_pages */
@@ -631,8 +568,8 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
631 encode_accumulated_error(objlay, last_xdr); 568 encode_accumulated_error(objlay, last_xdr);
632 goto loop_done; 569 goto loop_done;
633 } 570 }
634 list_del(&state->err_list); 571 list_del(&oir->err_list);
635 objlayout_free_io_state(state); 572 objio_free_result(oir);
636 } 573 }
637loop_done: 574loop_done:
638 spin_unlock(&objlay->lock); 575 spin_unlock(&objlay->lock);
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index a8244c8e042..8ec34727ed2 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -74,19 +74,11 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo)
74 * per-I/O operation state 74 * per-I/O operation state
75 * embedded in objects provider io_state data structure 75 * embedded in objects provider io_state data structure
76 */ 76 */
77struct objlayout_io_state { 77struct objlayout_io_res {
78 struct pnfs_layout_segment *lseg; 78 struct objlayout *objlay;
79
80 struct page **pages;
81 unsigned pgbase;
82 unsigned nr_pages;
83 unsigned long count;
84 loff_t offset;
85 bool sync;
86 79
87 void *rpcdata; 80 void *rpcdata;
88 int status; /* res */ 81 int status; /* res */
89 int eof; /* res */
90 int committed; /* res */ 82 int committed; /* res */
91 83
92 /* Error reporting (layout_return) */ 84 /* Error reporting (layout_return) */
@@ -100,6 +92,18 @@ struct objlayout_io_state {
100 struct pnfs_osd_ioerr *ioerrs; 92 struct pnfs_osd_ioerr *ioerrs;
101}; 93};
102 94
95static inline
96void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps,
97 struct pnfs_osd_ioerr *ioerrs, void *rpcdata,
98 struct pnfs_layout_hdr *pnfs_layout_type)
99{
100 oir->objlay = OBJLAYOUT(pnfs_layout_type);
101 oir->rpcdata = rpcdata;
102 INIT_LIST_HEAD(&oir->err_list);
103 oir->num_comps = num_comps;
104 oir->ioerrs = ioerrs;
105}
106
103/* 107/*
104 * Raid engine I/O API 108 * Raid engine I/O API
105 */ 109 */
@@ -110,28 +114,24 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
110 gfp_t gfp_flags); 114 gfp_t gfp_flags);
111extern void objio_free_lseg(struct pnfs_layout_segment *lseg); 115extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
112 116
113extern int objio_alloc_io_state( 117/* objio_free_result will free these @oir structs recieved from
114 struct pnfs_layout_segment *lseg, 118 * objlayout_{read,write}_done
115 struct objlayout_io_state **outp, 119 */
116 gfp_t gfp_flags); 120extern void objio_free_result(struct objlayout_io_res *oir);
117extern void objio_free_io_state(struct objlayout_io_state *state);
118 121
119extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); 122extern int objio_read_pagelist(struct nfs_read_data *rdata);
120extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, 123extern int objio_write_pagelist(struct nfs_write_data *wdata, int how);
121 bool stable);
122 124
123/* 125/*
124 * callback API 126 * callback API
125 */ 127 */
126extern void objlayout_io_set_result(struct objlayout_io_state *state, 128extern void objlayout_io_set_result(struct objlayout_io_res *oir,
127 unsigned index, struct pnfs_osd_objid *pooid, 129 unsigned index, struct pnfs_osd_objid *pooid,
128 int osd_error, u64 offset, u64 length, bool is_write); 130 int osd_error, u64 offset, u64 length, bool is_write);
129 131
130static inline void 132static inline void
131objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) 133objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used)
132{ 134{
133 struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
134
135 /* If one of the I/Os errored out and the delta_space_used was 135 /* If one of the I/Os errored out and the delta_space_used was
136 * invalid we render the complete report as invalid. Protocol mandate 136 * invalid we render the complete report as invalid. Protocol mandate
137 * the DSU be accurate or not reported. 137 * the DSU be accurate or not reported.
@@ -144,9 +144,9 @@ objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
144 spin_unlock(&objlay->lock); 144 spin_unlock(&objlay->lock);
145} 145}
146 146
147extern void objlayout_read_done(struct objlayout_io_state *state, 147extern void objlayout_read_done(struct objlayout_io_res *oir,
148 ssize_t status, bool sync); 148 ssize_t status, bool sync);
149extern void objlayout_write_done(struct objlayout_io_state *state, 149extern void objlayout_write_done(struct objlayout_io_res *oir,
150 ssize_t status, bool sync); 150 ssize_t status, bool sync);
151 151
152extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, 152extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index b60970cc7f1..5668f7c54c4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -18,6 +18,7 @@
18#include <linux/nfs_page.h> 18#include <linux/nfs_page.h>
19#include <linux/nfs_fs.h> 19#include <linux/nfs_fs.h>
20#include <linux/nfs_mount.h> 20#include <linux/nfs_mount.h>
21#include <linux/export.h>
21 22
22#include "internal.h" 23#include "internal.h"
23#include "pnfs.h" 24#include "pnfs.h"
@@ -41,7 +42,7 @@ nfs_page_free(struct nfs_page *p)
41 42
42/** 43/**
43 * nfs_create_request - Create an NFS read/write request. 44 * nfs_create_request - Create an NFS read/write request.
44 * @file: file descriptor to use 45 * @ctx: open context to use
45 * @inode: inode to which the request is attached 46 * @inode: inode to which the request is attached
46 * @page: page to write 47 * @page: page to write
47 * @offset: starting offset within the page for the write 48 * @offset: starting offset within the page for the write
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index e550e8836c3..baf73536bc0 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -29,6 +29,7 @@
29 29
30#include <linux/nfs_fs.h> 30#include <linux/nfs_fs.h>
31#include <linux/nfs_page.h> 31#include <linux/nfs_page.h>
32#include <linux/module.h>
32#include "internal.h" 33#include "internal.h"
33#include "pnfs.h" 34#include "pnfs.h"
34#include "iostat.h" 35#include "iostat.h"
@@ -1168,23 +1169,17 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
1168/* 1169/*
1169 * Called by non rpc-based layout drivers 1170 * Called by non rpc-based layout drivers
1170 */ 1171 */
1171int 1172void pnfs_ld_write_done(struct nfs_write_data *data)
1172pnfs_ld_write_done(struct nfs_write_data *data)
1173{ 1173{
1174 int status; 1174 if (likely(!data->pnfs_error)) {
1175
1176 if (!data->pnfs_error) {
1177 pnfs_set_layoutcommit(data); 1175 pnfs_set_layoutcommit(data);
1178 data->mds_ops->rpc_call_done(&data->task, data); 1176 data->mds_ops->rpc_call_done(&data->task, data);
1179 data->mds_ops->rpc_release(data); 1177 } else {
1180 return 0; 1178 put_lseg(data->lseg);
1179 data->lseg = NULL;
1180 dprintk("pnfs write error = %d\n", data->pnfs_error);
1181 } 1181 }
1182 1182 data->mds_ops->rpc_release(data);
1183 dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
1184 data->pnfs_error);
1185 status = nfs_initiate_write(data, NFS_CLIENT(data->inode),
1186 data->mds_ops, NFS_FILE_SYNC);
1187 return status ? : -EAGAIN;
1188} 1183}
1189EXPORT_SYMBOL_GPL(pnfs_ld_write_done); 1184EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
1190 1185
@@ -1268,23 +1263,17 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
1268/* 1263/*
1269 * Called by non rpc-based layout drivers 1264 * Called by non rpc-based layout drivers
1270 */ 1265 */
1271int 1266void pnfs_ld_read_done(struct nfs_read_data *data)
1272pnfs_ld_read_done(struct nfs_read_data *data)
1273{ 1267{
1274 int status; 1268 if (likely(!data->pnfs_error)) {
1275
1276 if (!data->pnfs_error) {
1277 __nfs4_read_done_cb(data); 1269 __nfs4_read_done_cb(data);
1278 data->mds_ops->rpc_call_done(&data->task, data); 1270 data->mds_ops->rpc_call_done(&data->task, data);
1279 data->mds_ops->rpc_release(data); 1271 } else {
1280 return 0; 1272 put_lseg(data->lseg);
1273 data->lseg = NULL;
1274 dprintk("pnfs write error = %d\n", data->pnfs_error);
1281 } 1275 }
1282 1276 data->mds_ops->rpc_release(data);
1283 dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
1284 data->pnfs_error);
1285 status = nfs_initiate_read(data, NFS_CLIENT(data->inode),
1286 data->mds_ops);
1287 return status ? : -EAGAIN;
1288} 1277}
1289EXPORT_SYMBOL_GPL(pnfs_ld_read_done); 1278EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
1290 1279
@@ -1381,6 +1370,18 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
1381 } 1370 }
1382} 1371}
1383 1372
1373void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
1374{
1375 if (lseg->pls_range.iomode == IOMODE_RW) {
1376 dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
1377 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
1378 } else {
1379 dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
1380 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
1381 }
1382}
1383EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
1384
1384void 1385void
1385pnfs_set_layoutcommit(struct nfs_write_data *wdata) 1386pnfs_set_layoutcommit(struct nfs_write_data *wdata)
1386{ 1387{
@@ -1443,17 +1444,31 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1443 /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ 1444 /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
1444 data = kzalloc(sizeof(*data), GFP_NOFS); 1445 data = kzalloc(sizeof(*data), GFP_NOFS);
1445 if (!data) { 1446 if (!data) {
1446 mark_inode_dirty_sync(inode);
1447 status = -ENOMEM; 1447 status = -ENOMEM;
1448 goto out; 1448 goto out;
1449 } 1449 }
1450 1450
1451 if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
1452 goto out_free;
1453
1454 if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
1455 if (!sync) {
1456 status = -EAGAIN;
1457 goto out_free;
1458 }
1459 status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
1460 nfs_wait_bit_killable, TASK_KILLABLE);
1461 if (status)
1462 goto out_free;
1463 }
1464
1451 INIT_LIST_HEAD(&data->lseg_list); 1465 INIT_LIST_HEAD(&data->lseg_list);
1452 spin_lock(&inode->i_lock); 1466 spin_lock(&inode->i_lock);
1453 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 1467 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1468 clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
1454 spin_unlock(&inode->i_lock); 1469 spin_unlock(&inode->i_lock);
1455 kfree(data); 1470 wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING);
1456 goto out; 1471 goto out_free;
1457 } 1472 }
1458 1473
1459 pnfs_list_write_lseg(inode, &data->lseg_list); 1474 pnfs_list_write_lseg(inode, &data->lseg_list);
@@ -1475,6 +1490,11 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1475 1490
1476 status = nfs4_proc_layoutcommit(data, sync); 1491 status = nfs4_proc_layoutcommit(data, sync);
1477out: 1492out:
1493 if (status)
1494 mark_inode_dirty_sync(inode);
1478 dprintk("<-- %s status %d\n", __func__, status); 1495 dprintk("<-- %s status %d\n", __func__, status);
1479 return status; 1496 return status;
1497out_free:
1498 kfree(data);
1499 goto out;
1480} 1500}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 01cbfd54f3c..1509530cb11 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -178,6 +178,7 @@ int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
178void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *); 178void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *);
179int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); 179int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
180bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); 180bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
181void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg);
181int pnfs_layout_process(struct nfs4_layoutget *lgp); 182int pnfs_layout_process(struct nfs4_layoutget *lgp);
182void pnfs_free_lseg_list(struct list_head *tmp_list); 183void pnfs_free_lseg_list(struct list_head *tmp_list);
183void pnfs_destroy_layout(struct nfs_inode *); 184void pnfs_destroy_layout(struct nfs_inode *);
@@ -200,8 +201,8 @@ void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
200void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); 201void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
201int pnfs_layoutcommit_inode(struct inode *inode, bool sync); 202int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
202int _pnfs_return_layout(struct inode *); 203int _pnfs_return_layout(struct inode *);
203int pnfs_ld_write_done(struct nfs_write_data *); 204void pnfs_ld_write_done(struct nfs_write_data *);
204int pnfs_ld_read_done(struct nfs_read_data *); 205void pnfs_ld_read_done(struct nfs_read_data *);
205struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, 206struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
206 struct nfs_open_context *ctx, 207 struct nfs_open_context *ctx,
207 loff_t pos, 208 loff_t pos,
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 6fda5228ef5..4f359d2a26e 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -28,6 +28,7 @@
28 * such damages. 28 * such damages.
29 */ 29 */
30 30
31#include <linux/export.h>
31#include "pnfs.h" 32#include "pnfs.h"
32 33
33#define NFSDBG_FACILITY NFSDBG_PNFS 34#define NFSDBG_FACILITY NFSDBG_PNFS
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 2171c043ab0..8b48ec63f72 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -35,16 +35,13 @@ static const struct rpc_call_ops nfs_read_partial_ops;
35static const struct rpc_call_ops nfs_read_full_ops; 35static const struct rpc_call_ops nfs_read_full_ops;
36 36
37static struct kmem_cache *nfs_rdata_cachep; 37static struct kmem_cache *nfs_rdata_cachep;
38static mempool_t *nfs_rdata_mempool;
39
40#define MIN_POOL_READ (32)
41 38
42struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) 39struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
43{ 40{
44 struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL); 41 struct nfs_read_data *p;
45 42
43 p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
46 if (p) { 44 if (p) {
47 memset(p, 0, sizeof(*p));
48 INIT_LIST_HEAD(&p->pages); 45 INIT_LIST_HEAD(&p->pages);
49 p->npages = pagecount; 46 p->npages = pagecount;
50 if (pagecount <= ARRAY_SIZE(p->page_array)) 47 if (pagecount <= ARRAY_SIZE(p->page_array))
@@ -52,7 +49,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
52 else { 49 else {
53 p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); 50 p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
54 if (!p->pagevec) { 51 if (!p->pagevec) {
55 mempool_free(p, nfs_rdata_mempool); 52 kmem_cache_free(nfs_rdata_cachep, p);
56 p = NULL; 53 p = NULL;
57 } 54 }
58 } 55 }
@@ -64,7 +61,7 @@ void nfs_readdata_free(struct nfs_read_data *p)
64{ 61{
65 if (p && (p->pagevec != &p->page_array[0])) 62 if (p && (p->pagevec != &p->page_array[0]))
66 kfree(p->pagevec); 63 kfree(p->pagevec);
67 mempool_free(p, nfs_rdata_mempool); 64 kmem_cache_free(nfs_rdata_cachep, p);
68} 65}
69 66
70void nfs_readdata_release(struct nfs_read_data *rdata) 67void nfs_readdata_release(struct nfs_read_data *rdata)
@@ -276,7 +273,6 @@ nfs_async_read_error(struct list_head *head)
276 while (!list_empty(head)) { 273 while (!list_empty(head)) {
277 req = nfs_list_entry(head->next); 274 req = nfs_list_entry(head->next);
278 nfs_list_remove_request(req); 275 nfs_list_remove_request(req);
279 SetPageError(req->wb_page);
280 nfs_readpage_release(req); 276 nfs_readpage_release(req);
281 } 277 }
282} 278}
@@ -322,7 +318,6 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head
322 offset += len; 318 offset += len;
323 } while(nbytes != 0); 319 } while(nbytes != 0);
324 atomic_set(&req->wb_complete, requests); 320 atomic_set(&req->wb_complete, requests);
325 ClearPageError(page);
326 desc->pg_rpc_callops = &nfs_read_partial_ops; 321 desc->pg_rpc_callops = &nfs_read_partial_ops;
327 return ret; 322 return ret;
328out_bad: 323out_bad:
@@ -331,7 +326,6 @@ out_bad:
331 list_del(&data->list); 326 list_del(&data->list);
332 nfs_readdata_free(data); 327 nfs_readdata_free(data);
333 } 328 }
334 SetPageError(page);
335 nfs_readpage_release(req); 329 nfs_readpage_release(req);
336 return -ENOMEM; 330 return -ENOMEM;
337} 331}
@@ -357,7 +351,6 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head *
357 req = nfs_list_entry(head->next); 351 req = nfs_list_entry(head->next);
358 nfs_list_remove_request(req); 352 nfs_list_remove_request(req);
359 nfs_list_add_request(req, &data->pages); 353 nfs_list_add_request(req, &data->pages);
360 ClearPageError(req->wb_page);
361 *pages++ = req->wb_page; 354 *pages++ = req->wb_page;
362 } 355 }
363 req = nfs_list_entry(data->pages.next); 356 req = nfs_list_entry(data->pages.next);
@@ -435,7 +428,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
435 argp->offset += resp->count; 428 argp->offset += resp->count;
436 argp->pgbase += resp->count; 429 argp->pgbase += resp->count;
437 argp->count -= resp->count; 430 argp->count -= resp->count;
438 nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); 431 rpc_restart_call_prepare(task);
439} 432}
440 433
441/* 434/*
@@ -462,10 +455,10 @@ static void nfs_readpage_release_partial(void *calldata)
462 int status = data->task.tk_status; 455 int status = data->task.tk_status;
463 456
464 if (status < 0) 457 if (status < 0)
465 SetPageError(page); 458 set_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags);
466 459
467 if (atomic_dec_and_test(&req->wb_complete)) { 460 if (atomic_dec_and_test(&req->wb_complete)) {
468 if (!PageError(page)) 461 if (!test_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags))
469 SetPageUptodate(page); 462 SetPageUptodate(page);
470 nfs_readpage_release(req); 463 nfs_readpage_release(req);
471 } 464 }
@@ -541,13 +534,23 @@ static void nfs_readpage_result_full(struct rpc_task *task, void *calldata)
541static void nfs_readpage_release_full(void *calldata) 534static void nfs_readpage_release_full(void *calldata)
542{ 535{
543 struct nfs_read_data *data = calldata; 536 struct nfs_read_data *data = calldata;
537 struct nfs_pageio_descriptor pgio;
544 538
539 if (data->pnfs_error) {
540 nfs_pageio_init_read_mds(&pgio, data->inode);
541 pgio.pg_recoalesce = 1;
542 }
545 while (!list_empty(&data->pages)) { 543 while (!list_empty(&data->pages)) {
546 struct nfs_page *req = nfs_list_entry(data->pages.next); 544 struct nfs_page *req = nfs_list_entry(data->pages.next);
547 545
548 nfs_list_remove_request(req); 546 nfs_list_remove_request(req);
549 nfs_readpage_release(req); 547 if (!data->pnfs_error)
548 nfs_readpage_release(req);
549 else
550 nfs_pageio_add_request(&pgio, req);
550 } 551 }
552 if (data->pnfs_error)
553 nfs_pageio_complete(&pgio);
551 nfs_readdata_release(calldata); 554 nfs_readdata_release(calldata);
552} 555}
553 556
@@ -648,7 +651,6 @@ readpage_async_filler(void *data, struct page *page)
648 return 0; 651 return 0;
649out_error: 652out_error:
650 error = PTR_ERR(new); 653 error = PTR_ERR(new);
651 SetPageError(page);
652out_unlock: 654out_unlock:
653 unlock_page(page); 655 unlock_page(page);
654 return error; 656 return error;
@@ -711,16 +713,10 @@ int __init nfs_init_readpagecache(void)
711 if (nfs_rdata_cachep == NULL) 713 if (nfs_rdata_cachep == NULL)
712 return -ENOMEM; 714 return -ENOMEM;
713 715
714 nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ,
715 nfs_rdata_cachep);
716 if (nfs_rdata_mempool == NULL)
717 return -ENOMEM;
718
719 return 0; 716 return 0;
720} 717}
721 718
722void nfs_destroy_readpagecache(void) 719void nfs_destroy_readpagecache(void)
723{ 720{
724 mempool_destroy(nfs_rdata_mempool);
725 kmem_cache_destroy(nfs_rdata_cachep); 721 kmem_cache_destroy(nfs_rdata_cachep);
726} 722}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 5b19b6aabe1..480b3b6bf71 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -733,18 +733,22 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
733 733
734 return 0; 734 return 0;
735} 735}
736
737#ifdef CONFIG_NFS_V4
736#ifdef CONFIG_NFS_V4_1 738#ifdef CONFIG_NFS_V4_1
737void show_sessions(struct seq_file *m, struct nfs_server *server) 739static void show_sessions(struct seq_file *m, struct nfs_server *server)
738{ 740{
739 if (nfs4_has_session(server->nfs_client)) 741 if (nfs4_has_session(server->nfs_client))
740 seq_printf(m, ",sessions"); 742 seq_printf(m, ",sessions");
741} 743}
742#else 744#else
743void show_sessions(struct seq_file *m, struct nfs_server *server) {} 745static void show_sessions(struct seq_file *m, struct nfs_server *server) {}
746#endif
744#endif 747#endif
745 748
749#ifdef CONFIG_NFS_V4
746#ifdef CONFIG_NFS_V4_1 750#ifdef CONFIG_NFS_V4_1
747void show_pnfs(struct seq_file *m, struct nfs_server *server) 751static void show_pnfs(struct seq_file *m, struct nfs_server *server)
748{ 752{
749 seq_printf(m, ",pnfs="); 753 seq_printf(m, ",pnfs=");
750 if (server->pnfs_curr_ld) 754 if (server->pnfs_curr_ld)
@@ -752,9 +756,10 @@ void show_pnfs(struct seq_file *m, struct nfs_server *server)
752 else 756 else
753 seq_printf(m, "not configured"); 757 seq_printf(m, "not configured");
754} 758}
755#else /* CONFIG_NFS_V4_1 */ 759#else
756void show_pnfs(struct seq_file *m, struct nfs_server *server) {} 760static void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
757#endif /* CONFIG_NFS_V4_1 */ 761#endif
762#endif
758 763
759static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt) 764static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
760{ 765{
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index b2fbbde58e4..4f9319a2e56 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -87,7 +87,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
87 struct inode *dir = data->dir; 87 struct inode *dir = data->dir;
88 88
89 if (!NFS_PROTO(dir)->unlink_done(task, dir)) 89 if (!NFS_PROTO(dir)->unlink_done(task, dir))
90 nfs_restart_rpc(task, NFS_SERVER(dir)->nfs_client); 90 rpc_restart_call_prepare(task);
91} 91}
92 92
93/** 93/**
@@ -369,7 +369,7 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
369 struct dentry *new_dentry = data->new_dentry; 369 struct dentry *new_dentry = data->new_dentry;
370 370
371 if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { 371 if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
372 nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client); 372 rpc_restart_call_prepare(task);
373 return; 373 return;
374 } 374 }
375 375
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c9bd2a6b7d4..1dda78db6a7 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -20,6 +20,7 @@
20#include <linux/nfs_mount.h> 20#include <linux/nfs_mount.h>
21#include <linux/nfs_page.h> 21#include <linux/nfs_page.h>
22#include <linux/backing-dev.h> 22#include <linux/backing-dev.h>
23#include <linux/export.h>
23 24
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
25 26
@@ -390,7 +391,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
390 error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); 391 error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req);
391 BUG_ON(error); 392 BUG_ON(error);
392 if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE)) 393 if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE))
393 nfsi->change_attr++; 394 inode->i_version++;
394 set_bit(PG_MAPPED, &req->wb_flags); 395 set_bit(PG_MAPPED, &req->wb_flags);
395 SetPagePrivate(req->wb_page); 396 SetPagePrivate(req->wb_page);
396 set_page_private(req->wb_page, (unsigned long)req); 397 set_page_private(req->wb_page, (unsigned long)req);
@@ -428,7 +429,6 @@ static void
428nfs_mark_request_dirty(struct nfs_page *req) 429nfs_mark_request_dirty(struct nfs_page *req)
429{ 430{
430 __set_page_dirty_nobuffers(req->wb_page); 431 __set_page_dirty_nobuffers(req->wb_page);
431 __mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC);
432} 432}
433 433
434#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 434#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -762,6 +762,8 @@ int nfs_updatepage(struct file *file, struct page *page,
762 status = nfs_writepage_setup(ctx, page, offset, count); 762 status = nfs_writepage_setup(ctx, page, offset, count);
763 if (status < 0) 763 if (status < 0)
764 nfs_set_pageerror(page); 764 nfs_set_pageerror(page);
765 else
766 __set_page_dirty_nobuffers(page);
765 767
766 dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", 768 dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
767 status, (long long)i_size_read(inode)); 769 status, (long long)i_size_read(inode));
@@ -1010,7 +1012,6 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *r
1010 req = nfs_list_entry(head->next); 1012 req = nfs_list_entry(head->next);
1011 nfs_list_remove_request(req); 1013 nfs_list_remove_request(req);
1012 nfs_list_add_request(req, &data->pages); 1014 nfs_list_add_request(req, &data->pages);
1013 ClearPageError(req->wb_page);
1014 *pages++ = req->wb_page; 1015 *pages++ = req->wb_page;
1015 } 1016 }
1016 req = nfs_list_entry(data->pages.next); 1017 req = nfs_list_entry(data->pages.next);
@@ -1165,7 +1166,13 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
1165static void nfs_writeback_release_full(void *calldata) 1166static void nfs_writeback_release_full(void *calldata)
1166{ 1167{
1167 struct nfs_write_data *data = calldata; 1168 struct nfs_write_data *data = calldata;
1168 int status = data->task.tk_status; 1169 int ret, status = data->task.tk_status;
1170 struct nfs_pageio_descriptor pgio;
1171
1172 if (data->pnfs_error) {
1173 nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE);
1174 pgio.pg_recoalesce = 1;
1175 }
1169 1176
1170 /* Update attributes as result of writeback. */ 1177 /* Update attributes as result of writeback. */
1171 while (!list_empty(&data->pages)) { 1178 while (!list_empty(&data->pages)) {
@@ -1181,6 +1188,11 @@ static void nfs_writeback_release_full(void *calldata)
1181 req->wb_bytes, 1188 req->wb_bytes,
1182 (long long)req_offset(req)); 1189 (long long)req_offset(req));
1183 1190
1191 if (data->pnfs_error) {
1192 dprintk(", pnfs error = %d\n", data->pnfs_error);
1193 goto next;
1194 }
1195
1184 if (status < 0) { 1196 if (status < 0) {
1185 nfs_set_pageerror(page); 1197 nfs_set_pageerror(page);
1186 nfs_context_set_write_error(req->wb_context, status); 1198 nfs_context_set_write_error(req->wb_context, status);
@@ -1200,7 +1212,19 @@ remove_request:
1200 next: 1212 next:
1201 nfs_clear_page_tag_locked(req); 1213 nfs_clear_page_tag_locked(req);
1202 nfs_end_page_writeback(page); 1214 nfs_end_page_writeback(page);
1215 if (data->pnfs_error) {
1216 lock_page(page);
1217 nfs_pageio_cond_complete(&pgio, page->index);
1218 ret = nfs_page_async_flush(&pgio, page, 0);
1219 if (ret) {
1220 nfs_set_pageerror(page);
1221 dprintk("rewrite to MDS error = %d\n", ret);
1222 }
1223 unlock_page(page);
1224 }
1203 } 1225 }
1226 if (data->pnfs_error)
1227 nfs_pageio_complete(&pgio);
1204 nfs_writedata_release(calldata); 1228 nfs_writedata_release(calldata);
1205} 1229}
1206 1230
@@ -1220,7 +1244,6 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1220{ 1244{
1221 struct nfs_writeargs *argp = &data->args; 1245 struct nfs_writeargs *argp = &data->args;
1222 struct nfs_writeres *resp = &data->res; 1246 struct nfs_writeres *resp = &data->res;
1223 struct nfs_server *server = NFS_SERVER(data->inode);
1224 int status; 1247 int status;
1225 1248
1226 dprintk("NFS: %5u nfs_writeback_done (status %d)\n", 1249 dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
@@ -1254,7 +1277,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1254 if (time_before(complain, jiffies)) { 1277 if (time_before(complain, jiffies)) {
1255 dprintk("NFS: faulty NFS server %s:" 1278 dprintk("NFS: faulty NFS server %s:"
1256 " (committed = %d) != (stable = %d)\n", 1279 " (committed = %d) != (stable = %d)\n",
1257 server->nfs_client->cl_hostname, 1280 NFS_SERVER(data->inode)->nfs_client->cl_hostname,
1258 resp->verf->committed, argp->stable); 1281 resp->verf->committed, argp->stable);
1259 complain = jiffies + 300 * HZ; 1282 complain = jiffies + 300 * HZ;
1260 } 1283 }
@@ -1281,7 +1304,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1281 */ 1304 */
1282 argp->stable = NFS_FILE_SYNC; 1305 argp->stable = NFS_FILE_SYNC;
1283 } 1306 }
1284 nfs_restart_rpc(task, server->nfs_client); 1307 rpc_restart_call_prepare(task);
1285 return; 1308 return;
1286 } 1309 }
1287 if (time_before(complain, jiffies)) { 1310 if (time_before(complain, jiffies)) {
@@ -1553,6 +1576,10 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
1553 int flags = FLUSH_SYNC; 1576 int flags = FLUSH_SYNC;
1554 int ret = 0; 1577 int ret = 0;
1555 1578
1579 /* no commits means nothing needs to be done */
1580 if (!nfsi->ncommit)
1581 return ret;
1582
1556 if (wbc->sync_mode == WB_SYNC_NONE) { 1583 if (wbc->sync_mode == WB_SYNC_NONE) {
1557 /* Don't commit yet if this is a non-blocking flush and there 1584 /* Don't commit yet if this is a non-blocking flush and there
1558 * are a lot of outstanding writes for this mapping. 1585 * are a lot of outstanding writes for this mapping.
@@ -1686,34 +1713,20 @@ out_error:
1686int nfs_migrate_page(struct address_space *mapping, struct page *newpage, 1713int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1687 struct page *page) 1714 struct page *page)
1688{ 1715{
1689 struct nfs_page *req; 1716 /*
1690 int ret; 1717 * If PagePrivate is set, then the page is currently associated with
1718 * an in-progress read or write request. Don't try to migrate it.
1719 *
1720 * FIXME: we could do this in principle, but we'll need a way to ensure
1721 * that we can safely release the inode reference while holding
1722 * the page lock.
1723 */
1724 if (PagePrivate(page))
1725 return -EBUSY;
1691 1726
1692 nfs_fscache_release_page(page, GFP_KERNEL); 1727 nfs_fscache_release_page(page, GFP_KERNEL);
1693 1728
1694 req = nfs_find_and_lock_request(page, false); 1729 return migrate_page(mapping, newpage, page);
1695 ret = PTR_ERR(req);
1696 if (IS_ERR(req))
1697 goto out;
1698
1699 ret = migrate_page(mapping, newpage, page);
1700 if (!req)
1701 goto out;
1702 if (ret)
1703 goto out_unlock;
1704 page_cache_get(newpage);
1705 spin_lock(&mapping->host->i_lock);
1706 req->wb_page = newpage;
1707 SetPagePrivate(newpage);
1708 set_page_private(newpage, (unsigned long)req);
1709 ClearPagePrivate(page);
1710 set_page_private(page, 0);
1711 spin_unlock(&mapping->host->i_lock);
1712 page_cache_release(page);
1713out_unlock:
1714 nfs_clear_page_tag_locked(req);
1715out:
1716 return ret;
1717} 1730}
1718#endif 1731#endif
1719 1732
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index f4cc1e2bfc5..62f3b9074e8 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -16,7 +16,6 @@
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/exportfs.h> 17#include <linux/exportfs.h>
18 18
19#include <linux/nfsd/syscall.h>
20#include <net/ipv6.h> 19#include <net/ipv6.h>
21 20
22#include "nfsd.h" 21#include "nfsd.h"
@@ -318,7 +317,6 @@ static void svc_export_put(struct kref *ref)
318 struct svc_export *exp = container_of(ref, struct svc_export, h.ref); 317 struct svc_export *exp = container_of(ref, struct svc_export, h.ref);
319 path_put(&exp->ex_path); 318 path_put(&exp->ex_path);
320 auth_domain_put(exp->ex_client); 319 auth_domain_put(exp->ex_client);
321 kfree(exp->ex_pathname);
322 nfsd4_fslocs_free(&exp->ex_fslocs); 320 nfsd4_fslocs_free(&exp->ex_fslocs);
323 kfree(exp); 321 kfree(exp);
324} 322}
@@ -528,11 +526,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
528 526
529 exp.ex_client = dom; 527 exp.ex_client = dom;
530 528
531 err = -ENOMEM;
532 exp.ex_pathname = kstrdup(buf, GFP_KERNEL);
533 if (!exp.ex_pathname)
534 goto out2;
535
536 /* expiry */ 529 /* expiry */
537 err = -EINVAL; 530 err = -EINVAL;
538 exp.h.expiry_time = get_expiry(&mesg); 531 exp.h.expiry_time = get_expiry(&mesg);
@@ -613,8 +606,6 @@ out4:
613 nfsd4_fslocs_free(&exp.ex_fslocs); 606 nfsd4_fslocs_free(&exp.ex_fslocs);
614 kfree(exp.ex_uuid); 607 kfree(exp.ex_uuid);
615out3: 608out3:
616 kfree(exp.ex_pathname);
617out2:
618 path_put(&exp.ex_path); 609 path_put(&exp.ex_path);
619out1: 610out1:
620 auth_domain_put(dom); 611 auth_domain_put(dom);
@@ -678,7 +669,6 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
678 new->ex_client = item->ex_client; 669 new->ex_client = item->ex_client;
679 new->ex_path.dentry = dget(item->ex_path.dentry); 670 new->ex_path.dentry = dget(item->ex_path.dentry);
680 new->ex_path.mnt = mntget(item->ex_path.mnt); 671 new->ex_path.mnt = mntget(item->ex_path.mnt);
681 new->ex_pathname = NULL;
682 new->ex_fslocs.locations = NULL; 672 new->ex_fslocs.locations = NULL;
683 new->ex_fslocs.locations_count = 0; 673 new->ex_fslocs.locations_count = 0;
684 new->ex_fslocs.migrated = 0; 674 new->ex_fslocs.migrated = 0;
@@ -696,8 +686,6 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
696 new->ex_fsid = item->ex_fsid; 686 new->ex_fsid = item->ex_fsid;
697 new->ex_uuid = item->ex_uuid; 687 new->ex_uuid = item->ex_uuid;
698 item->ex_uuid = NULL; 688 item->ex_uuid = NULL;
699 new->ex_pathname = item->ex_pathname;
700 item->ex_pathname = NULL;
701 new->ex_fslocs.locations = item->ex_fslocs.locations; 689 new->ex_fslocs.locations = item->ex_fslocs.locations;
702 item->ex_fslocs.locations = NULL; 690 item->ex_fslocs.locations = NULL;
703 new->ex_fslocs.locations_count = item->ex_fslocs.locations_count; 691 new->ex_fslocs.locations_count = item->ex_fslocs.locations_count;
@@ -1010,7 +998,7 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct path *path)
1010 return exp; 998 return exp;
1011} 999}
1012 1000
1013static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp) 1001struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp)
1014{ 1002{
1015 u32 fsidv[2]; 1003 u32 fsidv[2];
1016 1004
@@ -1030,7 +1018,7 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
1030 struct svc_export *exp; 1018 struct svc_export *exp;
1031 __be32 rv; 1019 __be32 rv;
1032 1020
1033 exp = find_fsidzero_export(rqstp); 1021 exp = rqst_find_fsidzero_export(rqstp);
1034 if (IS_ERR(exp)) 1022 if (IS_ERR(exp))
1035 return nfserrno(PTR_ERR(exp)); 1023 return nfserrno(PTR_ERR(exp));
1036 rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); 1024 rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL);
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index ad88f1c0a4c..9c51aff02ae 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -36,6 +36,7 @@
36 36
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/nfs_fs.h> 38#include <linux/nfs_fs.h>
39#include <linux/export.h>
39#include "acl.h" 40#include "acl.h"
40 41
41 42
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 02eb4edf0ec..7748d6a18d9 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -39,6 +39,8 @@
39 39
40#define NFSDDBG_FACILITY NFSDDBG_PROC 40#define NFSDDBG_FACILITY NFSDDBG_PROC
41 41
42static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason);
43
42#define NFSPROC4_CB_NULL 0 44#define NFSPROC4_CB_NULL 0
43#define NFSPROC4_CB_COMPOUND 1 45#define NFSPROC4_CB_COMPOUND 1
44 46
@@ -351,7 +353,7 @@ static void encode_cb_recall4args(struct xdr_stream *xdr,
351 __be32 *p; 353 __be32 *p;
352 354
353 encode_nfs_cb_opnum4(xdr, OP_CB_RECALL); 355 encode_nfs_cb_opnum4(xdr, OP_CB_RECALL);
354 encode_stateid4(xdr, &dp->dl_stateid); 356 encode_stateid4(xdr, &dp->dl_stid.sc_stateid);
355 357
356 p = xdr_reserve_space(xdr, 4); 358 p = xdr_reserve_space(xdr, 4);
357 *p++ = xdr_zero; /* truncate */ 359 *p++ = xdr_zero; /* truncate */
@@ -460,6 +462,8 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
460 */ 462 */
461 status = 0; 463 status = 0;
462out: 464out:
465 if (status)
466 nfsd4_mark_cb_fault(cb->cb_clp, status);
463 return status; 467 return status;
464out_overflow: 468out_overflow:
465 print_overflow_msg(__func__, xdr); 469 print_overflow_msg(__func__, xdr);
@@ -686,6 +690,12 @@ static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
686 warn_no_callback_path(clp, reason); 690 warn_no_callback_path(clp, reason);
687} 691}
688 692
693static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason)
694{
695 clp->cl_cb_state = NFSD4_CB_FAULT;
696 warn_no_callback_path(clp, reason);
697}
698
689static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) 699static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
690{ 700{
691 struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); 701 struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
@@ -787,7 +797,7 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
787{ 797{
788 struct nfsd4_callback *cb = calldata; 798 struct nfsd4_callback *cb = calldata;
789 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); 799 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
790 struct nfs4_client *clp = dp->dl_client; 800 struct nfs4_client *clp = dp->dl_stid.sc_client;
791 u32 minorversion = clp->cl_minorversion; 801 u32 minorversion = clp->cl_minorversion;
792 802
793 cb->cb_minorversion = minorversion; 803 cb->cb_minorversion = minorversion;
@@ -809,7 +819,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
809{ 819{
810 struct nfsd4_callback *cb = calldata; 820 struct nfsd4_callback *cb = calldata;
811 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); 821 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
812 struct nfs4_client *clp = dp->dl_client; 822 struct nfs4_client *clp = dp->dl_stid.sc_client;
813 823
814 dprintk("%s: minorversion=%d\n", __func__, 824 dprintk("%s: minorversion=%d\n", __func__,
815 clp->cl_minorversion); 825 clp->cl_minorversion);
@@ -832,7 +842,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
832{ 842{
833 struct nfsd4_callback *cb = calldata; 843 struct nfsd4_callback *cb = calldata;
834 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); 844 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
835 struct nfs4_client *clp = dp->dl_client; 845 struct nfs4_client *clp = dp->dl_stid.sc_client;
836 struct rpc_clnt *current_rpc_client = clp->cl_cb_client; 846 struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
837 847
838 nfsd4_cb_done(task, calldata); 848 nfsd4_cb_done(task, calldata);
@@ -1006,7 +1016,7 @@ void nfsd4_do_callback_rpc(struct work_struct *w)
1006void nfsd4_cb_recall(struct nfs4_delegation *dp) 1016void nfsd4_cb_recall(struct nfs4_delegation *dp)
1007{ 1017{
1008 struct nfsd4_callback *cb = &dp->dl_recall; 1018 struct nfsd4_callback *cb = &dp->dl_recall;
1009 struct nfs4_client *clp = dp->dl_client; 1019 struct nfs4_client *clp = dp->dl_stid.sc_client;
1010 1020
1011 dp->dl_retries = 1; 1021 dp->dl_retries = 1;
1012 cb->cb_op = dp; 1022 cb->cb_op = dp;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index e8077766661..fa383361bc6 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -35,6 +35,7 @@
35#include <linux/file.h> 35#include <linux/file.h>
36#include <linux/slab.h> 36#include <linux/slab.h>
37 37
38#include "idmap.h"
38#include "cache.h" 39#include "cache.h"
39#include "xdr4.h" 40#include "xdr4.h"
40#include "vfs.h" 41#include "vfs.h"
@@ -156,6 +157,8 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
156 !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) 157 !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
157 return nfserr_inval; 158 return nfserr_inval;
158 159
160 accmode |= NFSD_MAY_READ_IF_EXEC;
161
159 if (open->op_share_access & NFS4_SHARE_ACCESS_READ) 162 if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
160 accmode |= NFSD_MAY_READ; 163 accmode |= NFSD_MAY_READ;
161 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) 164 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
@@ -168,12 +171,29 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
168 return status; 171 return status;
169} 172}
170 173
174static __be32 nfsd_check_obj_isreg(struct svc_fh *fh)
175{
176 umode_t mode = fh->fh_dentry->d_inode->i_mode;
177
178 if (S_ISREG(mode))
179 return nfs_ok;
180 if (S_ISDIR(mode))
181 return nfserr_isdir;
182 /*
183 * Using err_symlink as our catch-all case may look odd; but
184 * there's no other obvious error for this case in 4.0, and we
185 * happen to know that it will cause the linux v4 client to do
186 * the right thing on attempts to open something other than a
187 * regular file.
188 */
189 return nfserr_symlink;
190}
191
171static __be32 192static __be32
172do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) 193do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
173{ 194{
174 struct svc_fh resfh; 195 struct svc_fh resfh;
175 __be32 status; 196 __be32 status;
176 int created = 0;
177 197
178 fh_init(&resfh, NFS4_FHSIZE); 198 fh_init(&resfh, NFS4_FHSIZE);
179 open->op_truncate = 0; 199 open->op_truncate = 0;
@@ -202,7 +222,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
202 open->op_fname.len, &open->op_iattr, 222 open->op_fname.len, &open->op_iattr,
203 &resfh, open->op_createmode, 223 &resfh, open->op_createmode,
204 (u32 *)open->op_verf.data, 224 (u32 *)open->op_verf.data,
205 &open->op_truncate, &created); 225 &open->op_truncate, &open->op_created);
206 226
207 /* 227 /*
208 * Following rfc 3530 14.2.16, use the returned bitmask 228 * Following rfc 3530 14.2.16, use the returned bitmask
@@ -216,6 +236,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
216 status = nfsd_lookup(rqstp, current_fh, 236 status = nfsd_lookup(rqstp, current_fh,
217 open->op_fname.data, open->op_fname.len, &resfh); 237 open->op_fname.data, open->op_fname.len, &resfh);
218 fh_unlock(current_fh); 238 fh_unlock(current_fh);
239 if (status)
240 goto out;
241 status = nfsd_check_obj_isreg(&resfh);
219 } 242 }
220 if (status) 243 if (status)
221 goto out; 244 goto out;
@@ -227,9 +250,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
227 fh_dup2(current_fh, &resfh); 250 fh_dup2(current_fh, &resfh);
228 251
229 /* set reply cache */ 252 /* set reply cache */
230 fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, 253 fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
231 &resfh.fh_handle); 254 &resfh.fh_handle);
232 if (!created) 255 if (!open->op_created)
233 status = do_open_permission(rqstp, current_fh, open, 256 status = do_open_permission(rqstp, current_fh, open,
234 NFSD_MAY_NOP); 257 NFSD_MAY_NOP);
235 258
@@ -254,7 +277,7 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
254 memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); 277 memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
255 278
256 /* set replay cache */ 279 /* set replay cache */
257 fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, 280 fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
258 &current_fh->fh_handle); 281 &current_fh->fh_handle);
259 282
260 open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && 283 open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
@@ -283,14 +306,18 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
283 __be32 status; 306 __be32 status;
284 struct nfsd4_compoundres *resp; 307 struct nfsd4_compoundres *resp;
285 308
286 dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n", 309 dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n",
287 (int)open->op_fname.len, open->op_fname.data, 310 (int)open->op_fname.len, open->op_fname.data,
288 open->op_stateowner); 311 open->op_openowner);
289 312
290 /* This check required by spec. */ 313 /* This check required by spec. */
291 if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) 314 if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
292 return nfserr_inval; 315 return nfserr_inval;
293 316
317 /* We don't yet support WANT bits: */
318 open->op_share_access &= NFS4_SHARE_ACCESS_MASK;
319
320 open->op_created = 0;
294 /* 321 /*
295 * RFC5661 18.51.3 322 * RFC5661 18.51.3
296 * Before RECLAIM_COMPLETE done, server should deny new lock 323 * Before RECLAIM_COMPLETE done, server should deny new lock
@@ -309,7 +336,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
309 resp = rqstp->rq_resp; 336 resp = rqstp->rq_resp;
310 status = nfsd4_process_open1(&resp->cstate, open); 337 status = nfsd4_process_open1(&resp->cstate, open);
311 if (status == nfserr_replay_me) { 338 if (status == nfserr_replay_me) {
312 struct nfs4_replay *rp = &open->op_stateowner->so_replay; 339 struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay;
313 fh_put(&cstate->current_fh); 340 fh_put(&cstate->current_fh);
314 fh_copy_shallow(&cstate->current_fh.fh_handle, 341 fh_copy_shallow(&cstate->current_fh.fh_handle,
315 &rp->rp_openfh); 342 &rp->rp_openfh);
@@ -339,32 +366,23 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
339 switch (open->op_claim_type) { 366 switch (open->op_claim_type) {
340 case NFS4_OPEN_CLAIM_DELEGATE_CUR: 367 case NFS4_OPEN_CLAIM_DELEGATE_CUR:
341 case NFS4_OPEN_CLAIM_NULL: 368 case NFS4_OPEN_CLAIM_NULL:
342 /*
343 * (1) set CURRENT_FH to the file being opened,
344 * creating it if necessary, (2) set open->op_cinfo,
345 * (3) set open->op_truncate if the file is to be
346 * truncated after opening, (4) do permission checking.
347 */
348 status = do_open_lookup(rqstp, &cstate->current_fh, 369 status = do_open_lookup(rqstp, &cstate->current_fh,
349 open); 370 open);
350 if (status) 371 if (status)
351 goto out; 372 goto out;
352 break; 373 break;
353 case NFS4_OPEN_CLAIM_PREVIOUS: 374 case NFS4_OPEN_CLAIM_PREVIOUS:
354 open->op_stateowner->so_confirmed = 1; 375 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
355 /* 376 case NFS4_OPEN_CLAIM_FH:
356 * The CURRENT_FH is already set to the file being 377 case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
357 * opened. (1) set open->op_cinfo, (2) set
358 * open->op_truncate if the file is to be truncated
359 * after opening, (3) do permission checking.
360 */
361 status = do_open_fhandle(rqstp, &cstate->current_fh, 378 status = do_open_fhandle(rqstp, &cstate->current_fh,
362 open); 379 open);
363 if (status) 380 if (status)
364 goto out; 381 goto out;
365 break; 382 break;
383 case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
366 case NFS4_OPEN_CLAIM_DELEGATE_PREV: 384 case NFS4_OPEN_CLAIM_DELEGATE_PREV:
367 open->op_stateowner->so_confirmed = 1; 385 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
368 dprintk("NFSD: unsupported OPEN claim type %d\n", 386 dprintk("NFSD: unsupported OPEN claim type %d\n",
369 open->op_claim_type); 387 open->op_claim_type);
370 status = nfserr_notsupp; 388 status = nfserr_notsupp;
@@ -381,12 +399,13 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
381 * set, (2) sets open->op_stateid, (3) sets open->op_delegation. 399 * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
382 */ 400 */
383 status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); 401 status = nfsd4_process_open2(rqstp, &cstate->current_fh, open);
402 WARN_ON(status && open->op_created);
384out: 403out:
385 if (open->op_stateowner) { 404 nfsd4_cleanup_open_state(open, status);
386 nfs4_get_stateowner(open->op_stateowner); 405 if (open->op_openowner)
387 cstate->replay_owner = open->op_stateowner; 406 cstate->replay_owner = &open->op_openowner->oo_owner;
388 } 407 else
389 nfs4_unlock_state(); 408 nfs4_unlock_state();
390 return status; 409 return status;
391} 410}
392 411
@@ -467,17 +486,12 @@ static __be32
467nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 486nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
468 struct nfsd4_commit *commit) 487 struct nfsd4_commit *commit)
469{ 488{
470 __be32 status;
471
472 u32 *p = (u32 *)commit->co_verf.data; 489 u32 *p = (u32 *)commit->co_verf.data;
473 *p++ = nfssvc_boot.tv_sec; 490 *p++ = nfssvc_boot.tv_sec;
474 *p++ = nfssvc_boot.tv_usec; 491 *p++ = nfssvc_boot.tv_usec;
475 492
476 status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, 493 return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
477 commit->co_count); 494 commit->co_count);
478 if (status == nfserr_symlink)
479 status = nfserr_inval;
480 return status;
481} 495}
482 496
483static __be32 497static __be32
@@ -492,8 +506,6 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
492 506
493 status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, 507 status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR,
494 NFSD_MAY_CREATE); 508 NFSD_MAY_CREATE);
495 if (status == nfserr_symlink)
496 status = nfserr_notdir;
497 if (status) 509 if (status)
498 return status; 510 return status;
499 511
@@ -691,7 +703,7 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
691 readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); 703 readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
692 readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); 704 readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
693 705
694 if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) || 706 if ((cookie == 1) || (cookie == 2) ||
695 (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) 707 (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE)))
696 return nfserr_bad_cookie; 708 return nfserr_bad_cookie;
697 709
@@ -719,8 +731,6 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
719 return nfserr_grace; 731 return nfserr_grace;
720 status = nfsd_unlink(rqstp, &cstate->current_fh, 0, 732 status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
721 remove->rm_name, remove->rm_namelen); 733 remove->rm_name, remove->rm_namelen);
722 if (status == nfserr_symlink)
723 return nfserr_notdir;
724 if (!status) { 734 if (!status) {
725 fh_unlock(&cstate->current_fh); 735 fh_unlock(&cstate->current_fh);
726 set_change_info(&remove->rm_cinfo, &cstate->current_fh); 736 set_change_info(&remove->rm_cinfo, &cstate->current_fh);
@@ -751,8 +761,6 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
751 (S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) && 761 (S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) &&
752 S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode))) 762 S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode)))
753 status = nfserr_exist; 763 status = nfserr_exist;
754 else if (status == nfserr_symlink)
755 status = nfserr_notdir;
756 764
757 if (!status) { 765 if (!status) {
758 set_change_info(&rename->rn_sinfo, &cstate->current_fh); 766 set_change_info(&rename->rn_sinfo, &cstate->current_fh);
@@ -892,8 +900,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
892 900
893 write->wr_bytes_written = cnt; 901 write->wr_bytes_written = cnt;
894 902
895 if (status == nfserr_symlink)
896 status = nfserr_inval;
897 return status; 903 return status;
898} 904}
899 905
@@ -930,7 +936,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
930 count = 4 + (verify->ve_attrlen >> 2); 936 count = 4 + (verify->ve_attrlen >> 2);
931 buf = kmalloc(count << 2, GFP_KERNEL); 937 buf = kmalloc(count << 2, GFP_KERNEL);
932 if (!buf) 938 if (!buf)
933 return nfserr_resource; 939 return nfserr_jukebox;
934 940
935 status = nfsd4_encode_fattr(&cstate->current_fh, 941 status = nfsd4_encode_fattr(&cstate->current_fh,
936 cstate->current_fh.fh_export, 942 cstate->current_fh.fh_export,
@@ -994,6 +1000,8 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
994 1000
995typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, 1001typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
996 void *); 1002 void *);
1003typedef u32(*nfsd4op_rsize)(struct svc_rqst *, struct nfsd4_op *op);
1004
997enum nfsd4_op_flags { 1005enum nfsd4_op_flags {
998 ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ 1006 ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */
999 ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */ 1007 ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */
@@ -1001,13 +1009,15 @@ enum nfsd4_op_flags {
1001 /* For rfc 5661 section 2.6.3.1.1: */ 1009 /* For rfc 5661 section 2.6.3.1.1: */
1002 OP_HANDLES_WRONGSEC = 1 << 3, 1010 OP_HANDLES_WRONGSEC = 1 << 3,
1003 OP_IS_PUTFH_LIKE = 1 << 4, 1011 OP_IS_PUTFH_LIKE = 1 << 4,
1004};
1005
1006struct nfsd4_operation {
1007 nfsd4op_func op_func;
1008 u32 op_flags;
1009 char *op_name;
1010 /* 1012 /*
1013 * These are the ops whose result size we estimate before
1014 * encoding, to avoid performing an op then not being able to
1015 * respond or cache a response. This includes writes and setattrs
1016 * as well as the operations usually called "nonidempotent":
1017 */
1018 OP_MODIFIES_SOMETHING = 1 << 5,
1019 /*
1020 * Cache compounds containing these ops in the xid-based drc:
1011 * We use the DRC for compounds containing non-idempotent 1021 * We use the DRC for compounds containing non-idempotent
1012 * operations, *except* those that are 4.1-specific (since 1022 * operations, *except* those that are 4.1-specific (since
1013 * sessions provide their own EOS), and except for stateful 1023 * sessions provide their own EOS), and except for stateful
@@ -1015,7 +1025,15 @@ struct nfsd4_operation {
1015 * (since sequence numbers provide EOS for open, lock, etc in 1025 * (since sequence numbers provide EOS for open, lock, etc in
1016 * the v4.0 case). 1026 * the v4.0 case).
1017 */ 1027 */
1018 bool op_cacheresult; 1028 OP_CACHEME = 1 << 6,
1029};
1030
1031struct nfsd4_operation {
1032 nfsd4op_func op_func;
1033 u32 op_flags;
1034 char *op_name;
1035 /* Try to get response size before operation */
1036 nfsd4op_rsize op_rsize_bop;
1019}; 1037};
1020 1038
1021static struct nfsd4_operation nfsd4_ops[]; 1039static struct nfsd4_operation nfsd4_ops[];
@@ -1062,7 +1080,7 @@ static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op)
1062 1080
1063bool nfsd4_cache_this_op(struct nfsd4_op *op) 1081bool nfsd4_cache_this_op(struct nfsd4_op *op)
1064{ 1082{
1065 return OPDESC(op)->op_cacheresult; 1083 return OPDESC(op)->op_flags & OP_CACHEME;
1066} 1084}
1067 1085
1068static bool need_wrongsec_check(struct svc_rqst *rqstp) 1086static bool need_wrongsec_check(struct svc_rqst *rqstp)
@@ -1110,6 +1128,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1110 struct nfsd4_operation *opdesc; 1128 struct nfsd4_operation *opdesc;
1111 struct nfsd4_compound_state *cstate = &resp->cstate; 1129 struct nfsd4_compound_state *cstate = &resp->cstate;
1112 int slack_bytes; 1130 int slack_bytes;
1131 u32 plen = 0;
1113 __be32 status; 1132 __be32 status;
1114 1133
1115 resp->xbuf = &rqstp->rq_res; 1134 resp->xbuf = &rqstp->rq_res;
@@ -1188,6 +1207,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1188 goto encode_op; 1207 goto encode_op;
1189 } 1208 }
1190 1209
1210 /* If op is non-idempotent */
1211 if (opdesc->op_flags & OP_MODIFIES_SOMETHING) {
1212 plen = opdesc->op_rsize_bop(rqstp, op);
1213 op->status = nfsd4_check_resp_size(resp, plen);
1214 }
1215
1216 if (op->status)
1217 goto encode_op;
1218
1191 if (opdesc->op_func) 1219 if (opdesc->op_func)
1192 op->status = opdesc->op_func(rqstp, cstate, &op->u); 1220 op->status = opdesc->op_func(rqstp, cstate, &op->u);
1193 else 1221 else
@@ -1217,7 +1245,7 @@ encode_op:
1217 be32_to_cpu(status)); 1245 be32_to_cpu(status));
1218 1246
1219 if (cstate->replay_owner) { 1247 if (cstate->replay_owner) {
1220 nfs4_put_stateowner(cstate->replay_owner); 1248 nfs4_unlock_state();
1221 cstate->replay_owner = NULL; 1249 cstate->replay_owner = NULL;
1222 } 1250 }
1223 /* XXX Ugh, we need to get rid of this kind of special case: */ 1251 /* XXX Ugh, we need to get rid of this kind of special case: */
@@ -1238,6 +1266,144 @@ out:
1238 return status; 1266 return status;
1239} 1267}
1240 1268
1269#define op_encode_hdr_size (2)
1270#define op_encode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE))
1271#define op_encode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE))
1272#define op_encode_change_info_maxsz (5)
1273#define nfs4_fattr_bitmap_maxsz (4)
1274
1275#define op_encode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
1276#define op_encode_lock_denied_maxsz (8 + op_encode_lockowner_maxsz)
1277
1278#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
1279
1280#define op_encode_ace_maxsz (3 + nfs4_owner_maxsz)
1281#define op_encode_delegation_maxsz (1 + op_encode_stateid_maxsz + 1 + \
1282 op_encode_ace_maxsz)
1283
1284#define op_encode_channel_attrs_maxsz (6 + 1 + 1)
1285
1286static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1287{
1288 return (op_encode_hdr_size) * sizeof(__be32);
1289}
1290
1291static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1292{
1293 return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32);
1294}
1295
1296static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1297{
1298 return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
1299}
1300
1301static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1302{
1303 return (op_encode_hdr_size + op_encode_change_info_maxsz
1304 + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
1305}
1306
1307static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1308{
1309 return (op_encode_hdr_size + op_encode_change_info_maxsz)
1310 * sizeof(__be32);
1311}
1312
1313static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1314{
1315 return (op_encode_hdr_size + op_encode_lock_denied_maxsz)
1316 * sizeof(__be32);
1317}
1318
1319static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1320{
1321 return (op_encode_hdr_size + op_encode_stateid_maxsz
1322 + op_encode_change_info_maxsz + 1
1323 + nfs4_fattr_bitmap_maxsz
1324 + op_encode_delegation_maxsz) * sizeof(__be32);
1325}
1326
1327static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1328{
1329 u32 maxcount = 0, rlen = 0;
1330
1331 maxcount = svc_max_payload(rqstp);
1332 rlen = op->u.read.rd_length;
1333
1334 if (rlen > maxcount)
1335 rlen = maxcount;
1336
1337 return (op_encode_hdr_size + 2) * sizeof(__be32) + rlen;
1338}
1339
1340static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1341{
1342 u32 rlen = op->u.readdir.rd_maxcount;
1343
1344 if (rlen > PAGE_SIZE)
1345 rlen = PAGE_SIZE;
1346
1347 return (op_encode_hdr_size + op_encode_verifier_maxsz)
1348 * sizeof(__be32) + rlen;
1349}
1350
1351static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1352{
1353 return (op_encode_hdr_size + op_encode_change_info_maxsz)
1354 * sizeof(__be32);
1355}
1356
1357static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1358{
1359 return (op_encode_hdr_size + op_encode_change_info_maxsz
1360 + op_encode_change_info_maxsz) * sizeof(__be32);
1361}
1362
1363static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1364{
1365 return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
1366}
1367
1368static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1369{
1370 return (op_encode_hdr_size + 2 + 1024) * sizeof(__be32);
1371}
1372
1373static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1374{
1375 return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
1376}
1377
1378static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1379{
1380 return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\
1381 1 + 1 + 0 + /* eir_flags, spr_how, SP4_NONE (for now) */\
1382 2 + /*eir_server_owner.so_minor_id */\
1383 /* eir_server_owner.so_major_id<> */\
1384 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\
1385 /* eir_server_scope<> */\
1386 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\
1387 1 + /* eir_server_impl_id array length */\
1388 0 /* ignored eir_server_impl_id contents */) * sizeof(__be32);
1389}
1390
1391static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1392{
1393 return (op_encode_hdr_size + \
1394 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */\
1395 2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32);
1396}
1397
1398static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1399{
1400 return (op_encode_hdr_size + \
1401 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */\
1402 2 + /* csr_sequence, csr_flags */\
1403 op_encode_channel_attrs_maxsz + \
1404 op_encode_channel_attrs_maxsz) * sizeof(__be32);
1405}
1406
1241static struct nfsd4_operation nfsd4_ops[] = { 1407static struct nfsd4_operation nfsd4_ops[] = {
1242 [OP_ACCESS] = { 1408 [OP_ACCESS] = {
1243 .op_func = (nfsd4op_func)nfsd4_access, 1409 .op_func = (nfsd4op_func)nfsd4_access,
@@ -1245,20 +1411,27 @@ static struct nfsd4_operation nfsd4_ops[] = {
1245 }, 1411 },
1246 [OP_CLOSE] = { 1412 [OP_CLOSE] = {
1247 .op_func = (nfsd4op_func)nfsd4_close, 1413 .op_func = (nfsd4op_func)nfsd4_close,
1414 .op_flags = OP_MODIFIES_SOMETHING,
1248 .op_name = "OP_CLOSE", 1415 .op_name = "OP_CLOSE",
1416 .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
1249 }, 1417 },
1250 [OP_COMMIT] = { 1418 [OP_COMMIT] = {
1251 .op_func = (nfsd4op_func)nfsd4_commit, 1419 .op_func = (nfsd4op_func)nfsd4_commit,
1420 .op_flags = OP_MODIFIES_SOMETHING,
1252 .op_name = "OP_COMMIT", 1421 .op_name = "OP_COMMIT",
1422 .op_rsize_bop = (nfsd4op_rsize)nfsd4_commit_rsize,
1253 }, 1423 },
1254 [OP_CREATE] = { 1424 [OP_CREATE] = {
1255 .op_func = (nfsd4op_func)nfsd4_create, 1425 .op_func = (nfsd4op_func)nfsd4_create,
1426 .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
1256 .op_name = "OP_CREATE", 1427 .op_name = "OP_CREATE",
1257 .op_cacheresult = true, 1428 .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_rsize,
1258 }, 1429 },
1259 [OP_DELEGRETURN] = { 1430 [OP_DELEGRETURN] = {
1260 .op_func = (nfsd4op_func)nfsd4_delegreturn, 1431 .op_func = (nfsd4op_func)nfsd4_delegreturn,
1432 .op_flags = OP_MODIFIES_SOMETHING,
1261 .op_name = "OP_DELEGRETURN", 1433 .op_name = "OP_DELEGRETURN",
1434 .op_rsize_bop = nfsd4_only_status_rsize,
1262 }, 1435 },
1263 [OP_GETATTR] = { 1436 [OP_GETATTR] = {
1264 .op_func = (nfsd4op_func)nfsd4_getattr, 1437 .op_func = (nfsd4op_func)nfsd4_getattr,
@@ -1271,12 +1444,16 @@ static struct nfsd4_operation nfsd4_ops[] = {
1271 }, 1444 },
1272 [OP_LINK] = { 1445 [OP_LINK] = {
1273 .op_func = (nfsd4op_func)nfsd4_link, 1446 .op_func = (nfsd4op_func)nfsd4_link,
1447 .op_flags = ALLOWED_ON_ABSENT_FS | OP_MODIFIES_SOMETHING
1448 | OP_CACHEME,
1274 .op_name = "OP_LINK", 1449 .op_name = "OP_LINK",
1275 .op_cacheresult = true, 1450 .op_rsize_bop = (nfsd4op_rsize)nfsd4_link_rsize,
1276 }, 1451 },
1277 [OP_LOCK] = { 1452 [OP_LOCK] = {
1278 .op_func = (nfsd4op_func)nfsd4_lock, 1453 .op_func = (nfsd4op_func)nfsd4_lock,
1454 .op_flags = OP_MODIFIES_SOMETHING,
1279 .op_name = "OP_LOCK", 1455 .op_name = "OP_LOCK",
1456 .op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize,
1280 }, 1457 },
1281 [OP_LOCKT] = { 1458 [OP_LOCKT] = {
1282 .op_func = (nfsd4op_func)nfsd4_lockt, 1459 .op_func = (nfsd4op_func)nfsd4_lockt,
@@ -1284,7 +1461,9 @@ static struct nfsd4_operation nfsd4_ops[] = {
1284 }, 1461 },
1285 [OP_LOCKU] = { 1462 [OP_LOCKU] = {
1286 .op_func = (nfsd4op_func)nfsd4_locku, 1463 .op_func = (nfsd4op_func)nfsd4_locku,
1464 .op_flags = OP_MODIFIES_SOMETHING,
1287 .op_name = "OP_LOCKU", 1465 .op_name = "OP_LOCKU",
1466 .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
1288 }, 1467 },
1289 [OP_LOOKUP] = { 1468 [OP_LOOKUP] = {
1290 .op_func = (nfsd4op_func)nfsd4_lookup, 1469 .op_func = (nfsd4op_func)nfsd4_lookup,
@@ -1302,42 +1481,54 @@ static struct nfsd4_operation nfsd4_ops[] = {
1302 }, 1481 },
1303 [OP_OPEN] = { 1482 [OP_OPEN] = {
1304 .op_func = (nfsd4op_func)nfsd4_open, 1483 .op_func = (nfsd4op_func)nfsd4_open,
1305 .op_flags = OP_HANDLES_WRONGSEC, 1484 .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING,
1306 .op_name = "OP_OPEN", 1485 .op_name = "OP_OPEN",
1486 .op_rsize_bop = (nfsd4op_rsize)nfsd4_open_rsize,
1307 }, 1487 },
1308 [OP_OPEN_CONFIRM] = { 1488 [OP_OPEN_CONFIRM] = {
1309 .op_func = (nfsd4op_func)nfsd4_open_confirm, 1489 .op_func = (nfsd4op_func)nfsd4_open_confirm,
1490 .op_flags = OP_MODIFIES_SOMETHING,
1310 .op_name = "OP_OPEN_CONFIRM", 1491 .op_name = "OP_OPEN_CONFIRM",
1492 .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
1311 }, 1493 },
1312 [OP_OPEN_DOWNGRADE] = { 1494 [OP_OPEN_DOWNGRADE] = {
1313 .op_func = (nfsd4op_func)nfsd4_open_downgrade, 1495 .op_func = (nfsd4op_func)nfsd4_open_downgrade,
1496 .op_flags = OP_MODIFIES_SOMETHING,
1314 .op_name = "OP_OPEN_DOWNGRADE", 1497 .op_name = "OP_OPEN_DOWNGRADE",
1498 .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
1315 }, 1499 },
1316 [OP_PUTFH] = { 1500 [OP_PUTFH] = {
1317 .op_func = (nfsd4op_func)nfsd4_putfh, 1501 .op_func = (nfsd4op_func)nfsd4_putfh,
1318 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS 1502 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
1319 | OP_IS_PUTFH_LIKE, 1503 | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
1320 .op_name = "OP_PUTFH", 1504 .op_name = "OP_PUTFH",
1505 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1321 }, 1506 },
1322 [OP_PUTPUBFH] = { 1507 [OP_PUTPUBFH] = {
1323 .op_func = (nfsd4op_func)nfsd4_putrootfh, 1508 .op_func = (nfsd4op_func)nfsd4_putrootfh,
1324 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS 1509 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
1325 | OP_IS_PUTFH_LIKE, 1510 | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
1326 .op_name = "OP_PUTPUBFH", 1511 .op_name = "OP_PUTPUBFH",
1512 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1327 }, 1513 },
1328 [OP_PUTROOTFH] = { 1514 [OP_PUTROOTFH] = {
1329 .op_func = (nfsd4op_func)nfsd4_putrootfh, 1515 .op_func = (nfsd4op_func)nfsd4_putrootfh,
1330 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS 1516 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
1331 | OP_IS_PUTFH_LIKE, 1517 | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
1332 .op_name = "OP_PUTROOTFH", 1518 .op_name = "OP_PUTROOTFH",
1519 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1333 }, 1520 },
1334 [OP_READ] = { 1521 [OP_READ] = {
1335 .op_func = (nfsd4op_func)nfsd4_read, 1522 .op_func = (nfsd4op_func)nfsd4_read,
1523 .op_flags = OP_MODIFIES_SOMETHING,
1336 .op_name = "OP_READ", 1524 .op_name = "OP_READ",
1525 .op_rsize_bop = (nfsd4op_rsize)nfsd4_read_rsize,
1337 }, 1526 },
1338 [OP_READDIR] = { 1527 [OP_READDIR] = {
1339 .op_func = (nfsd4op_func)nfsd4_readdir, 1528 .op_func = (nfsd4op_func)nfsd4_readdir,
1529 .op_flags = OP_MODIFIES_SOMETHING,
1340 .op_name = "OP_READDIR", 1530 .op_name = "OP_READDIR",
1531 .op_rsize_bop = (nfsd4op_rsize)nfsd4_readdir_rsize,
1341 }, 1532 },
1342 [OP_READLINK] = { 1533 [OP_READLINK] = {
1343 .op_func = (nfsd4op_func)nfsd4_readlink, 1534 .op_func = (nfsd4op_func)nfsd4_readlink,
@@ -1345,29 +1536,36 @@ static struct nfsd4_operation nfsd4_ops[] = {
1345 }, 1536 },
1346 [OP_REMOVE] = { 1537 [OP_REMOVE] = {
1347 .op_func = (nfsd4op_func)nfsd4_remove, 1538 .op_func = (nfsd4op_func)nfsd4_remove,
1539 .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
1348 .op_name = "OP_REMOVE", 1540 .op_name = "OP_REMOVE",
1349 .op_cacheresult = true, 1541 .op_rsize_bop = (nfsd4op_rsize)nfsd4_remove_rsize,
1350 }, 1542 },
1351 [OP_RENAME] = { 1543 [OP_RENAME] = {
1352 .op_name = "OP_RENAME",
1353 .op_func = (nfsd4op_func)nfsd4_rename, 1544 .op_func = (nfsd4op_func)nfsd4_rename,
1354 .op_cacheresult = true, 1545 .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
1546 .op_name = "OP_RENAME",
1547 .op_rsize_bop = (nfsd4op_rsize)nfsd4_rename_rsize,
1355 }, 1548 },
1356 [OP_RENEW] = { 1549 [OP_RENEW] = {
1357 .op_func = (nfsd4op_func)nfsd4_renew, 1550 .op_func = (nfsd4op_func)nfsd4_renew,
1358 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1551 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
1552 | OP_MODIFIES_SOMETHING,
1359 .op_name = "OP_RENEW", 1553 .op_name = "OP_RENEW",
1554 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1555
1360 }, 1556 },
1361 [OP_RESTOREFH] = { 1557 [OP_RESTOREFH] = {
1362 .op_func = (nfsd4op_func)nfsd4_restorefh, 1558 .op_func = (nfsd4op_func)nfsd4_restorefh,
1363 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS 1559 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
1364 | OP_IS_PUTFH_LIKE, 1560 | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
1365 .op_name = "OP_RESTOREFH", 1561 .op_name = "OP_RESTOREFH",
1562 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1366 }, 1563 },
1367 [OP_SAVEFH] = { 1564 [OP_SAVEFH] = {
1368 .op_func = (nfsd4op_func)nfsd4_savefh, 1565 .op_func = (nfsd4op_func)nfsd4_savefh,
1369 .op_flags = OP_HANDLES_WRONGSEC, 1566 .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING,
1370 .op_name = "OP_SAVEFH", 1567 .op_name = "OP_SAVEFH",
1568 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1371 }, 1569 },
1372 [OP_SECINFO] = { 1570 [OP_SECINFO] = {
1373 .op_func = (nfsd4op_func)nfsd4_secinfo, 1571 .op_func = (nfsd4op_func)nfsd4_secinfo,
@@ -1377,19 +1575,22 @@ static struct nfsd4_operation nfsd4_ops[] = {
1377 [OP_SETATTR] = { 1575 [OP_SETATTR] = {
1378 .op_func = (nfsd4op_func)nfsd4_setattr, 1576 .op_func = (nfsd4op_func)nfsd4_setattr,
1379 .op_name = "OP_SETATTR", 1577 .op_name = "OP_SETATTR",
1380 .op_cacheresult = true, 1578 .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
1579 .op_rsize_bop = (nfsd4op_rsize)nfsd4_setattr_rsize,
1381 }, 1580 },
1382 [OP_SETCLIENTID] = { 1581 [OP_SETCLIENTID] = {
1383 .op_func = (nfsd4op_func)nfsd4_setclientid, 1582 .op_func = (nfsd4op_func)nfsd4_setclientid,
1384 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1583 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
1584 | OP_MODIFIES_SOMETHING | OP_CACHEME,
1385 .op_name = "OP_SETCLIENTID", 1585 .op_name = "OP_SETCLIENTID",
1386 .op_cacheresult = true, 1586 .op_rsize_bop = (nfsd4op_rsize)nfsd4_setclientid_rsize,
1387 }, 1587 },
1388 [OP_SETCLIENTID_CONFIRM] = { 1588 [OP_SETCLIENTID_CONFIRM] = {
1389 .op_func = (nfsd4op_func)nfsd4_setclientid_confirm, 1589 .op_func = (nfsd4op_func)nfsd4_setclientid_confirm,
1390 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1590 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
1591 | OP_MODIFIES_SOMETHING | OP_CACHEME,
1391 .op_name = "OP_SETCLIENTID_CONFIRM", 1592 .op_name = "OP_SETCLIENTID_CONFIRM",
1392 .op_cacheresult = true, 1593 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1393 }, 1594 },
1394 [OP_VERIFY] = { 1595 [OP_VERIFY] = {
1395 .op_func = (nfsd4op_func)nfsd4_verify, 1596 .op_func = (nfsd4op_func)nfsd4_verify,
@@ -1397,35 +1598,46 @@ static struct nfsd4_operation nfsd4_ops[] = {
1397 }, 1598 },
1398 [OP_WRITE] = { 1599 [OP_WRITE] = {
1399 .op_func = (nfsd4op_func)nfsd4_write, 1600 .op_func = (nfsd4op_func)nfsd4_write,
1601 .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
1400 .op_name = "OP_WRITE", 1602 .op_name = "OP_WRITE",
1401 .op_cacheresult = true, 1603 .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
1402 }, 1604 },
1403 [OP_RELEASE_LOCKOWNER] = { 1605 [OP_RELEASE_LOCKOWNER] = {
1404 .op_func = (nfsd4op_func)nfsd4_release_lockowner, 1606 .op_func = (nfsd4op_func)nfsd4_release_lockowner,
1405 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1607 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
1608 | OP_MODIFIES_SOMETHING,
1406 .op_name = "OP_RELEASE_LOCKOWNER", 1609 .op_name = "OP_RELEASE_LOCKOWNER",
1610 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1407 }, 1611 },
1408 1612
1409 /* NFSv4.1 operations */ 1613 /* NFSv4.1 operations */
1410 [OP_EXCHANGE_ID] = { 1614 [OP_EXCHANGE_ID] = {
1411 .op_func = (nfsd4op_func)nfsd4_exchange_id, 1615 .op_func = (nfsd4op_func)nfsd4_exchange_id,
1412 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1616 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
1617 | OP_MODIFIES_SOMETHING,
1413 .op_name = "OP_EXCHANGE_ID", 1618 .op_name = "OP_EXCHANGE_ID",
1619 .op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize,
1414 }, 1620 },
1415 [OP_BIND_CONN_TO_SESSION] = { 1621 [OP_BIND_CONN_TO_SESSION] = {
1416 .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session, 1622 .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,
1417 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1623 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
1624 | OP_MODIFIES_SOMETHING,
1418 .op_name = "OP_BIND_CONN_TO_SESSION", 1625 .op_name = "OP_BIND_CONN_TO_SESSION",
1626 .op_rsize_bop = (nfsd4op_rsize)nfsd4_bind_conn_to_session_rsize,
1419 }, 1627 },
1420 [OP_CREATE_SESSION] = { 1628 [OP_CREATE_SESSION] = {
1421 .op_func = (nfsd4op_func)nfsd4_create_session, 1629 .op_func = (nfsd4op_func)nfsd4_create_session,
1422 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1630 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
1631 | OP_MODIFIES_SOMETHING,
1423 .op_name = "OP_CREATE_SESSION", 1632 .op_name = "OP_CREATE_SESSION",
1633 .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_session_rsize,
1424 }, 1634 },
1425 [OP_DESTROY_SESSION] = { 1635 [OP_DESTROY_SESSION] = {
1426 .op_func = (nfsd4op_func)nfsd4_destroy_session, 1636 .op_func = (nfsd4op_func)nfsd4_destroy_session,
1427 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1637 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
1638 | OP_MODIFIES_SOMETHING,
1428 .op_name = "OP_DESTROY_SESSION", 1639 .op_name = "OP_DESTROY_SESSION",
1640 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1429 }, 1641 },
1430 [OP_SEQUENCE] = { 1642 [OP_SEQUENCE] = {
1431 .op_func = (nfsd4op_func)nfsd4_sequence, 1643 .op_func = (nfsd4op_func)nfsd4_sequence,
@@ -1433,14 +1645,17 @@ static struct nfsd4_operation nfsd4_ops[] = {
1433 .op_name = "OP_SEQUENCE", 1645 .op_name = "OP_SEQUENCE",
1434 }, 1646 },
1435 [OP_DESTROY_CLIENTID] = { 1647 [OP_DESTROY_CLIENTID] = {
1436 .op_func = NULL, 1648 .op_func = (nfsd4op_func)nfsd4_destroy_clientid,
1437 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1649 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
1650 | OP_MODIFIES_SOMETHING,
1438 .op_name = "OP_DESTROY_CLIENTID", 1651 .op_name = "OP_DESTROY_CLIENTID",
1652 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1439 }, 1653 },
1440 [OP_RECLAIM_COMPLETE] = { 1654 [OP_RECLAIM_COMPLETE] = {
1441 .op_func = (nfsd4op_func)nfsd4_reclaim_complete, 1655 .op_func = (nfsd4op_func)nfsd4_reclaim_complete,
1442 .op_flags = ALLOWED_WITHOUT_FH, 1656 .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
1443 .op_name = "OP_RECLAIM_COMPLETE", 1657 .op_name = "OP_RECLAIM_COMPLETE",
1658 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1444 }, 1659 },
1445 [OP_SECINFO_NO_NAME] = { 1660 [OP_SECINFO_NO_NAME] = {
1446 .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, 1661 .op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
@@ -1454,8 +1669,9 @@ static struct nfsd4_operation nfsd4_ops[] = {
1454 }, 1669 },
1455 [OP_FREE_STATEID] = { 1670 [OP_FREE_STATEID] = {
1456 .op_func = (nfsd4op_func)nfsd4_free_stateid, 1671 .op_func = (nfsd4op_func)nfsd4_free_stateid,
1457 .op_flags = ALLOWED_WITHOUT_FH, 1672 .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
1458 .op_name = "OP_FREE_STATEID", 1673 .op_name = "OP_FREE_STATEID",
1674 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1459 }, 1675 },
1460}; 1676};
1461 1677
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 29d77f60585..ed083b9a731 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -45,6 +45,7 @@
45 45
46/* Globals */ 46/* Globals */
47static struct file *rec_file; 47static struct file *rec_file;
48static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
48 49
49static int 50static int
50nfs4_save_creds(const struct cred **original_creds) 51nfs4_save_creds(const struct cred **original_creds)
@@ -88,7 +89,7 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
88 struct xdr_netobj cksum; 89 struct xdr_netobj cksum;
89 struct hash_desc desc; 90 struct hash_desc desc;
90 struct scatterlist sg; 91 struct scatterlist sg;
91 __be32 status = nfserr_resource; 92 __be32 status = nfserr_jukebox;
92 93
93 dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", 94 dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
94 clname->len, clname->data); 95 clname->len, clname->data);
@@ -129,6 +130,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
129 if (!rec_file || clp->cl_firststate) 130 if (!rec_file || clp->cl_firststate)
130 return 0; 131 return 0;
131 132
133 clp->cl_firststate = 1;
132 status = nfs4_save_creds(&original_cred); 134 status = nfs4_save_creds(&original_cred);
133 if (status < 0) 135 if (status < 0)
134 return status; 136 return status;
@@ -143,10 +145,8 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
143 goto out_unlock; 145 goto out_unlock;
144 } 146 }
145 status = -EEXIST; 147 status = -EEXIST;
146 if (dentry->d_inode) { 148 if (dentry->d_inode)
147 dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
148 goto out_put; 149 goto out_put;
149 }
150 status = mnt_want_write(rec_file->f_path.mnt); 150 status = mnt_want_write(rec_file->f_path.mnt);
151 if (status) 151 if (status)
152 goto out_put; 152 goto out_put;
@@ -156,12 +156,14 @@ out_put:
156 dput(dentry); 156 dput(dentry);
157out_unlock: 157out_unlock:
158 mutex_unlock(&dir->d_inode->i_mutex); 158 mutex_unlock(&dir->d_inode->i_mutex);
159 if (status == 0) { 159 if (status == 0)
160 clp->cl_firststate = 1;
161 vfs_fsync(rec_file, 0); 160 vfs_fsync(rec_file, 0);
162 } 161 else
162 printk(KERN_ERR "NFSD: failed to write recovery record"
163 " (err %d); please check that %s exists"
164 " and is writeable", status,
165 user_recovery_dirname);
163 nfs4_reset_creds(original_cred); 166 nfs4_reset_creds(original_cred);
164 dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
165 return status; 167 return status;
166} 168}
167 169
@@ -354,13 +356,13 @@ nfsd4_recdir_load(void) {
354 */ 356 */
355 357
356void 358void
357nfsd4_init_recdir(char *rec_dirname) 359nfsd4_init_recdir()
358{ 360{
359 const struct cred *original_cred; 361 const struct cred *original_cred;
360 int status; 362 int status;
361 363
362 printk("NFSD: Using %s as the NFSv4 state recovery directory\n", 364 printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
363 rec_dirname); 365 user_recovery_dirname);
364 366
365 BUG_ON(rec_file); 367 BUG_ON(rec_file);
366 368
@@ -372,10 +374,10 @@ nfsd4_init_recdir(char *rec_dirname)
372 return; 374 return;
373 } 375 }
374 376
375 rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0); 377 rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0);
376 if (IS_ERR(rec_file)) { 378 if (IS_ERR(rec_file)) {
377 printk("NFSD: unable to find recovery directory %s\n", 379 printk("NFSD: unable to find recovery directory %s\n",
378 rec_dirname); 380 user_recovery_dirname);
379 rec_file = NULL; 381 rec_file = NULL;
380 } 382 }
381 383
@@ -390,3 +392,30 @@ nfsd4_shutdown_recdir(void)
390 fput(rec_file); 392 fput(rec_file);
391 rec_file = NULL; 393 rec_file = NULL;
392} 394}
395
396/*
397 * Change the NFSv4 recovery directory to recdir.
398 */
399int
400nfs4_reset_recoverydir(char *recdir)
401{
402 int status;
403 struct path path;
404
405 status = kern_path(recdir, LOOKUP_FOLLOW, &path);
406 if (status)
407 return status;
408 status = -ENOTDIR;
409 if (S_ISDIR(path.dentry->d_inode->i_mode)) {
410 strcpy(user_recovery_dirname, recdir);
411 status = 0;
412 }
413 path_put(&path);
414 return status;
415}
416
417char *
418nfs4_recoverydir(void)
419{
420 return user_recovery_dirname;
421}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3787ec11740..47e94e33a97 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -49,9 +49,6 @@
49time_t nfsd4_lease = 90; /* default lease time */ 49time_t nfsd4_lease = 90; /* default lease time */
50time_t nfsd4_grace = 90; 50time_t nfsd4_grace = 90;
51static time_t boot_time; 51static time_t boot_time;
52static u32 current_ownerid = 1;
53static u32 current_fileid = 1;
54static u32 current_delegid = 1;
55static stateid_t zerostateid; /* bits all 0 */ 52static stateid_t zerostateid; /* bits all 0 */
56static stateid_t onestateid; /* bits all 1 */ 53static stateid_t onestateid; /* bits all 1 */
57static u64 current_sessionid = 1; 54static u64 current_sessionid = 1;
@@ -60,13 +57,7 @@ static u64 current_sessionid = 1;
60#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) 57#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
61 58
62/* forward declarations */ 59/* forward declarations */
63static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); 60static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner);
64static struct nfs4_stateid * search_for_stateid(stateid_t *stid);
65static struct nfs4_delegation * search_for_delegation(stateid_t *stid);
66static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
67static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
68static void nfs4_set_recdir(char *recdir);
69static int check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner);
70 61
71/* Locking: */ 62/* Locking: */
72 63
@@ -80,7 +71,8 @@ static DEFINE_MUTEX(client_mutex);
80 */ 71 */
81static DEFINE_SPINLOCK(recall_lock); 72static DEFINE_SPINLOCK(recall_lock);
82 73
83static struct kmem_cache *stateowner_slab = NULL; 74static struct kmem_cache *openowner_slab = NULL;
75static struct kmem_cache *lockowner_slab = NULL;
84static struct kmem_cache *file_slab = NULL; 76static struct kmem_cache *file_slab = NULL;
85static struct kmem_cache *stateid_slab = NULL; 77static struct kmem_cache *stateid_slab = NULL;
86static struct kmem_cache *deleg_slab = NULL; 78static struct kmem_cache *deleg_slab = NULL;
@@ -112,6 +104,11 @@ opaque_hashval(const void *ptr, int nbytes)
112 104
113static struct list_head del_recall_lru; 105static struct list_head del_recall_lru;
114 106
107static void nfsd4_free_file(struct nfs4_file *f)
108{
109 kmem_cache_free(file_slab, f);
110}
111
115static inline void 112static inline void
116put_nfs4_file(struct nfs4_file *fi) 113put_nfs4_file(struct nfs4_file *fi)
117{ 114{
@@ -119,7 +116,7 @@ put_nfs4_file(struct nfs4_file *fi)
119 list_del(&fi->fi_hash); 116 list_del(&fi->fi_hash);
120 spin_unlock(&recall_lock); 117 spin_unlock(&recall_lock);
121 iput(fi->fi_inode); 118 iput(fi->fi_inode);
122 kmem_cache_free(file_slab, fi); 119 nfsd4_free_file(fi);
123 } 120 }
124} 121}
125 122
@@ -136,35 +133,33 @@ unsigned int max_delegations;
136 * Open owner state (share locks) 133 * Open owner state (share locks)
137 */ 134 */
138 135
139/* hash tables for nfs4_stateowner */ 136/* hash tables for open owners */
140#define OWNER_HASH_BITS 8 137#define OPEN_OWNER_HASH_BITS 8
141#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) 138#define OPEN_OWNER_HASH_SIZE (1 << OPEN_OWNER_HASH_BITS)
142#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) 139#define OPEN_OWNER_HASH_MASK (OPEN_OWNER_HASH_SIZE - 1)
143 140
144#define ownerid_hashval(id) \ 141static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
145 ((id) & OWNER_HASH_MASK) 142{
146#define ownerstr_hashval(clientid, ownername) \ 143 unsigned int ret;
147 (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK)
148 144
149static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; 145 ret = opaque_hashval(ownername->data, ownername->len);
150static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; 146 ret += clientid;
147 return ret & OPEN_OWNER_HASH_MASK;
148}
149
150static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE];
151 151
152/* hash table for nfs4_file */ 152/* hash table for nfs4_file */
153#define FILE_HASH_BITS 8 153#define FILE_HASH_BITS 8
154#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) 154#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
155 155
156/* hash table for (open)nfs4_stateid */ 156static unsigned int file_hashval(struct inode *ino)
157#define STATEID_HASH_BITS 10 157{
158#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) 158 /* XXX: why are we hashing on inode pointer, anyway? */
159#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) 159 return hash_ptr(ino, FILE_HASH_BITS);
160 160}
161#define file_hashval(x) \
162 hash_ptr(x, FILE_HASH_BITS)
163#define stateid_hashval(owner_id, file_id) \
164 (((owner_id) + (file_id)) & STATEID_HASH_MASK)
165 161
166static struct list_head file_hashtbl[FILE_HASH_SIZE]; 162static struct list_head file_hashtbl[FILE_HASH_SIZE];
167static struct list_head stateid_hashtbl[STATEID_HASH_SIZE];
168 163
169static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag) 164static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
170{ 165{
@@ -192,8 +187,15 @@ static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag)
192static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) 187static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
193{ 188{
194 if (atomic_dec_and_test(&fp->fi_access[oflag])) { 189 if (atomic_dec_and_test(&fp->fi_access[oflag])) {
195 nfs4_file_put_fd(fp, O_RDWR);
196 nfs4_file_put_fd(fp, oflag); 190 nfs4_file_put_fd(fp, oflag);
191 /*
192 * It's also safe to get rid of the RDWR open *if*
193 * we no longer have need of the other kind of access
194 * or if we already have the other kind of open:
195 */
196 if (fp->fi_fds[1-oflag]
197 || atomic_read(&fp->fi_access[1 - oflag]) == 0)
198 nfs4_file_put_fd(fp, O_RDWR);
197 } 199 }
198} 200}
199 201
@@ -206,8 +208,73 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag)
206 __nfs4_file_put_access(fp, oflag); 208 __nfs4_file_put_access(fp, oflag);
207} 209}
208 210
211static inline int get_new_stid(struct nfs4_stid *stid)
212{
213 static int min_stateid = 0;
214 struct idr *stateids = &stid->sc_client->cl_stateids;
215 int new_stid;
216 int error;
217
218 error = idr_get_new_above(stateids, stid, min_stateid, &new_stid);
219 /*
220 * Note: the necessary preallocation was done in
221 * nfs4_alloc_stateid(). The idr code caps the number of
222 * preallocations that can exist at a time, but the state lock
223 * prevents anyone from using ours before we get here:
224 */
225 BUG_ON(error);
226 /*
227 * It shouldn't be a problem to reuse an opaque stateid value.
228 * I don't think it is for 4.1. But with 4.0 I worry that, for
229 * example, a stray write retransmission could be accepted by
230 * the server when it should have been rejected. Therefore,
231 * adopt a trick from the sctp code to attempt to maximize the
232 * amount of time until an id is reused, by ensuring they always
233 * "increase" (mod INT_MAX):
234 */
235
236 min_stateid = new_stid+1;
237 if (min_stateid == INT_MAX)
238 min_stateid = 0;
239 return new_stid;
240}
241
242static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type)
243{
244 stateid_t *s = &stid->sc_stateid;
245 int new_id;
246
247 stid->sc_type = type;
248 stid->sc_client = cl;
249 s->si_opaque.so_clid = cl->cl_clientid;
250 new_id = get_new_stid(stid);
251 s->si_opaque.so_id = (u32)new_id;
252 /* Will be incremented before return to client: */
253 s->si_generation = 0;
254}
255
256static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab)
257{
258 struct idr *stateids = &cl->cl_stateids;
259
260 if (!idr_pre_get(stateids, GFP_KERNEL))
261 return NULL;
262 /*
263 * Note: if we fail here (or any time between now and the time
264 * we actually get the new idr), we won't need to undo the idr
265 * preallocation, since the idr code caps the number of
266 * preallocated entries.
267 */
268 return kmem_cache_alloc(slab, GFP_KERNEL);
269}
270
271static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
272{
273 return openlockstateid(nfs4_alloc_stid(clp, stateid_slab));
274}
275
209static struct nfs4_delegation * 276static struct nfs4_delegation *
210alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) 277alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type)
211{ 278{
212 struct nfs4_delegation *dp; 279 struct nfs4_delegation *dp;
213 struct nfs4_file *fp = stp->st_file; 280 struct nfs4_file *fp = stp->st_file;
@@ -224,21 +291,23 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
224 return NULL; 291 return NULL;
225 if (num_delegations > max_delegations) 292 if (num_delegations > max_delegations)
226 return NULL; 293 return NULL;
227 dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); 294 dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
228 if (dp == NULL) 295 if (dp == NULL)
229 return dp; 296 return dp;
297 init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID);
298 /*
299 * delegation seqid's are never incremented. The 4.1 special
300 * meaning of seqid 0 isn't meaningful, really, but let's avoid
301 * 0 anyway just for consistency and use 1:
302 */
303 dp->dl_stid.sc_stateid.si_generation = 1;
230 num_delegations++; 304 num_delegations++;
231 INIT_LIST_HEAD(&dp->dl_perfile); 305 INIT_LIST_HEAD(&dp->dl_perfile);
232 INIT_LIST_HEAD(&dp->dl_perclnt); 306 INIT_LIST_HEAD(&dp->dl_perclnt);
233 INIT_LIST_HEAD(&dp->dl_recall_lru); 307 INIT_LIST_HEAD(&dp->dl_recall_lru);
234 dp->dl_client = clp;
235 get_nfs4_file(fp); 308 get_nfs4_file(fp);
236 dp->dl_file = fp; 309 dp->dl_file = fp;
237 dp->dl_type = type; 310 dp->dl_type = type;
238 dp->dl_stateid.si_boot = boot_time;
239 dp->dl_stateid.si_stateownerid = current_delegid++;
240 dp->dl_stateid.si_fileid = 0;
241 dp->dl_stateid.si_generation = 0;
242 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle); 311 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
243 dp->dl_time = 0; 312 dp->dl_time = 0;
244 atomic_set(&dp->dl_count, 1); 313 atomic_set(&dp->dl_count, 1);
@@ -267,10 +336,18 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)
267 } 336 }
268} 337}
269 338
339static void unhash_stid(struct nfs4_stid *s)
340{
341 struct idr *stateids = &s->sc_client->cl_stateids;
342
343 idr_remove(stateids, s->sc_stateid.si_opaque.so_id);
344}
345
270/* Called under the state lock. */ 346/* Called under the state lock. */
271static void 347static void
272unhash_delegation(struct nfs4_delegation *dp) 348unhash_delegation(struct nfs4_delegation *dp)
273{ 349{
350 unhash_stid(&dp->dl_stid);
274 list_del_init(&dp->dl_perclnt); 351 list_del_init(&dp->dl_perclnt);
275 spin_lock(&recall_lock); 352 spin_lock(&recall_lock);
276 list_del_init(&dp->dl_perfile); 353 list_del_init(&dp->dl_perfile);
@@ -292,10 +369,16 @@ static DEFINE_SPINLOCK(client_lock);
292#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) 369#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
293#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) 370#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1)
294 371
295#define clientid_hashval(id) \ 372static unsigned int clientid_hashval(u32 id)
296 ((id) & CLIENT_HASH_MASK) 373{
297#define clientstr_hashval(name) \ 374 return id & CLIENT_HASH_MASK;
298 (opaque_hashval((name), 8) & CLIENT_HASH_MASK) 375}
376
377static unsigned int clientstr_hashval(const char *name)
378{
379 return opaque_hashval(name, 8) & CLIENT_HASH_MASK;
380}
381
299/* 382/*
300 * reclaim_str_hashtbl[] holds known client info from previous reset/reboot 383 * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
301 * used in reboot/reset lease grace period processing 384 * used in reboot/reset lease grace period processing
@@ -362,7 +445,7 @@ set_deny(unsigned int *deny, unsigned long bmap) {
362} 445}
363 446
364static int 447static int
365test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) { 448test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) {
366 unsigned int access, deny; 449 unsigned int access, deny;
367 450
368 set_access(&access, stp->st_access_bmap); 451 set_access(&access, stp->st_access_bmap);
@@ -385,14 +468,13 @@ static int nfs4_access_to_omode(u32 access)
385 BUG(); 468 BUG();
386} 469}
387 470
388static void unhash_generic_stateid(struct nfs4_stateid *stp) 471static void unhash_generic_stateid(struct nfs4_ol_stateid *stp)
389{ 472{
390 list_del(&stp->st_hash);
391 list_del(&stp->st_perfile); 473 list_del(&stp->st_perfile);
392 list_del(&stp->st_perstateowner); 474 list_del(&stp->st_perstateowner);
393} 475}
394 476
395static void free_generic_stateid(struct nfs4_stateid *stp) 477static void close_generic_stateid(struct nfs4_ol_stateid *stp)
396{ 478{
397 int i; 479 int i;
398 480
@@ -401,84 +483,106 @@ static void free_generic_stateid(struct nfs4_stateid *stp)
401 if (test_bit(i, &stp->st_access_bmap)) 483 if (test_bit(i, &stp->st_access_bmap))
402 nfs4_file_put_access(stp->st_file, 484 nfs4_file_put_access(stp->st_file,
403 nfs4_access_to_omode(i)); 485 nfs4_access_to_omode(i));
486 __clear_bit(i, &stp->st_access_bmap);
404 } 487 }
405 } 488 }
406 put_nfs4_file(stp->st_file); 489 put_nfs4_file(stp->st_file);
490 stp->st_file = NULL;
491}
492
493static void free_generic_stateid(struct nfs4_ol_stateid *stp)
494{
407 kmem_cache_free(stateid_slab, stp); 495 kmem_cache_free(stateid_slab, stp);
408} 496}
409 497
410static void release_lock_stateid(struct nfs4_stateid *stp) 498static void release_lock_stateid(struct nfs4_ol_stateid *stp)
411{ 499{
412 struct file *file; 500 struct file *file;
413 501
414 unhash_generic_stateid(stp); 502 unhash_generic_stateid(stp);
503 unhash_stid(&stp->st_stid);
415 file = find_any_file(stp->st_file); 504 file = find_any_file(stp->st_file);
416 if (file) 505 if (file)
417 locks_remove_posix(file, (fl_owner_t)stp->st_stateowner); 506 locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner));
507 close_generic_stateid(stp);
418 free_generic_stateid(stp); 508 free_generic_stateid(stp);
419} 509}
420 510
421static void unhash_lockowner(struct nfs4_stateowner *sop) 511static void unhash_lockowner(struct nfs4_lockowner *lo)
422{ 512{
423 struct nfs4_stateid *stp; 513 struct nfs4_ol_stateid *stp;
424 514
425 list_del(&sop->so_idhash); 515 list_del(&lo->lo_owner.so_strhash);
426 list_del(&sop->so_strhash); 516 list_del(&lo->lo_perstateid);
427 list_del(&sop->so_perstateid); 517 while (!list_empty(&lo->lo_owner.so_stateids)) {
428 while (!list_empty(&sop->so_stateids)) { 518 stp = list_first_entry(&lo->lo_owner.so_stateids,
429 stp = list_first_entry(&sop->so_stateids, 519 struct nfs4_ol_stateid, st_perstateowner);
430 struct nfs4_stateid, st_perstateowner);
431 release_lock_stateid(stp); 520 release_lock_stateid(stp);
432 } 521 }
433} 522}
434 523
435static void release_lockowner(struct nfs4_stateowner *sop) 524static void release_lockowner(struct nfs4_lockowner *lo)
436{ 525{
437 unhash_lockowner(sop); 526 unhash_lockowner(lo);
438 nfs4_put_stateowner(sop); 527 nfs4_free_lockowner(lo);
439} 528}
440 529
441static void 530static void
442release_stateid_lockowners(struct nfs4_stateid *open_stp) 531release_stateid_lockowners(struct nfs4_ol_stateid *open_stp)
443{ 532{
444 struct nfs4_stateowner *lock_sop; 533 struct nfs4_lockowner *lo;
445 534
446 while (!list_empty(&open_stp->st_lockowners)) { 535 while (!list_empty(&open_stp->st_lockowners)) {
447 lock_sop = list_entry(open_stp->st_lockowners.next, 536 lo = list_entry(open_stp->st_lockowners.next,
448 struct nfs4_stateowner, so_perstateid); 537 struct nfs4_lockowner, lo_perstateid);
449 /* list_del(&open_stp->st_lockowners); */ 538 release_lockowner(lo);
450 BUG_ON(lock_sop->so_is_open_owner);
451 release_lockowner(lock_sop);
452 } 539 }
453} 540}
454 541
455static void release_open_stateid(struct nfs4_stateid *stp) 542static void unhash_open_stateid(struct nfs4_ol_stateid *stp)
456{ 543{
457 unhash_generic_stateid(stp); 544 unhash_generic_stateid(stp);
458 release_stateid_lockowners(stp); 545 release_stateid_lockowners(stp);
546 close_generic_stateid(stp);
547}
548
549static void release_open_stateid(struct nfs4_ol_stateid *stp)
550{
551 unhash_open_stateid(stp);
552 unhash_stid(&stp->st_stid);
459 free_generic_stateid(stp); 553 free_generic_stateid(stp);
460} 554}
461 555
462static void unhash_openowner(struct nfs4_stateowner *sop) 556static void unhash_openowner(struct nfs4_openowner *oo)
463{ 557{
464 struct nfs4_stateid *stp; 558 struct nfs4_ol_stateid *stp;
465 559
466 list_del(&sop->so_idhash); 560 list_del(&oo->oo_owner.so_strhash);
467 list_del(&sop->so_strhash); 561 list_del(&oo->oo_perclient);
468 list_del(&sop->so_perclient); 562 while (!list_empty(&oo->oo_owner.so_stateids)) {
469 list_del(&sop->so_perstateid); /* XXX: necessary? */ 563 stp = list_first_entry(&oo->oo_owner.so_stateids,
470 while (!list_empty(&sop->so_stateids)) { 564 struct nfs4_ol_stateid, st_perstateowner);
471 stp = list_first_entry(&sop->so_stateids,
472 struct nfs4_stateid, st_perstateowner);
473 release_open_stateid(stp); 565 release_open_stateid(stp);
474 } 566 }
475} 567}
476 568
477static void release_openowner(struct nfs4_stateowner *sop) 569static void release_last_closed_stateid(struct nfs4_openowner *oo)
478{ 570{
479 unhash_openowner(sop); 571 struct nfs4_ol_stateid *s = oo->oo_last_closed_stid;
480 list_del(&sop->so_close_lru); 572
481 nfs4_put_stateowner(sop); 573 if (s) {
574 unhash_stid(&s->st_stid);
575 free_generic_stateid(s);
576 oo->oo_last_closed_stid = NULL;
577 }
578}
579
580static void release_openowner(struct nfs4_openowner *oo)
581{
582 unhash_openowner(oo);
583 list_del(&oo->oo_close_lru);
584 release_last_closed_stateid(oo);
585 nfs4_free_openowner(oo);
482} 586}
483 587
484#define SESSION_HASH_SIZE 512 588#define SESSION_HASH_SIZE 512
@@ -843,9 +947,6 @@ renew_client_locked(struct nfs4_client *clp)
843 return; 947 return;
844 } 948 }
845 949
846 /*
847 * Move client to the end to the LRU list.
848 */
849 dprintk("renewing client (clientid %08x/%08x)\n", 950 dprintk("renewing client (clientid %08x/%08x)\n",
850 clp->cl_clientid.cl_boot, 951 clp->cl_clientid.cl_boot,
851 clp->cl_clientid.cl_id); 952 clp->cl_clientid.cl_id);
@@ -943,7 +1044,7 @@ unhash_client_locked(struct nfs4_client *clp)
943static void 1044static void
944expire_client(struct nfs4_client *clp) 1045expire_client(struct nfs4_client *clp)
945{ 1046{
946 struct nfs4_stateowner *sop; 1047 struct nfs4_openowner *oo;
947 struct nfs4_delegation *dp; 1048 struct nfs4_delegation *dp;
948 struct list_head reaplist; 1049 struct list_head reaplist;
949 1050
@@ -961,8 +1062,8 @@ expire_client(struct nfs4_client *clp)
961 unhash_delegation(dp); 1062 unhash_delegation(dp);
962 } 1063 }
963 while (!list_empty(&clp->cl_openowners)) { 1064 while (!list_empty(&clp->cl_openowners)) {
964 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); 1065 oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient);
965 release_openowner(sop); 1066 release_openowner(oo);
966 } 1067 }
967 nfsd4_shutdown_callback(clp); 1068 nfsd4_shutdown_callback(clp);
968 if (clp->cl_cb_conn.cb_xprt) 1069 if (clp->cl_cb_conn.cb_xprt)
@@ -1038,6 +1139,23 @@ static void gen_confirm(struct nfs4_client *clp)
1038 *p++ = i++; 1139 *p++ = i++;
1039} 1140}
1040 1141
1142static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t)
1143{
1144 return idr_find(&cl->cl_stateids, t->si_opaque.so_id);
1145}
1146
1147static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask)
1148{
1149 struct nfs4_stid *s;
1150
1151 s = find_stateid(cl, t);
1152 if (!s)
1153 return NULL;
1154 if (typemask & s->sc_type)
1155 return s;
1156 return NULL;
1157}
1158
1041static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, 1159static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
1042 struct svc_rqst *rqstp, nfs4_verifier *verf) 1160 struct svc_rqst *rqstp, nfs4_verifier *verf)
1043{ 1161{
@@ -1060,6 +1178,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
1060 } 1178 }
1061 } 1179 }
1062 1180
1181 idr_init(&clp->cl_stateids);
1063 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); 1182 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
1064 atomic_set(&clp->cl_refcount, 0); 1183 atomic_set(&clp->cl_refcount, 0);
1065 clp->cl_cb_state = NFSD4_CB_UNKNOWN; 1184 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
@@ -1083,17 +1202,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
1083 return clp; 1202 return clp;
1084} 1203}
1085 1204
1086static int check_name(struct xdr_netobj name)
1087{
1088 if (name.len == 0)
1089 return 0;
1090 if (name.len > NFS4_OPAQUE_LIMIT) {
1091 dprintk("NFSD: check_name: name too long(%d)!\n", name.len);
1092 return 0;
1093 }
1094 return 1;
1095}
1096
1097static void 1205static void
1098add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval) 1206add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval)
1099{ 1207{
@@ -1125,8 +1233,10 @@ find_confirmed_client(clientid_t *clid)
1125 unsigned int idhashval = clientid_hashval(clid->cl_id); 1233 unsigned int idhashval = clientid_hashval(clid->cl_id);
1126 1234
1127 list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) { 1235 list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) {
1128 if (same_clid(&clp->cl_clientid, clid)) 1236 if (same_clid(&clp->cl_clientid, clid)) {
1237 renew_client(clp);
1129 return clp; 1238 return clp;
1239 }
1130 } 1240 }
1131 return NULL; 1241 return NULL;
1132} 1242}
@@ -1173,20 +1283,6 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
1173 return NULL; 1283 return NULL;
1174} 1284}
1175 1285
1176static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr)
1177{
1178 switch (family) {
1179 case AF_INET:
1180 ((struct sockaddr_in *)sa)->sin_family = AF_INET;
1181 ((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr;
1182 return;
1183 case AF_INET6:
1184 ((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6;
1185 ((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6;
1186 return;
1187 }
1188}
1189
1190static void 1286static void
1191gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp) 1287gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp)
1192{ 1288{
@@ -1218,7 +1314,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r
1218 1314
1219 conn->cb_prog = se->se_callback_prog; 1315 conn->cb_prog = se->se_callback_prog;
1220 conn->cb_ident = se->se_callback_ident; 1316 conn->cb_ident = se->se_callback_ident;
1221 rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr); 1317 memcpy(&conn->cb_saddr, &rqstp->rq_daddr, rqstp->rq_daddrlen);
1222 return; 1318 return;
1223out_err: 1319out_err:
1224 conn->cb_addr.ss_family = AF_UNSPEC; 1320 conn->cb_addr.ss_family = AF_UNSPEC;
@@ -1350,7 +1446,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1350 __func__, rqstp, exid, exid->clname.len, exid->clname.data, 1446 __func__, rqstp, exid, exid->clname.len, exid->clname.data,
1351 addr_str, exid->flags, exid->spa_how); 1447 addr_str, exid->flags, exid->spa_how);
1352 1448
1353 if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A)) 1449 if (exid->flags & ~EXCHGID4_FLAG_MASK_A)
1354 return nfserr_inval; 1450 return nfserr_inval;
1355 1451
1356 /* Currently only support SP4_NONE */ 1452 /* Currently only support SP4_NONE */
@@ -1849,8 +1945,16 @@ out:
1849 1945
1850 nfsd4_get_session(cstate->session); 1946 nfsd4_get_session(cstate->session);
1851 atomic_inc(&clp->cl_refcount); 1947 atomic_inc(&clp->cl_refcount);
1852 if (clp->cl_cb_state == NFSD4_CB_DOWN) 1948 switch (clp->cl_cb_state) {
1853 seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN; 1949 case NFSD4_CB_DOWN:
1950 seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;
1951 break;
1952 case NFSD4_CB_FAULT:
1953 seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT;
1954 break;
1955 default:
1956 seq->status_flags = 0;
1957 }
1854 } 1958 }
1855 kfree(conn); 1959 kfree(conn);
1856 spin_unlock(&client_lock); 1960 spin_unlock(&client_lock);
@@ -1858,6 +1962,50 @@ out:
1858 return status; 1962 return status;
1859} 1963}
1860 1964
1965static inline bool has_resources(struct nfs4_client *clp)
1966{
1967 return !list_empty(&clp->cl_openowners)
1968 || !list_empty(&clp->cl_delegations)
1969 || !list_empty(&clp->cl_sessions);
1970}
1971
1972__be32
1973nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc)
1974{
1975 struct nfs4_client *conf, *unconf, *clp;
1976 int status = 0;
1977
1978 nfs4_lock_state();
1979 unconf = find_unconfirmed_client(&dc->clientid);
1980 conf = find_confirmed_client(&dc->clientid);
1981
1982 if (conf) {
1983 clp = conf;
1984
1985 if (!is_client_expired(conf) && has_resources(conf)) {
1986 status = nfserr_clientid_busy;
1987 goto out;
1988 }
1989
1990 /* rfc5661 18.50.3 */
1991 if (cstate->session && conf == cstate->session->se_client) {
1992 status = nfserr_clientid_busy;
1993 goto out;
1994 }
1995 } else if (unconf)
1996 clp = unconf;
1997 else {
1998 status = nfserr_stale_clientid;
1999 goto out;
2000 }
2001
2002 expire_client(clp);
2003out:
2004 nfs4_unlock_state();
2005 dprintk("%s return %d\n", __func__, ntohl(status));
2006 return status;
2007}
2008
1861__be32 2009__be32
1862nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) 2010nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
1863{ 2011{
@@ -1900,19 +2048,13 @@ __be32
1900nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 2048nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1901 struct nfsd4_setclientid *setclid) 2049 struct nfsd4_setclientid *setclid)
1902{ 2050{
1903 struct xdr_netobj clname = { 2051 struct xdr_netobj clname = setclid->se_name;
1904 .len = setclid->se_namelen,
1905 .data = setclid->se_name,
1906 };
1907 nfs4_verifier clverifier = setclid->se_verf; 2052 nfs4_verifier clverifier = setclid->se_verf;
1908 unsigned int strhashval; 2053 unsigned int strhashval;
1909 struct nfs4_client *conf, *unconf, *new; 2054 struct nfs4_client *conf, *unconf, *new;
1910 __be32 status; 2055 __be32 status;
1911 char dname[HEXDIR_LEN]; 2056 char dname[HEXDIR_LEN];
1912 2057
1913 if (!check_name(clname))
1914 return nfserr_inval;
1915
1916 status = nfs4_make_rec_clidname(dname, &clname); 2058 status = nfs4_make_rec_clidname(dname, &clname);
1917 if (status) 2059 if (status)
1918 return status; 2060 return status;
@@ -1946,7 +2088,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1946 * of 5 bullet points, labeled as CASE0 - CASE4 below. 2088 * of 5 bullet points, labeled as CASE0 - CASE4 below.
1947 */ 2089 */
1948 unconf = find_unconfirmed_client_by_str(dname, strhashval); 2090 unconf = find_unconfirmed_client_by_str(dname, strhashval);
1949 status = nfserr_resource; 2091 status = nfserr_jukebox;
1950 if (!conf) { 2092 if (!conf) {
1951 /* 2093 /*
1952 * RFC 3530 14.2.33 CASE 4: 2094 * RFC 3530 14.2.33 CASE 4:
@@ -2116,31 +2258,28 @@ out:
2116 return status; 2258 return status;
2117} 2259}
2118 2260
2261static struct nfs4_file *nfsd4_alloc_file(void)
2262{
2263 return kmem_cache_alloc(file_slab, GFP_KERNEL);
2264}
2265
2119/* OPEN Share state helper functions */ 2266/* OPEN Share state helper functions */
2120static inline struct nfs4_file * 2267static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino)
2121alloc_init_file(struct inode *ino)
2122{ 2268{
2123 struct nfs4_file *fp;
2124 unsigned int hashval = file_hashval(ino); 2269 unsigned int hashval = file_hashval(ino);
2125 2270
2126 fp = kmem_cache_alloc(file_slab, GFP_KERNEL); 2271 atomic_set(&fp->fi_ref, 1);
2127 if (fp) { 2272 INIT_LIST_HEAD(&fp->fi_hash);
2128 atomic_set(&fp->fi_ref, 1); 2273 INIT_LIST_HEAD(&fp->fi_stateids);
2129 INIT_LIST_HEAD(&fp->fi_hash); 2274 INIT_LIST_HEAD(&fp->fi_delegations);
2130 INIT_LIST_HEAD(&fp->fi_stateids); 2275 fp->fi_inode = igrab(ino);
2131 INIT_LIST_HEAD(&fp->fi_delegations); 2276 fp->fi_had_conflict = false;
2132 fp->fi_inode = igrab(ino); 2277 fp->fi_lease = NULL;
2133 fp->fi_id = current_fileid++; 2278 memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
2134 fp->fi_had_conflict = false; 2279 memset(fp->fi_access, 0, sizeof(fp->fi_access));
2135 fp->fi_lease = NULL; 2280 spin_lock(&recall_lock);
2136 memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); 2281 list_add(&fp->fi_hash, &file_hashtbl[hashval]);
2137 memset(fp->fi_access, 0, sizeof(fp->fi_access)); 2282 spin_unlock(&recall_lock);
2138 spin_lock(&recall_lock);
2139 list_add(&fp->fi_hash, &file_hashtbl[hashval]);
2140 spin_unlock(&recall_lock);
2141 return fp;
2142 }
2143 return NULL;
2144} 2283}
2145 2284
2146static void 2285static void
@@ -2155,7 +2294,8 @@ nfsd4_free_slab(struct kmem_cache **slab)
2155void 2294void
2156nfsd4_free_slabs(void) 2295nfsd4_free_slabs(void)
2157{ 2296{
2158 nfsd4_free_slab(&stateowner_slab); 2297 nfsd4_free_slab(&openowner_slab);
2298 nfsd4_free_slab(&lockowner_slab);
2159 nfsd4_free_slab(&file_slab); 2299 nfsd4_free_slab(&file_slab);
2160 nfsd4_free_slab(&stateid_slab); 2300 nfsd4_free_slab(&stateid_slab);
2161 nfsd4_free_slab(&deleg_slab); 2301 nfsd4_free_slab(&deleg_slab);
@@ -2164,16 +2304,20 @@ nfsd4_free_slabs(void)
2164static int 2304static int
2165nfsd4_init_slabs(void) 2305nfsd4_init_slabs(void)
2166{ 2306{
2167 stateowner_slab = kmem_cache_create("nfsd4_stateowners", 2307 openowner_slab = kmem_cache_create("nfsd4_openowners",
2168 sizeof(struct nfs4_stateowner), 0, 0, NULL); 2308 sizeof(struct nfs4_openowner), 0, 0, NULL);
2169 if (stateowner_slab == NULL) 2309 if (openowner_slab == NULL)
2310 goto out_nomem;
2311 lockowner_slab = kmem_cache_create("nfsd4_lockowners",
2312 sizeof(struct nfs4_openowner), 0, 0, NULL);
2313 if (lockowner_slab == NULL)
2170 goto out_nomem; 2314 goto out_nomem;
2171 file_slab = kmem_cache_create("nfsd4_files", 2315 file_slab = kmem_cache_create("nfsd4_files",
2172 sizeof(struct nfs4_file), 0, 0, NULL); 2316 sizeof(struct nfs4_file), 0, 0, NULL);
2173 if (file_slab == NULL) 2317 if (file_slab == NULL)
2174 goto out_nomem; 2318 goto out_nomem;
2175 stateid_slab = kmem_cache_create("nfsd4_stateids", 2319 stateid_slab = kmem_cache_create("nfsd4_stateids",
2176 sizeof(struct nfs4_stateid), 0, 0, NULL); 2320 sizeof(struct nfs4_ol_stateid), 0, 0, NULL);
2177 if (stateid_slab == NULL) 2321 if (stateid_slab == NULL)
2178 goto out_nomem; 2322 goto out_nomem;
2179 deleg_slab = kmem_cache_create("nfsd4_delegations", 2323 deleg_slab = kmem_cache_create("nfsd4_delegations",
@@ -2187,97 +2331,94 @@ out_nomem:
2187 return -ENOMEM; 2331 return -ENOMEM;
2188} 2332}
2189 2333
2190void 2334void nfs4_free_openowner(struct nfs4_openowner *oo)
2191nfs4_free_stateowner(struct kref *kref)
2192{ 2335{
2193 struct nfs4_stateowner *sop = 2336 kfree(oo->oo_owner.so_owner.data);
2194 container_of(kref, struct nfs4_stateowner, so_ref); 2337 kmem_cache_free(openowner_slab, oo);
2195 kfree(sop->so_owner.data);
2196 kmem_cache_free(stateowner_slab, sop);
2197} 2338}
2198 2339
2199static inline struct nfs4_stateowner * 2340void nfs4_free_lockowner(struct nfs4_lockowner *lo)
2200alloc_stateowner(struct xdr_netobj *owner)
2201{ 2341{
2202 struct nfs4_stateowner *sop; 2342 kfree(lo->lo_owner.so_owner.data);
2343 kmem_cache_free(lockowner_slab, lo);
2344}
2203 2345
2204 if ((sop = kmem_cache_alloc(stateowner_slab, GFP_KERNEL))) { 2346static void init_nfs4_replay(struct nfs4_replay *rp)
2205 if ((sop->so_owner.data = kmalloc(owner->len, GFP_KERNEL))) { 2347{
2206 memcpy(sop->so_owner.data, owner->data, owner->len); 2348 rp->rp_status = nfserr_serverfault;
2207 sop->so_owner.len = owner->len; 2349 rp->rp_buflen = 0;
2208 kref_init(&sop->so_ref); 2350 rp->rp_buf = rp->rp_ibuf;
2209 return sop;
2210 }
2211 kmem_cache_free(stateowner_slab, sop);
2212 }
2213 return NULL;
2214} 2351}
2215 2352
2216static struct nfs4_stateowner * 2353static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp)
2217alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) { 2354{
2218 struct nfs4_stateowner *sop; 2355 struct nfs4_stateowner *sop;
2219 struct nfs4_replay *rp;
2220 unsigned int idhashval;
2221 2356
2222 if (!(sop = alloc_stateowner(&open->op_owner))) 2357 sop = kmem_cache_alloc(slab, GFP_KERNEL);
2358 if (!sop)
2359 return NULL;
2360
2361 sop->so_owner.data = kmemdup(owner->data, owner->len, GFP_KERNEL);
2362 if (!sop->so_owner.data) {
2363 kmem_cache_free(slab, sop);
2223 return NULL; 2364 return NULL;
2224 idhashval = ownerid_hashval(current_ownerid); 2365 }
2225 INIT_LIST_HEAD(&sop->so_idhash); 2366 sop->so_owner.len = owner->len;
2226 INIT_LIST_HEAD(&sop->so_strhash); 2367
2227 INIT_LIST_HEAD(&sop->so_perclient);
2228 INIT_LIST_HEAD(&sop->so_stateids); 2368 INIT_LIST_HEAD(&sop->so_stateids);
2229 INIT_LIST_HEAD(&sop->so_perstateid); /* not used */
2230 INIT_LIST_HEAD(&sop->so_close_lru);
2231 sop->so_time = 0;
2232 list_add(&sop->so_idhash, &ownerid_hashtbl[idhashval]);
2233 list_add(&sop->so_strhash, &ownerstr_hashtbl[strhashval]);
2234 list_add(&sop->so_perclient, &clp->cl_openowners);
2235 sop->so_is_open_owner = 1;
2236 sop->so_id = current_ownerid++;
2237 sop->so_client = clp; 2369 sop->so_client = clp;
2238 sop->so_seqid = open->op_seqid; 2370 init_nfs4_replay(&sop->so_replay);
2239 sop->so_confirmed = 0;
2240 rp = &sop->so_replay;
2241 rp->rp_status = nfserr_serverfault;
2242 rp->rp_buflen = 0;
2243 rp->rp_buf = rp->rp_ibuf;
2244 return sop; 2371 return sop;
2245} 2372}
2246 2373
2247static inline void 2374static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)
2248init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { 2375{
2249 struct nfs4_stateowner *sop = open->op_stateowner; 2376 list_add(&oo->oo_owner.so_strhash, &open_ownerstr_hashtbl[strhashval]);
2250 unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id); 2377 list_add(&oo->oo_perclient, &clp->cl_openowners);
2378}
2251 2379
2252 INIT_LIST_HEAD(&stp->st_hash); 2380static struct nfs4_openowner *
2253 INIT_LIST_HEAD(&stp->st_perstateowner); 2381alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) {
2382 struct nfs4_openowner *oo;
2383
2384 oo = alloc_stateowner(openowner_slab, &open->op_owner, clp);
2385 if (!oo)
2386 return NULL;
2387 oo->oo_owner.so_is_open_owner = 1;
2388 oo->oo_owner.so_seqid = open->op_seqid;
2389 oo->oo_flags = NFS4_OO_NEW;
2390 oo->oo_time = 0;
2391 oo->oo_last_closed_stid = NULL;
2392 INIT_LIST_HEAD(&oo->oo_close_lru);
2393 hash_openowner(oo, clp, strhashval);
2394 return oo;
2395}
2396
2397static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
2398 struct nfs4_openowner *oo = open->op_openowner;
2399 struct nfs4_client *clp = oo->oo_owner.so_client;
2400
2401 init_stid(&stp->st_stid, clp, NFS4_OPEN_STID);
2254 INIT_LIST_HEAD(&stp->st_lockowners); 2402 INIT_LIST_HEAD(&stp->st_lockowners);
2255 INIT_LIST_HEAD(&stp->st_perfile); 2403 list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
2256 list_add(&stp->st_hash, &stateid_hashtbl[hashval]);
2257 list_add(&stp->st_perstateowner, &sop->so_stateids);
2258 list_add(&stp->st_perfile, &fp->fi_stateids); 2404 list_add(&stp->st_perfile, &fp->fi_stateids);
2259 stp->st_stateowner = sop; 2405 stp->st_stateowner = &oo->oo_owner;
2260 get_nfs4_file(fp); 2406 get_nfs4_file(fp);
2261 stp->st_file = fp; 2407 stp->st_file = fp;
2262 stp->st_stateid.si_boot = boot_time;
2263 stp->st_stateid.si_stateownerid = sop->so_id;
2264 stp->st_stateid.si_fileid = fp->fi_id;
2265 stp->st_stateid.si_generation = 0;
2266 stp->st_access_bmap = 0; 2408 stp->st_access_bmap = 0;
2267 stp->st_deny_bmap = 0; 2409 stp->st_deny_bmap = 0;
2268 __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK, 2410 __set_bit(open->op_share_access, &stp->st_access_bmap);
2269 &stp->st_access_bmap);
2270 __set_bit(open->op_share_deny, &stp->st_deny_bmap); 2411 __set_bit(open->op_share_deny, &stp->st_deny_bmap);
2271 stp->st_openstp = NULL; 2412 stp->st_openstp = NULL;
2272} 2413}
2273 2414
2274static void 2415static void
2275move_to_close_lru(struct nfs4_stateowner *sop) 2416move_to_close_lru(struct nfs4_openowner *oo)
2276{ 2417{
2277 dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); 2418 dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo);
2278 2419
2279 list_move_tail(&sop->so_close_lru, &close_lru); 2420 list_move_tail(&oo->oo_close_lru, &close_lru);
2280 sop->so_time = get_seconds(); 2421 oo->oo_time = get_seconds();
2281} 2422}
2282 2423
2283static int 2424static int
@@ -2289,14 +2430,18 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner,
2289 (sop->so_client->cl_clientid.cl_id == clid->cl_id); 2430 (sop->so_client->cl_clientid.cl_id == clid->cl_id);
2290} 2431}
2291 2432
2292static struct nfs4_stateowner * 2433static struct nfs4_openowner *
2293find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) 2434find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open)
2294{ 2435{
2295 struct nfs4_stateowner *so = NULL; 2436 struct nfs4_stateowner *so;
2437 struct nfs4_openowner *oo;
2296 2438
2297 list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { 2439 list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) {
2298 if (same_owner_str(so, &open->op_owner, &open->op_clientid)) 2440 if (same_owner_str(so, &open->op_owner, &open->op_clientid)) {
2299 return so; 2441 oo = openowner(so);
2442 renew_client(oo->oo_owner.so_client);
2443 return oo;
2444 }
2300 } 2445 }
2301 return NULL; 2446 return NULL;
2302} 2447}
@@ -2320,31 +2465,6 @@ find_file(struct inode *ino)
2320 return NULL; 2465 return NULL;
2321} 2466}
2322 2467
2323static inline int access_valid(u32 x, u32 minorversion)
2324{
2325 if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
2326 return 0;
2327 if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH)
2328 return 0;
2329 x &= ~NFS4_SHARE_ACCESS_MASK;
2330 if (minorversion && x) {
2331 if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL)
2332 return 0;
2333 if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED)
2334 return 0;
2335 x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK);
2336 }
2337 if (x)
2338 return 0;
2339 return 1;
2340}
2341
2342static inline int deny_valid(u32 x)
2343{
2344 /* Note: unlike access bits, deny bits may be zero. */
2345 return x <= NFS4_SHARE_DENY_BOTH;
2346}
2347
2348/* 2468/*
2349 * Called to check deny when READ with all zero stateid or 2469 * Called to check deny when READ with all zero stateid or
2350 * WRITE with all zero or all one stateid 2470 * WRITE with all zero or all one stateid
@@ -2354,7 +2474,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
2354{ 2474{
2355 struct inode *ino = current_fh->fh_dentry->d_inode; 2475 struct inode *ino = current_fh->fh_dentry->d_inode;
2356 struct nfs4_file *fp; 2476 struct nfs4_file *fp;
2357 struct nfs4_stateid *stp; 2477 struct nfs4_ol_stateid *stp;
2358 __be32 ret; 2478 __be32 ret;
2359 2479
2360 dprintk("NFSD: nfs4_share_conflict\n"); 2480 dprintk("NFSD: nfs4_share_conflict\n");
@@ -2429,6 +2549,16 @@ static const struct lock_manager_operations nfsd_lease_mng_ops = {
2429 .lm_change = nfsd_change_deleg_cb, 2549 .lm_change = nfsd_change_deleg_cb,
2430}; 2550};
2431 2551
2552static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4_stateowner *so, u32 seqid)
2553{
2554 if (nfsd4_has_session(cstate))
2555 return nfs_ok;
2556 if (seqid == so->so_seqid - 1)
2557 return nfserr_replay_me;
2558 if (seqid == so->so_seqid)
2559 return nfs_ok;
2560 return nfserr_bad_seqid;
2561}
2432 2562
2433__be32 2563__be32
2434nfsd4_process_open1(struct nfsd4_compound_state *cstate, 2564nfsd4_process_open1(struct nfsd4_compound_state *cstate,
@@ -2437,57 +2567,49 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2437 clientid_t *clientid = &open->op_clientid; 2567 clientid_t *clientid = &open->op_clientid;
2438 struct nfs4_client *clp = NULL; 2568 struct nfs4_client *clp = NULL;
2439 unsigned int strhashval; 2569 unsigned int strhashval;
2440 struct nfs4_stateowner *sop = NULL; 2570 struct nfs4_openowner *oo = NULL;
2441 2571 __be32 status;
2442 if (!check_name(open->op_owner))
2443 return nfserr_inval;
2444 2572
2445 if (STALE_CLIENTID(&open->op_clientid)) 2573 if (STALE_CLIENTID(&open->op_clientid))
2446 return nfserr_stale_clientid; 2574 return nfserr_stale_clientid;
2575 /*
2576 * In case we need it later, after we've already created the
2577 * file and don't want to risk a further failure:
2578 */
2579 open->op_file = nfsd4_alloc_file();
2580 if (open->op_file == NULL)
2581 return nfserr_jukebox;
2447 2582
2448 strhashval = ownerstr_hashval(clientid->cl_id, open->op_owner); 2583 strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner);
2449 sop = find_openstateowner_str(strhashval, open); 2584 oo = find_openstateowner_str(strhashval, open);
2450 open->op_stateowner = sop; 2585 open->op_openowner = oo;
2451 if (!sop) { 2586 if (!oo) {
2452 /* Make sure the client's lease hasn't expired. */
2453 clp = find_confirmed_client(clientid); 2587 clp = find_confirmed_client(clientid);
2454 if (clp == NULL) 2588 if (clp == NULL)
2455 return nfserr_expired; 2589 return nfserr_expired;
2456 goto renew; 2590 goto new_owner;
2457 } 2591 }
2458 /* When sessions are used, skip open sequenceid processing */ 2592 if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
2459 if (nfsd4_has_session(cstate))
2460 goto renew;
2461 if (!sop->so_confirmed) {
2462 /* Replace unconfirmed owners without checking for replay. */ 2593 /* Replace unconfirmed owners without checking for replay. */
2463 clp = sop->so_client; 2594 clp = oo->oo_owner.so_client;
2464 release_openowner(sop); 2595 release_openowner(oo);
2465 open->op_stateowner = NULL; 2596 open->op_openowner = NULL;
2466 goto renew; 2597 goto new_owner;
2467 }
2468 if (open->op_seqid == sop->so_seqid - 1) {
2469 if (sop->so_replay.rp_buflen)
2470 return nfserr_replay_me;
2471 /* The original OPEN failed so spectacularly
2472 * that we don't even have replay data saved!
2473 * Therefore, we have no choice but to continue
2474 * processing this OPEN; presumably, we'll
2475 * fail again for the same reason.
2476 */
2477 dprintk("nfsd4_process_open1: replay with no replay cache\n");
2478 goto renew;
2479 }
2480 if (open->op_seqid != sop->so_seqid)
2481 return nfserr_bad_seqid;
2482renew:
2483 if (open->op_stateowner == NULL) {
2484 sop = alloc_init_open_stateowner(strhashval, clp, open);
2485 if (sop == NULL)
2486 return nfserr_resource;
2487 open->op_stateowner = sop;
2488 } 2598 }
2489 list_del_init(&sop->so_close_lru); 2599 status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid);
2490 renew_client(sop->so_client); 2600 if (status)
2601 return status;
2602 clp = oo->oo_owner.so_client;
2603 goto alloc_stateid;
2604new_owner:
2605 oo = alloc_init_open_stateowner(strhashval, clp, open);
2606 if (oo == NULL)
2607 return nfserr_jukebox;
2608 open->op_openowner = oo;
2609alloc_stateid:
2610 open->op_stp = nfs4_alloc_stateid(clp);
2611 if (!open->op_stp)
2612 return nfserr_jukebox;
2491 return nfs_ok; 2613 return nfs_ok;
2492} 2614}
2493 2615
@@ -2500,36 +2622,37 @@ nfs4_check_delegmode(struct nfs4_delegation *dp, int flags)
2500 return nfs_ok; 2622 return nfs_ok;
2501} 2623}
2502 2624
2503static struct nfs4_delegation * 2625static int share_access_to_flags(u32 share_access)
2504find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
2505{ 2626{
2506 struct nfs4_delegation *dp; 2627 share_access &= ~NFS4_SHARE_WANT_MASK;
2507 2628
2508 spin_lock(&recall_lock); 2629 return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
2509 list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
2510 if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) {
2511 spin_unlock(&recall_lock);
2512 return dp;
2513 }
2514 spin_unlock(&recall_lock);
2515 return NULL;
2516} 2630}
2517 2631
2518static int share_access_to_flags(u32 share_access) 2632static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s)
2519{ 2633{
2520 share_access &= ~NFS4_SHARE_WANT_MASK; 2634 struct nfs4_stid *ret;
2521 2635
2522 return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; 2636 ret = find_stateid_by_type(cl, s, NFS4_DELEG_STID);
2637 if (!ret)
2638 return NULL;
2639 return delegstateid(ret);
2640}
2641
2642static bool nfsd4_is_deleg_cur(struct nfsd4_open *open)
2643{
2644 return open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR ||
2645 open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH;
2523} 2646}
2524 2647
2525static __be32 2648static __be32
2526nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, 2649nfs4_check_deleg(struct nfs4_client *cl, struct nfs4_file *fp, struct nfsd4_open *open,
2527 struct nfs4_delegation **dp) 2650 struct nfs4_delegation **dp)
2528{ 2651{
2529 int flags; 2652 int flags;
2530 __be32 status = nfserr_bad_stateid; 2653 __be32 status = nfserr_bad_stateid;
2531 2654
2532 *dp = find_delegation_file(fp, &open->op_delegate_stateid); 2655 *dp = find_deleg_stateid(cl, &open->op_delegate_stateid);
2533 if (*dp == NULL) 2656 if (*dp == NULL)
2534 goto out; 2657 goto out;
2535 flags = share_access_to_flags(open->op_share_access); 2658 flags = share_access_to_flags(open->op_share_access);
@@ -2537,41 +2660,37 @@ nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
2537 if (status) 2660 if (status)
2538 *dp = NULL; 2661 *dp = NULL;
2539out: 2662out:
2540 if (open->op_claim_type != NFS4_OPEN_CLAIM_DELEGATE_CUR) 2663 if (!nfsd4_is_deleg_cur(open))
2541 return nfs_ok; 2664 return nfs_ok;
2542 if (status) 2665 if (status)
2543 return status; 2666 return status;
2544 open->op_stateowner->so_confirmed = 1; 2667 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
2545 return nfs_ok; 2668 return nfs_ok;
2546} 2669}
2547 2670
2548static __be32 2671static __be32
2549nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp) 2672nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_stateid **stpp)
2550{ 2673{
2551 struct nfs4_stateid *local; 2674 struct nfs4_ol_stateid *local;
2552 __be32 status = nfserr_share_denied; 2675 struct nfs4_openowner *oo = open->op_openowner;
2553 struct nfs4_stateowner *sop = open->op_stateowner;
2554 2676
2555 list_for_each_entry(local, &fp->fi_stateids, st_perfile) { 2677 list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
2556 /* ignore lock owners */ 2678 /* ignore lock owners */
2557 if (local->st_stateowner->so_is_open_owner == 0) 2679 if (local->st_stateowner->so_is_open_owner == 0)
2558 continue; 2680 continue;
2559 /* remember if we have seen this open owner */ 2681 /* remember if we have seen this open owner */
2560 if (local->st_stateowner == sop) 2682 if (local->st_stateowner == &oo->oo_owner)
2561 *stpp = local; 2683 *stpp = local;
2562 /* check for conflicting share reservations */ 2684 /* check for conflicting share reservations */
2563 if (!test_share(local, open)) 2685 if (!test_share(local, open))
2564 goto out; 2686 return nfserr_share_denied;
2565 } 2687 }
2566 status = 0; 2688 return nfs_ok;
2567out:
2568 return status;
2569} 2689}
2570 2690
2571static inline struct nfs4_stateid * 2691static void nfs4_free_stateid(struct nfs4_ol_stateid *s)
2572nfs4_alloc_stateid(void)
2573{ 2692{
2574 return kmem_cache_alloc(stateid_slab, GFP_KERNEL); 2693 kmem_cache_free(stateid_slab, s);
2575} 2694}
2576 2695
2577static inline int nfs4_access_to_access(u32 nfs4_access) 2696static inline int nfs4_access_to_access(u32 nfs4_access)
@@ -2592,12 +2711,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
2592 int oflag = nfs4_access_to_omode(open->op_share_access); 2711 int oflag = nfs4_access_to_omode(open->op_share_access);
2593 int access = nfs4_access_to_access(open->op_share_access); 2712 int access = nfs4_access_to_access(open->op_share_access);
2594 2713
2595 /* CLAIM_DELEGATE_CUR is used in response to a broken lease;
2596 * allowing it to break the lease and return EAGAIN leaves the
2597 * client unable to make progress in returning the delegation */
2598 if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
2599 access |= NFSD_MAY_NOT_BREAK_LEASE;
2600
2601 if (!fp->fi_fds[oflag]) { 2714 if (!fp->fi_fds[oflag]) {
2602 status = nfsd_open(rqstp, cur_fh, S_IFREG, access, 2715 status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
2603 &fp->fi_fds[oflag]); 2716 &fp->fi_fds[oflag]);
@@ -2609,27 +2722,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
2609 return nfs_ok; 2722 return nfs_ok;
2610} 2723}
2611 2724
2612static __be32
2613nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
2614 struct nfs4_file *fp, struct svc_fh *cur_fh,
2615 struct nfsd4_open *open)
2616{
2617 struct nfs4_stateid *stp;
2618 __be32 status;
2619
2620 stp = nfs4_alloc_stateid();
2621 if (stp == NULL)
2622 return nfserr_resource;
2623
2624 status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open);
2625 if (status) {
2626 kmem_cache_free(stateid_slab, stp);
2627 return status;
2628 }
2629 *stpp = stp;
2630 return 0;
2631}
2632
2633static inline __be32 2725static inline __be32
2634nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, 2726nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
2635 struct nfsd4_open *open) 2727 struct nfsd4_open *open)
@@ -2646,9 +2738,9 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
2646} 2738}
2647 2739
2648static __be32 2740static __be32
2649nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open) 2741nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open)
2650{ 2742{
2651 u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK; 2743 u32 op_share_access = open->op_share_access;
2652 bool new_access; 2744 bool new_access;
2653 __be32 status; 2745 __be32 status;
2654 2746
@@ -2677,8 +2769,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
2677static void 2769static void
2678nfs4_set_claim_prev(struct nfsd4_open *open) 2770nfs4_set_claim_prev(struct nfsd4_open *open)
2679{ 2771{
2680 open->op_stateowner->so_confirmed = 1; 2772 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
2681 open->op_stateowner->so_client->cl_firststate = 1; 2773 open->op_openowner->oo_owner.so_client->cl_firststate = 1;
2682} 2774}
2683 2775
2684/* Should we give out recallable state?: */ 2776/* Should we give out recallable state?: */
@@ -2721,7 +2813,7 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
2721 if (!fl) 2813 if (!fl)
2722 return -ENOMEM; 2814 return -ENOMEM;
2723 fl->fl_file = find_readable_file(fp); 2815 fl->fl_file = find_readable_file(fp);
2724 list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); 2816 list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
2725 status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); 2817 status = vfs_setlease(fl->fl_file, fl->fl_type, &fl);
2726 if (status) { 2818 if (status) {
2727 list_del_init(&dp->dl_perclnt); 2819 list_del_init(&dp->dl_perclnt);
@@ -2750,7 +2842,7 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag)
2750 atomic_inc(&fp->fi_delegees); 2842 atomic_inc(&fp->fi_delegees);
2751 list_add(&dp->dl_perfile, &fp->fi_delegations); 2843 list_add(&dp->dl_perfile, &fp->fi_delegations);
2752 spin_unlock(&recall_lock); 2844 spin_unlock(&recall_lock);
2753 list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); 2845 list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
2754 return 0; 2846 return 0;
2755} 2847}
2756 2848
@@ -2758,14 +2850,14 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag)
2758 * Attempt to hand out a delegation. 2850 * Attempt to hand out a delegation.
2759 */ 2851 */
2760static void 2852static void
2761nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_stateid *stp) 2853nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_stateid *stp)
2762{ 2854{
2763 struct nfs4_delegation *dp; 2855 struct nfs4_delegation *dp;
2764 struct nfs4_stateowner *sop = stp->st_stateowner; 2856 struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner);
2765 int cb_up; 2857 int cb_up;
2766 int status, flag = 0; 2858 int status, flag = 0;
2767 2859
2768 cb_up = nfsd4_cb_channel_good(sop->so_client); 2860 cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
2769 flag = NFS4_OPEN_DELEGATE_NONE; 2861 flag = NFS4_OPEN_DELEGATE_NONE;
2770 open->op_recall = 0; 2862 open->op_recall = 0;
2771 switch (open->op_claim_type) { 2863 switch (open->op_claim_type) {
@@ -2781,7 +2873,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2781 * had the chance to reclaim theirs.... */ 2873 * had the chance to reclaim theirs.... */
2782 if (locks_in_grace()) 2874 if (locks_in_grace())
2783 goto out; 2875 goto out;
2784 if (!cb_up || !sop->so_confirmed) 2876 if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
2785 goto out; 2877 goto out;
2786 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) 2878 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
2787 flag = NFS4_OPEN_DELEGATE_WRITE; 2879 flag = NFS4_OPEN_DELEGATE_WRITE;
@@ -2792,17 +2884,17 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2792 goto out; 2884 goto out;
2793 } 2885 }
2794 2886
2795 dp = alloc_init_deleg(sop->so_client, stp, fh, flag); 2887 dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh, flag);
2796 if (dp == NULL) 2888 if (dp == NULL)
2797 goto out_no_deleg; 2889 goto out_no_deleg;
2798 status = nfs4_set_delegation(dp, flag); 2890 status = nfs4_set_delegation(dp, flag);
2799 if (status) 2891 if (status)
2800 goto out_free; 2892 goto out_free;
2801 2893
2802 memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); 2894 memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid));
2803 2895
2804 dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", 2896 dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
2805 STATEID_VAL(&dp->dl_stateid)); 2897 STATEID_VAL(&dp->dl_stid.sc_stateid));
2806out: 2898out:
2807 if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS 2899 if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS
2808 && flag == NFS4_OPEN_DELEGATE_NONE 2900 && flag == NFS4_OPEN_DELEGATE_NONE
@@ -2824,16 +2916,13 @@ __be32
2824nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) 2916nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
2825{ 2917{
2826 struct nfsd4_compoundres *resp = rqstp->rq_resp; 2918 struct nfsd4_compoundres *resp = rqstp->rq_resp;
2919 struct nfs4_client *cl = open->op_openowner->oo_owner.so_client;
2827 struct nfs4_file *fp = NULL; 2920 struct nfs4_file *fp = NULL;
2828 struct inode *ino = current_fh->fh_dentry->d_inode; 2921 struct inode *ino = current_fh->fh_dentry->d_inode;
2829 struct nfs4_stateid *stp = NULL; 2922 struct nfs4_ol_stateid *stp = NULL;
2830 struct nfs4_delegation *dp = NULL; 2923 struct nfs4_delegation *dp = NULL;
2831 __be32 status; 2924 __be32 status;
2832 2925
2833 status = nfserr_inval;
2834 if (!access_valid(open->op_share_access, resp->cstate.minorversion)
2835 || !deny_valid(open->op_share_deny))
2836 goto out;
2837 /* 2926 /*
2838 * Lookup file; if found, lookup stateid and check open request, 2927 * Lookup file; if found, lookup stateid and check open request,
2839 * and check for delegations in the process of being recalled. 2928 * and check for delegations in the process of being recalled.
@@ -2843,17 +2932,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
2843 if (fp) { 2932 if (fp) {
2844 if ((status = nfs4_check_open(fp, open, &stp))) 2933 if ((status = nfs4_check_open(fp, open, &stp)))
2845 goto out; 2934 goto out;
2846 status = nfs4_check_deleg(fp, open, &dp); 2935 status = nfs4_check_deleg(cl, fp, open, &dp);
2847 if (status) 2936 if (status)
2848 goto out; 2937 goto out;
2849 } else { 2938 } else {
2850 status = nfserr_bad_stateid; 2939 status = nfserr_bad_stateid;
2851 if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) 2940 if (nfsd4_is_deleg_cur(open))
2852 goto out;
2853 status = nfserr_resource;
2854 fp = alloc_init_file(ino);
2855 if (fp == NULL)
2856 goto out; 2941 goto out;
2942 status = nfserr_jukebox;
2943 fp = open->op_file;
2944 open->op_file = NULL;
2945 nfsd4_init_file(fp, ino);
2857 } 2946 }
2858 2947
2859 /* 2948 /*
@@ -2865,24 +2954,24 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
2865 status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); 2954 status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
2866 if (status) 2955 if (status)
2867 goto out; 2956 goto out;
2868 update_stateid(&stp->st_stateid);
2869 } else { 2957 } else {
2870 status = nfs4_new_open(rqstp, &stp, fp, current_fh, open); 2958 status = nfs4_get_vfs_file(rqstp, fp, current_fh, open);
2871 if (status) 2959 if (status)
2872 goto out; 2960 goto out;
2873 init_stateid(stp, fp, open); 2961 stp = open->op_stp;
2962 open->op_stp = NULL;
2963 init_open_stateid(stp, fp, open);
2874 status = nfsd4_truncate(rqstp, current_fh, open); 2964 status = nfsd4_truncate(rqstp, current_fh, open);
2875 if (status) { 2965 if (status) {
2876 release_open_stateid(stp); 2966 release_open_stateid(stp);
2877 goto out; 2967 goto out;
2878 } 2968 }
2879 if (nfsd4_has_session(&resp->cstate))
2880 update_stateid(&stp->st_stateid);
2881 } 2969 }
2882 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); 2970 update_stateid(&stp->st_stid.sc_stateid);
2971 memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
2883 2972
2884 if (nfsd4_has_session(&resp->cstate)) 2973 if (nfsd4_has_session(&resp->cstate))
2885 open->op_stateowner->so_confirmed = 1; 2974 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
2886 2975
2887 /* 2976 /*
2888 * Attempt to hand out a delegation. No error return, because the 2977 * Attempt to hand out a delegation. No error return, because the
@@ -2893,7 +2982,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
2893 status = nfs_ok; 2982 status = nfs_ok;
2894 2983
2895 dprintk("%s: stateid=" STATEID_FMT "\n", __func__, 2984 dprintk("%s: stateid=" STATEID_FMT "\n", __func__,
2896 STATEID_VAL(&stp->st_stateid)); 2985 STATEID_VAL(&stp->st_stid.sc_stateid));
2897out: 2986out:
2898 if (fp) 2987 if (fp)
2899 put_nfs4_file(fp); 2988 put_nfs4_file(fp);
@@ -2903,13 +2992,34 @@ out:
2903 * To finish the open response, we just need to set the rflags. 2992 * To finish the open response, we just need to set the rflags.
2904 */ 2993 */
2905 open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; 2994 open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
2906 if (!open->op_stateowner->so_confirmed && 2995 if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) &&
2907 !nfsd4_has_session(&resp->cstate)) 2996 !nfsd4_has_session(&resp->cstate))
2908 open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; 2997 open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
2909 2998
2910 return status; 2999 return status;
2911} 3000}
2912 3001
3002void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status)
3003{
3004 if (open->op_openowner) {
3005 struct nfs4_openowner *oo = open->op_openowner;
3006
3007 if (!list_empty(&oo->oo_owner.so_stateids))
3008 list_del_init(&oo->oo_close_lru);
3009 if (oo->oo_flags & NFS4_OO_NEW) {
3010 if (status) {
3011 release_openowner(oo);
3012 open->op_openowner = NULL;
3013 } else
3014 oo->oo_flags &= ~NFS4_OO_NEW;
3015 }
3016 }
3017 if (open->op_file)
3018 nfsd4_free_file(open->op_file);
3019 if (open->op_stp)
3020 nfs4_free_stateid(open->op_stp);
3021}
3022
2913__be32 3023__be32
2914nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 3024nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2915 clientid_t *clid) 3025 clientid_t *clid)
@@ -2930,7 +3040,6 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2930 dprintk("nfsd4_renew: clientid not found!\n"); 3040 dprintk("nfsd4_renew: clientid not found!\n");
2931 goto out; 3041 goto out;
2932 } 3042 }
2933 renew_client(clp);
2934 status = nfserr_cb_path_down; 3043 status = nfserr_cb_path_down;
2935 if (!list_empty(&clp->cl_delegations) 3044 if (!list_empty(&clp->cl_delegations)
2936 && clp->cl_cb_state != NFSD4_CB_UP) 3045 && clp->cl_cb_state != NFSD4_CB_UP)
@@ -2962,7 +3071,7 @@ static time_t
2962nfs4_laundromat(void) 3071nfs4_laundromat(void)
2963{ 3072{
2964 struct nfs4_client *clp; 3073 struct nfs4_client *clp;
2965 struct nfs4_stateowner *sop; 3074 struct nfs4_openowner *oo;
2966 struct nfs4_delegation *dp; 3075 struct nfs4_delegation *dp;
2967 struct list_head *pos, *next, reaplist; 3076 struct list_head *pos, *next, reaplist;
2968 time_t cutoff = get_seconds() - nfsd4_lease; 3077 time_t cutoff = get_seconds() - nfsd4_lease;
@@ -3019,16 +3128,14 @@ nfs4_laundromat(void)
3019 } 3128 }
3020 test_val = nfsd4_lease; 3129 test_val = nfsd4_lease;
3021 list_for_each_safe(pos, next, &close_lru) { 3130 list_for_each_safe(pos, next, &close_lru) {
3022 sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); 3131 oo = container_of(pos, struct nfs4_openowner, oo_close_lru);
3023 if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) { 3132 if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) {
3024 u = sop->so_time - cutoff; 3133 u = oo->oo_time - cutoff;
3025 if (test_val > u) 3134 if (test_val > u)
3026 test_val = u; 3135 test_val = u;
3027 break; 3136 break;
3028 } 3137 }
3029 dprintk("NFSD: purging unused open stateowner (so_id %d)\n", 3138 release_openowner(oo);
3030 sop->so_id);
3031 release_openowner(sop);
3032 } 3139 }
3033 if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) 3140 if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT)
3034 clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; 3141 clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT;
@@ -3050,30 +3157,17 @@ laundromat_main(struct work_struct *not_used)
3050 queue_delayed_work(laundry_wq, &laundromat_work, t*HZ); 3157 queue_delayed_work(laundry_wq, &laundromat_work, t*HZ);
3051} 3158}
3052 3159
3053static struct nfs4_stateowner * 3160static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
3054search_close_lru(u32 st_id, int flags)
3055{ 3161{
3056 struct nfs4_stateowner *local = NULL; 3162 if (fhp->fh_dentry->d_inode != stp->st_file->fi_inode)
3057 3163 return nfserr_bad_stateid;
3058 if (flags & CLOSE_STATE) { 3164 return nfs_ok;
3059 list_for_each_entry(local, &close_lru, so_close_lru) {
3060 if (local->so_id == st_id)
3061 return local;
3062 }
3063 }
3064 return NULL;
3065}
3066
3067static inline int
3068nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
3069{
3070 return fhp->fh_dentry->d_inode != stp->st_file->fi_inode;
3071} 3165}
3072 3166
3073static int 3167static int
3074STALE_STATEID(stateid_t *stateid) 3168STALE_STATEID(stateid_t *stateid)
3075{ 3169{
3076 if (stateid->si_boot == boot_time) 3170 if (stateid->si_opaque.so_clid.cl_boot == boot_time)
3077 return 0; 3171 return 0;
3078 dprintk("NFSD: stale stateid " STATEID_FMT "!\n", 3172 dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
3079 STATEID_VAL(stateid)); 3173 STATEID_VAL(stateid));
@@ -3096,7 +3190,7 @@ access_permit_write(unsigned long access_bmap)
3096} 3190}
3097 3191
3098static 3192static
3099__be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags) 3193__be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags)
3100{ 3194{
3101 __be32 status = nfserr_openmode; 3195 __be32 status = nfserr_openmode;
3102 3196
@@ -3139,68 +3233,80 @@ grace_disallows_io(struct inode *inode)
3139 return locks_in_grace() && mandatory_lock(inode); 3233 return locks_in_grace() && mandatory_lock(inode);
3140} 3234}
3141 3235
3142static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags) 3236/* Returns true iff a is later than b: */
3237static bool stateid_generation_after(stateid_t *a, stateid_t *b)
3238{
3239 return (s32)a->si_generation - (s32)b->si_generation > 0;
3240}
3241
3242static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
3143{ 3243{
3144 /* 3244 /*
3145 * When sessions are used the stateid generation number is ignored 3245 * When sessions are used the stateid generation number is ignored
3146 * when it is zero. 3246 * when it is zero.
3147 */ 3247 */
3148 if ((flags & HAS_SESSION) && in->si_generation == 0) 3248 if (has_session && in->si_generation == 0)
3149 goto out; 3249 return nfs_ok;
3250
3251 if (in->si_generation == ref->si_generation)
3252 return nfs_ok;
3150 3253
3151 /* If the client sends us a stateid from the future, it's buggy: */ 3254 /* If the client sends us a stateid from the future, it's buggy: */
3152 if (in->si_generation > ref->si_generation) 3255 if (stateid_generation_after(in, ref))
3153 return nfserr_bad_stateid; 3256 return nfserr_bad_stateid;
3154 /* 3257 /*
3155 * The following, however, can happen. For example, if the 3258 * However, we could see a stateid from the past, even from a
3156 * client sends an open and some IO at the same time, the open 3259 * non-buggy client. For example, if the client sends a lock
3157 * may bump si_generation while the IO is still in flight. 3260 * while some IO is outstanding, the lock may bump si_generation
3158 * Thanks to hard links and renames, the client never knows what 3261 * while the IO is still in flight. The client could avoid that
3159 * file an open will affect. So it could avoid that situation 3262 * situation by waiting for responses on all the IO requests,
3160 * only by serializing all opens and IO from the same open 3263 * but better performance may result in retrying IO that
3161 * owner. To recover from the old_stateid error, the client 3264 * receives an old_stateid error if requests are rarely
3162 * will just have to retry the IO: 3265 * reordered in flight:
3163 */ 3266 */
3164 if (in->si_generation < ref->si_generation) 3267 return nfserr_old_stateid;
3165 return nfserr_old_stateid;
3166out:
3167 return nfs_ok;
3168} 3268}
3169 3269
3170static int is_delegation_stateid(stateid_t *stateid) 3270__be32 nfs4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
3171{ 3271{
3172 return stateid->si_fileid == 0; 3272 struct nfs4_stid *s;
3173} 3273 struct nfs4_ol_stateid *ols;
3274 __be32 status;
3174 3275
3175static int is_open_stateid(struct nfs4_stateid *stateid) 3276 if (STALE_STATEID(stateid))
3176{ 3277 return nfserr_stale_stateid;
3177 return stateid->st_openstp == NULL; 3278
3279 s = find_stateid(cl, stateid);
3280 if (!s)
3281 return nfserr_stale_stateid;
3282 status = check_stateid_generation(stateid, &s->sc_stateid, 1);
3283 if (status)
3284 return status;
3285 if (!(s->sc_type & (NFS4_OPEN_STID | NFS4_LOCK_STID)))
3286 return nfs_ok;
3287 ols = openlockstateid(s);
3288 if (ols->st_stateowner->so_is_open_owner
3289 && !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED))
3290 return nfserr_bad_stateid;
3291 return nfs_ok;
3178} 3292}
3179 3293
3180__be32 nfs4_validate_stateid(stateid_t *stateid, int flags) 3294static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s)
3181{ 3295{
3182 struct nfs4_stateid *stp = NULL; 3296 struct nfs4_client *cl;
3183 __be32 status = nfserr_stale_stateid;
3184 3297
3298 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
3299 return nfserr_bad_stateid;
3185 if (STALE_STATEID(stateid)) 3300 if (STALE_STATEID(stateid))
3186 goto out; 3301 return nfserr_stale_stateid;
3187 3302 cl = find_confirmed_client(&stateid->si_opaque.so_clid);
3188 status = nfserr_expired; 3303 if (!cl)
3189 stp = search_for_stateid(stateid); 3304 return nfserr_expired;
3190 if (!stp) 3305 *s = find_stateid_by_type(cl, stateid, typemask);
3191 goto out; 3306 if (!*s)
3192 status = nfserr_bad_stateid; 3307 return nfserr_bad_stateid;
3193 3308 return nfs_ok;
3194 if (!stp->st_stateowner->so_confirmed)
3195 goto out;
3196
3197 status = check_stateid_generation(stateid, &stp->st_stateid, flags);
3198 if (status)
3199 goto out;
3200 3309
3201 status = nfs_ok;
3202out:
3203 return status;
3204} 3310}
3205 3311
3206/* 3312/*
@@ -3210,7 +3316,8 @@ __be32
3210nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, 3316nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
3211 stateid_t *stateid, int flags, struct file **filpp) 3317 stateid_t *stateid, int flags, struct file **filpp)
3212{ 3318{
3213 struct nfs4_stateid *stp = NULL; 3319 struct nfs4_stid *s;
3320 struct nfs4_ol_stateid *stp = NULL;
3214 struct nfs4_delegation *dp = NULL; 3321 struct nfs4_delegation *dp = NULL;
3215 struct svc_fh *current_fh = &cstate->current_fh; 3322 struct svc_fh *current_fh = &cstate->current_fh;
3216 struct inode *ino = current_fh->fh_dentry->d_inode; 3323 struct inode *ino = current_fh->fh_dentry->d_inode;
@@ -3222,60 +3329,47 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
3222 if (grace_disallows_io(ino)) 3329 if (grace_disallows_io(ino))
3223 return nfserr_grace; 3330 return nfserr_grace;
3224 3331
3225 if (nfsd4_has_session(cstate))
3226 flags |= HAS_SESSION;
3227
3228 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) 3332 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
3229 return check_special_stateids(current_fh, stateid, flags); 3333 return check_special_stateids(current_fh, stateid, flags);
3230 3334
3231 status = nfserr_stale_stateid; 3335 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s);
3232 if (STALE_STATEID(stateid)) 3336 if (status)
3337 return status;
3338 status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate));
3339 if (status)
3233 goto out; 3340 goto out;
3234 3341 switch (s->sc_type) {
3235 /* 3342 case NFS4_DELEG_STID:
3236 * We assume that any stateid that has the current boot time, 3343 dp = delegstateid(s);
3237 * but that we can't find, is expired:
3238 */
3239 status = nfserr_expired;
3240 if (is_delegation_stateid(stateid)) {
3241 dp = find_delegation_stateid(ino, stateid);
3242 if (!dp)
3243 goto out;
3244 status = check_stateid_generation(stateid, &dp->dl_stateid,
3245 flags);
3246 if (status)
3247 goto out;
3248 status = nfs4_check_delegmode(dp, flags); 3344 status = nfs4_check_delegmode(dp, flags);
3249 if (status) 3345 if (status)
3250 goto out; 3346 goto out;
3251 renew_client(dp->dl_client);
3252 if (filpp) { 3347 if (filpp) {
3253 *filpp = dp->dl_file->fi_deleg_file; 3348 *filpp = dp->dl_file->fi_deleg_file;
3254 BUG_ON(!*filpp); 3349 BUG_ON(!*filpp);
3255 } 3350 }
3256 } else { /* open or lock stateid */ 3351 break;
3257 stp = find_stateid(stateid, flags); 3352 case NFS4_OPEN_STID:
3258 if (!stp) 3353 case NFS4_LOCK_STID:
3259 goto out; 3354 stp = openlockstateid(s);
3260 status = nfserr_bad_stateid; 3355 status = nfs4_check_fh(current_fh, stp);
3261 if (nfs4_check_fh(current_fh, stp))
3262 goto out;
3263 if (!stp->st_stateowner->so_confirmed)
3264 goto out;
3265 status = check_stateid_generation(stateid, &stp->st_stateid,
3266 flags);
3267 if (status) 3356 if (status)
3268 goto out; 3357 goto out;
3358 if (stp->st_stateowner->so_is_open_owner
3359 && !(openowner(stp->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED))
3360 goto out;
3269 status = nfs4_check_openmode(stp, flags); 3361 status = nfs4_check_openmode(stp, flags);
3270 if (status) 3362 if (status)
3271 goto out; 3363 goto out;
3272 renew_client(stp->st_stateowner->so_client);
3273 if (filpp) { 3364 if (filpp) {
3274 if (flags & RD_STATE) 3365 if (flags & RD_STATE)
3275 *filpp = find_readable_file(stp->st_file); 3366 *filpp = find_readable_file(stp->st_file);
3276 else 3367 else
3277 *filpp = find_writeable_file(stp->st_file); 3368 *filpp = find_writeable_file(stp->st_file);
3278 } 3369 }
3370 break;
3371 default:
3372 return nfserr_bad_stateid;
3279 } 3373 }
3280 status = nfs_ok; 3374 status = nfs_ok;
3281out: 3375out:
@@ -3283,18 +3377,9 @@ out:
3283} 3377}
3284 3378
3285static __be32 3379static __be32
3286nfsd4_free_delegation_stateid(stateid_t *stateid) 3380nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp)
3287{ 3381{
3288 struct nfs4_delegation *dp = search_for_delegation(stateid); 3382 if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner)))
3289 if (dp)
3290 return nfserr_locks_held;
3291 return nfserr_bad_stateid;
3292}
3293
3294static __be32
3295nfsd4_free_lock_stateid(struct nfs4_stateid *stp)
3296{
3297 if (check_for_locks(stp->st_file, stp->st_stateowner))
3298 return nfserr_locks_held; 3383 return nfserr_locks_held;
3299 release_lock_stateid(stp); 3384 release_lock_stateid(stp);
3300 return nfs_ok; 3385 return nfs_ok;
@@ -3307,51 +3392,40 @@ __be32
3307nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 3392nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3308 struct nfsd4_test_stateid *test_stateid) 3393 struct nfsd4_test_stateid *test_stateid)
3309{ 3394{
3310 test_stateid->ts_has_session = nfsd4_has_session(cstate); 3395 /* real work is done during encoding */
3311 return nfs_ok; 3396 return nfs_ok;
3312} 3397}
3313 3398
3314/*
3315 * Free a state id
3316 */
3317__be32 3399__be32
3318nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 3400nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3319 struct nfsd4_free_stateid *free_stateid) 3401 struct nfsd4_free_stateid *free_stateid)
3320{ 3402{
3321 stateid_t *stateid = &free_stateid->fr_stateid; 3403 stateid_t *stateid = &free_stateid->fr_stateid;
3322 struct nfs4_stateid *stp; 3404 struct nfs4_stid *s;
3323 __be32 ret; 3405 struct nfs4_client *cl = cstate->session->se_client;
3406 __be32 ret = nfserr_bad_stateid;
3324 3407
3325 nfs4_lock_state(); 3408 nfs4_lock_state();
3326 if (is_delegation_stateid(stateid)) { 3409 s = find_stateid(cl, stateid);
3327 ret = nfsd4_free_delegation_stateid(stateid); 3410 if (!s)
3328 goto out;
3329 }
3330
3331 stp = search_for_stateid(stateid);
3332 if (!stp) {
3333 ret = nfserr_bad_stateid;
3334 goto out; 3411 goto out;
3335 } 3412 switch (s->sc_type) {
3336 if (stateid->si_generation != 0) { 3413 case NFS4_DELEG_STID:
3337 if (stateid->si_generation < stp->st_stateid.si_generation) {
3338 ret = nfserr_old_stateid;
3339 goto out;
3340 }
3341 if (stateid->si_generation > stp->st_stateid.si_generation) {
3342 ret = nfserr_bad_stateid;
3343 goto out;
3344 }
3345 }
3346
3347 if (is_open_stateid(stp)) {
3348 ret = nfserr_locks_held; 3414 ret = nfserr_locks_held;
3349 goto out; 3415 goto out;
3350 } else { 3416 case NFS4_OPEN_STID:
3351 ret = nfsd4_free_lock_stateid(stp); 3417 case NFS4_LOCK_STID:
3352 goto out; 3418 ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
3419 if (ret)
3420 goto out;
3421 if (s->sc_type == NFS4_LOCK_STID)
3422 ret = nfsd4_free_lock_stateid(openlockstateid(s));
3423 else
3424 ret = nfserr_locks_held;
3425 break;
3426 default:
3427 ret = nfserr_bad_stateid;
3353 } 3428 }
3354
3355out: 3429out:
3356 nfs4_unlock_state(); 3430 nfs4_unlock_state();
3357 return ret; 3431 return ret;
@@ -3364,124 +3438,64 @@ setlkflg (int type)
3364 RD_STATE : WR_STATE; 3438 RD_STATE : WR_STATE;
3365} 3439}
3366 3440
3441static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_ol_stateid *stp)
3442{
3443 struct svc_fh *current_fh = &cstate->current_fh;
3444 struct nfs4_stateowner *sop = stp->st_stateowner;
3445 __be32 status;
3446
3447 status = nfsd4_check_seqid(cstate, sop, seqid);
3448 if (status)
3449 return status;
3450 if (stp->st_stid.sc_type == NFS4_CLOSED_STID)
3451 /*
3452 * "Closed" stateid's exist *only* to return
3453 * nfserr_replay_me from the previous step.
3454 */
3455 return nfserr_bad_stateid;
3456 status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
3457 if (status)
3458 return status;
3459 return nfs4_check_fh(current_fh, stp);
3460}
3461
3367/* 3462/*
3368 * Checks for sequence id mutating operations. 3463 * Checks for sequence id mutating operations.
3369 */ 3464 */
3370static __be32 3465static __be32
3371nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, 3466nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
3372 stateid_t *stateid, int flags, 3467 stateid_t *stateid, char typemask,
3373 struct nfs4_stateowner **sopp, 3468 struct nfs4_ol_stateid **stpp)
3374 struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
3375{ 3469{
3376 struct nfs4_stateid *stp;
3377 struct nfs4_stateowner *sop;
3378 struct svc_fh *current_fh = &cstate->current_fh;
3379 __be32 status; 3470 __be32 status;
3471 struct nfs4_stid *s;
3380 3472
3381 dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__, 3473 dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__,
3382 seqid, STATEID_VAL(stateid)); 3474 seqid, STATEID_VAL(stateid));
3383 3475
3384 *stpp = NULL; 3476 *stpp = NULL;
3385 *sopp = NULL; 3477 status = nfsd4_lookup_stateid(stateid, typemask, &s);
3386 3478 if (status)
3387 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { 3479 return status;
3388 dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); 3480 *stpp = openlockstateid(s);
3389 return nfserr_bad_stateid; 3481 cstate->replay_owner = (*stpp)->st_stateowner;
3390 }
3391
3392 if (STALE_STATEID(stateid))
3393 return nfserr_stale_stateid;
3394
3395 if (nfsd4_has_session(cstate))
3396 flags |= HAS_SESSION;
3397
3398 /*
3399 * We return BAD_STATEID if filehandle doesn't match stateid,
3400 * the confirmed flag is incorrecly set, or the generation
3401 * number is incorrect.
3402 */
3403 stp = find_stateid(stateid, flags);
3404 if (stp == NULL) {
3405 /*
3406 * Also, we should make sure this isn't just the result of
3407 * a replayed close:
3408 */
3409 sop = search_close_lru(stateid->si_stateownerid, flags);
3410 /* It's not stale; let's assume it's expired: */
3411 if (sop == NULL)
3412 return nfserr_expired;
3413 *sopp = sop;
3414 goto check_replay;
3415 }
3416
3417 *stpp = stp;
3418 *sopp = sop = stp->st_stateowner;
3419
3420 if (lock) {
3421 clientid_t *lockclid = &lock->v.new.clientid;
3422 struct nfs4_client *clp = sop->so_client;
3423 int lkflg = 0;
3424 __be32 status;
3425
3426 lkflg = setlkflg(lock->lk_type);
3427
3428 if (lock->lk_is_new) {
3429 if (!sop->so_is_open_owner)
3430 return nfserr_bad_stateid;
3431 if (!(flags & HAS_SESSION) &&
3432 !same_clid(&clp->cl_clientid, lockclid))
3433 return nfserr_bad_stateid;
3434 /* stp is the open stateid */
3435 status = nfs4_check_openmode(stp, lkflg);
3436 if (status)
3437 return status;
3438 } else {
3439 /* stp is the lock stateid */
3440 status = nfs4_check_openmode(stp->st_openstp, lkflg);
3441 if (status)
3442 return status;
3443 }
3444 }
3445 3482
3446 if (nfs4_check_fh(current_fh, stp)) { 3483 return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp);
3447 dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); 3484}
3448 return nfserr_bad_stateid;
3449 }
3450 3485
3451 /* 3486static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct nfs4_ol_stateid **stpp)
3452 * We now validate the seqid and stateid generation numbers. 3487{
3453 * For the moment, we ignore the possibility of 3488 __be32 status;
3454 * generation number wraparound. 3489 struct nfs4_openowner *oo;
3455 */
3456 if (!(flags & HAS_SESSION) && seqid != sop->so_seqid)
3457 goto check_replay;
3458 3490
3459 if (sop->so_confirmed && flags & CONFIRM) { 3491 status = nfs4_preprocess_seqid_op(cstate, seqid, stateid,
3460 dprintk("NFSD: preprocess_seqid_op: expected" 3492 NFS4_OPEN_STID, stpp);
3461 " unconfirmed stateowner!\n");
3462 return nfserr_bad_stateid;
3463 }
3464 if (!sop->so_confirmed && !(flags & CONFIRM)) {
3465 dprintk("NFSD: preprocess_seqid_op: stateowner not"
3466 " confirmed yet!\n");
3467 return nfserr_bad_stateid;
3468 }
3469 status = check_stateid_generation(stateid, &stp->st_stateid, flags);
3470 if (status) 3493 if (status)
3471 return status; 3494 return status;
3472 renew_client(sop->so_client); 3495 oo = openowner((*stpp)->st_stateowner);
3496 if (!(oo->oo_flags & NFS4_OO_CONFIRMED))
3497 return nfserr_bad_stateid;
3473 return nfs_ok; 3498 return nfs_ok;
3474
3475check_replay:
3476 if (seqid == sop->so_seqid - 1) {
3477 dprintk("NFSD: preprocess_seqid_op: retransmission?\n");
3478 /* indicate replay to calling function */
3479 return nfserr_replay_me;
3480 }
3481 dprintk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n",
3482 sop->so_seqid, seqid);
3483 *sopp = NULL;
3484 return nfserr_bad_seqid;
3485} 3499}
3486 3500
3487__be32 3501__be32
@@ -3489,8 +3503,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3489 struct nfsd4_open_confirm *oc) 3503 struct nfsd4_open_confirm *oc)
3490{ 3504{
3491 __be32 status; 3505 __be32 status;
3492 struct nfs4_stateowner *sop; 3506 struct nfs4_openowner *oo;
3493 struct nfs4_stateid *stp; 3507 struct nfs4_ol_stateid *stp;
3494 3508
3495 dprintk("NFSD: nfsd4_open_confirm on file %.*s\n", 3509 dprintk("NFSD: nfsd4_open_confirm on file %.*s\n",
3496 (int)cstate->current_fh.fh_dentry->d_name.len, 3510 (int)cstate->current_fh.fh_dentry->d_name.len,
@@ -3502,38 +3516,52 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3502 3516
3503 nfs4_lock_state(); 3517 nfs4_lock_state();
3504 3518
3505 if ((status = nfs4_preprocess_seqid_op(cstate, 3519 status = nfs4_preprocess_seqid_op(cstate,
3506 oc->oc_seqid, &oc->oc_req_stateid, 3520 oc->oc_seqid, &oc->oc_req_stateid,
3507 CONFIRM | OPEN_STATE, 3521 NFS4_OPEN_STID, &stp);
3508 &oc->oc_stateowner, &stp, NULL))) 3522 if (status)
3509 goto out; 3523 goto out;
3510 3524 oo = openowner(stp->st_stateowner);
3511 sop = oc->oc_stateowner; 3525 status = nfserr_bad_stateid;
3512 sop->so_confirmed = 1; 3526 if (oo->oo_flags & NFS4_OO_CONFIRMED)
3513 update_stateid(&stp->st_stateid); 3527 goto out;
3514 memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t)); 3528 oo->oo_flags |= NFS4_OO_CONFIRMED;
3529 update_stateid(&stp->st_stid.sc_stateid);
3530 memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
3515 dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", 3531 dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
3516 __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid)); 3532 __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
3517 3533
3518 nfsd4_create_clid_dir(sop->so_client); 3534 nfsd4_create_clid_dir(oo->oo_owner.so_client);
3535 status = nfs_ok;
3519out: 3536out:
3520 if (oc->oc_stateowner) { 3537 if (!cstate->replay_owner)
3521 nfs4_get_stateowner(oc->oc_stateowner); 3538 nfs4_unlock_state();
3522 cstate->replay_owner = oc->oc_stateowner;
3523 }
3524 nfs4_unlock_state();
3525 return status; 3539 return status;
3526} 3540}
3527 3541
3528static inline void nfs4_file_downgrade(struct nfs4_stateid *stp, unsigned int to_access) 3542static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access)
3529{ 3543{
3530 int i; 3544 if (!test_bit(access, &stp->st_access_bmap))
3545 return;
3546 nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access));
3547 __clear_bit(access, &stp->st_access_bmap);
3548}
3531 3549
3532 for (i = 1; i < 4; i++) { 3550static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access)
3533 if (test_bit(i, &stp->st_access_bmap) && !(i & to_access)) { 3551{
3534 nfs4_file_put_access(stp->st_file, i); 3552 switch (to_access) {
3535 __clear_bit(i, &stp->st_access_bmap); 3553 case NFS4_SHARE_ACCESS_READ:
3536 } 3554 nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_WRITE);
3555 nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH);
3556 break;
3557 case NFS4_SHARE_ACCESS_WRITE:
3558 nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_READ);
3559 nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH);
3560 break;
3561 case NFS4_SHARE_ACCESS_BOTH:
3562 break;
3563 default:
3564 BUG();
3537 } 3565 }
3538} 3566}
3539 3567
@@ -3553,24 +3581,20 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
3553 struct nfsd4_open_downgrade *od) 3581 struct nfsd4_open_downgrade *od)
3554{ 3582{
3555 __be32 status; 3583 __be32 status;
3556 struct nfs4_stateid *stp; 3584 struct nfs4_ol_stateid *stp;
3557 3585
3558 dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n", 3586 dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n",
3559 (int)cstate->current_fh.fh_dentry->d_name.len, 3587 (int)cstate->current_fh.fh_dentry->d_name.len,
3560 cstate->current_fh.fh_dentry->d_name.name); 3588 cstate->current_fh.fh_dentry->d_name.name);
3561 3589
3562 if (!access_valid(od->od_share_access, cstate->minorversion) 3590 /* We don't yet support WANT bits: */
3563 || !deny_valid(od->od_share_deny)) 3591 od->od_share_access &= NFS4_SHARE_ACCESS_MASK;
3564 return nfserr_inval;
3565 3592
3566 nfs4_lock_state(); 3593 nfs4_lock_state();
3567 if ((status = nfs4_preprocess_seqid_op(cstate, 3594 status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid,
3568 od->od_seqid, 3595 &od->od_stateid, &stp);
3569 &od->od_stateid, 3596 if (status)
3570 OPEN_STATE,
3571 &od->od_stateowner, &stp, NULL)))
3572 goto out; 3597 goto out;
3573
3574 status = nfserr_inval; 3598 status = nfserr_inval;
3575 if (!test_bit(od->od_share_access, &stp->st_access_bmap)) { 3599 if (!test_bit(od->od_share_access, &stp->st_access_bmap)) {
3576 dprintk("NFSD:access not a subset current bitmap: 0x%lx, input access=%08x\n", 3600 dprintk("NFSD:access not a subset current bitmap: 0x%lx, input access=%08x\n",
@@ -3582,22 +3606,45 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
3582 stp->st_deny_bmap, od->od_share_deny); 3606 stp->st_deny_bmap, od->od_share_deny);
3583 goto out; 3607 goto out;
3584 } 3608 }
3585 nfs4_file_downgrade(stp, od->od_share_access); 3609 nfs4_stateid_downgrade(stp, od->od_share_access);
3586 3610
3587 reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap); 3611 reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap);
3588 3612
3589 update_stateid(&stp->st_stateid); 3613 update_stateid(&stp->st_stid.sc_stateid);
3590 memcpy(&od->od_stateid, &stp->st_stateid, sizeof(stateid_t)); 3614 memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
3591 status = nfs_ok; 3615 status = nfs_ok;
3592out: 3616out:
3593 if (od->od_stateowner) { 3617 if (!cstate->replay_owner)
3594 nfs4_get_stateowner(od->od_stateowner); 3618 nfs4_unlock_state();
3595 cstate->replay_owner = od->od_stateowner;
3596 }
3597 nfs4_unlock_state();
3598 return status; 3619 return status;
3599} 3620}
3600 3621
3622void nfsd4_purge_closed_stateid(struct nfs4_stateowner *so)
3623{
3624 struct nfs4_openowner *oo;
3625 struct nfs4_ol_stateid *s;
3626
3627 if (!so->so_is_open_owner)
3628 return;
3629 oo = openowner(so);
3630 s = oo->oo_last_closed_stid;
3631 if (!s)
3632 return;
3633 if (!(oo->oo_flags & NFS4_OO_PURGE_CLOSE)) {
3634 /* Release the last_closed_stid on the next seqid bump: */
3635 oo->oo_flags |= NFS4_OO_PURGE_CLOSE;
3636 return;
3637 }
3638 oo->oo_flags &= ~NFS4_OO_PURGE_CLOSE;
3639 release_last_closed_stateid(oo);
3640}
3641
3642static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
3643{
3644 unhash_open_stateid(s);
3645 s->st_stid.sc_type = NFS4_CLOSED_STID;
3646}
3647
3601/* 3648/*
3602 * nfs4_unlock_state() called after encode 3649 * nfs4_unlock_state() called after encode
3603 */ 3650 */
@@ -3606,39 +3653,37 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3606 struct nfsd4_close *close) 3653 struct nfsd4_close *close)
3607{ 3654{
3608 __be32 status; 3655 __be32 status;
3609 struct nfs4_stateid *stp; 3656 struct nfs4_openowner *oo;
3657 struct nfs4_ol_stateid *stp;
3610 3658
3611 dprintk("NFSD: nfsd4_close on file %.*s\n", 3659 dprintk("NFSD: nfsd4_close on file %.*s\n",
3612 (int)cstate->current_fh.fh_dentry->d_name.len, 3660 (int)cstate->current_fh.fh_dentry->d_name.len,
3613 cstate->current_fh.fh_dentry->d_name.name); 3661 cstate->current_fh.fh_dentry->d_name.name);
3614 3662
3615 nfs4_lock_state(); 3663 nfs4_lock_state();
3616 /* check close_lru for replay */ 3664 status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid,
3617 if ((status = nfs4_preprocess_seqid_op(cstate, 3665 &close->cl_stateid,
3618 close->cl_seqid, 3666 NFS4_OPEN_STID|NFS4_CLOSED_STID,
3619 &close->cl_stateid, 3667 &stp);
3620 OPEN_STATE | CLOSE_STATE, 3668 if (status)
3621 &close->cl_stateowner, &stp, NULL)))
3622 goto out; 3669 goto out;
3670 oo = openowner(stp->st_stateowner);
3623 status = nfs_ok; 3671 status = nfs_ok;
3624 update_stateid(&stp->st_stateid); 3672 update_stateid(&stp->st_stid.sc_stateid);
3625 memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); 3673 memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
3626 3674
3627 /* release_stateid() calls nfsd_close() if needed */ 3675 nfsd4_close_open_stateid(stp);
3628 release_open_stateid(stp); 3676 oo->oo_last_closed_stid = stp;
3629 3677
3630 /* place unused nfs4_stateowners on so_close_lru list to be 3678 /* place unused nfs4_stateowners on so_close_lru list to be
3631 * released by the laundromat service after the lease period 3679 * released by the laundromat service after the lease period
3632 * to enable us to handle CLOSE replay 3680 * to enable us to handle CLOSE replay
3633 */ 3681 */
3634 if (list_empty(&close->cl_stateowner->so_stateids)) 3682 if (list_empty(&oo->oo_owner.so_stateids))
3635 move_to_close_lru(close->cl_stateowner); 3683 move_to_close_lru(oo);
3636out: 3684out:
3637 if (close->cl_stateowner) { 3685 if (!cstate->replay_owner)
3638 nfs4_get_stateowner(close->cl_stateowner); 3686 nfs4_unlock_state();
3639 cstate->replay_owner = close->cl_stateowner;
3640 }
3641 nfs4_unlock_state();
3642 return status; 3687 return status;
3643} 3688}
3644 3689
@@ -3648,34 +3693,22 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3648{ 3693{
3649 struct nfs4_delegation *dp; 3694 struct nfs4_delegation *dp;
3650 stateid_t *stateid = &dr->dr_stateid; 3695 stateid_t *stateid = &dr->dr_stateid;
3696 struct nfs4_stid *s;
3651 struct inode *inode; 3697 struct inode *inode;
3652 __be32 status; 3698 __be32 status;
3653 int flags = 0;
3654 3699
3655 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) 3700 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
3656 return status; 3701 return status;
3657 inode = cstate->current_fh.fh_dentry->d_inode; 3702 inode = cstate->current_fh.fh_dentry->d_inode;
3658 3703
3659 if (nfsd4_has_session(cstate))
3660 flags |= HAS_SESSION;
3661 nfs4_lock_state(); 3704 nfs4_lock_state();
3662 status = nfserr_bad_stateid; 3705 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s);
3663 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) 3706 if (status)
3664 goto out;
3665 status = nfserr_stale_stateid;
3666 if (STALE_STATEID(stateid))
3667 goto out;
3668 status = nfserr_bad_stateid;
3669 if (!is_delegation_stateid(stateid))
3670 goto out;
3671 status = nfserr_expired;
3672 dp = find_delegation_stateid(inode, stateid);
3673 if (!dp)
3674 goto out; 3707 goto out;
3675 status = check_stateid_generation(stateid, &dp->dl_stateid, flags); 3708 dp = delegstateid(s);
3709 status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate));
3676 if (status) 3710 if (status)
3677 goto out; 3711 goto out;
3678 renew_client(dp->dl_client);
3679 3712
3680 unhash_delegation(dp); 3713 unhash_delegation(dp);
3681out: 3714out:
@@ -3713,9 +3746,6 @@ last_byte_offset(u64 start, u64 len)
3713 return end > start ? end - 1: NFS4_MAX_UINT64; 3746 return end > start ? end - 1: NFS4_MAX_UINT64;
3714} 3747}
3715 3748
3716#define lockownerid_hashval(id) \
3717 ((id) & LOCK_HASH_MASK)
3718
3719static inline unsigned int 3749static inline unsigned int
3720lock_ownerstr_hashval(struct inode *inode, u32 cl_id, 3750lock_ownerstr_hashval(struct inode *inode, u32 cl_id,
3721 struct xdr_netobj *ownername) 3751 struct xdr_netobj *ownername)
@@ -3725,101 +3755,7 @@ lock_ownerstr_hashval(struct inode *inode, u32 cl_id,
3725 & LOCK_HASH_MASK; 3755 & LOCK_HASH_MASK;
3726} 3756}
3727 3757
3728static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE];
3729static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; 3758static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE];
3730static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE];
3731
3732static int
3733same_stateid(stateid_t *id_one, stateid_t *id_two)
3734{
3735 if (id_one->si_stateownerid != id_two->si_stateownerid)
3736 return 0;
3737 return id_one->si_fileid == id_two->si_fileid;
3738}
3739
3740static struct nfs4_stateid *
3741find_stateid(stateid_t *stid, int flags)
3742{
3743 struct nfs4_stateid *local;
3744 u32 st_id = stid->si_stateownerid;
3745 u32 f_id = stid->si_fileid;
3746 unsigned int hashval;
3747
3748 dprintk("NFSD: find_stateid flags 0x%x\n",flags);
3749 if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) {
3750 hashval = stateid_hashval(st_id, f_id);
3751 list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) {
3752 if ((local->st_stateid.si_stateownerid == st_id) &&
3753 (local->st_stateid.si_fileid == f_id))
3754 return local;
3755 }
3756 }
3757
3758 if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) {
3759 hashval = stateid_hashval(st_id, f_id);
3760 list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) {
3761 if ((local->st_stateid.si_stateownerid == st_id) &&
3762 (local->st_stateid.si_fileid == f_id))
3763 return local;
3764 }
3765 }
3766 return NULL;
3767}
3768
3769static struct nfs4_stateid *
3770search_for_stateid(stateid_t *stid)
3771{
3772 struct nfs4_stateid *local;
3773 unsigned int hashval = stateid_hashval(stid->si_stateownerid, stid->si_fileid);
3774
3775 list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) {
3776 if (same_stateid(&local->st_stateid, stid))
3777 return local;
3778 }
3779
3780 list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) {
3781 if (same_stateid(&local->st_stateid, stid))
3782 return local;
3783 }
3784 return NULL;
3785}
3786
3787static struct nfs4_delegation *
3788search_for_delegation(stateid_t *stid)
3789{
3790 struct nfs4_file *fp;
3791 struct nfs4_delegation *dp;
3792 struct list_head *pos;
3793 int i;
3794
3795 for (i = 0; i < FILE_HASH_SIZE; i++) {
3796 list_for_each_entry(fp, &file_hashtbl[i], fi_hash) {
3797 list_for_each(pos, &fp->fi_delegations) {
3798 dp = list_entry(pos, struct nfs4_delegation, dl_perfile);
3799 if (same_stateid(&dp->dl_stateid, stid))
3800 return dp;
3801 }
3802 }
3803 }
3804 return NULL;
3805}
3806
3807static struct nfs4_delegation *
3808find_delegation_stateid(struct inode *ino, stateid_t *stid)
3809{
3810 struct nfs4_file *fp;
3811 struct nfs4_delegation *dl;
3812
3813 dprintk("NFSD: %s: stateid=" STATEID_FMT "\n", __func__,
3814 STATEID_VAL(stid));
3815
3816 fp = find_file(ino);
3817 if (!fp)
3818 return NULL;
3819 dl = find_delegation_file(fp, stid);
3820 put_nfs4_file(fp);
3821 return dl;
3822}
3823 3759
3824/* 3760/*
3825 * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that 3761 * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that
@@ -3846,15 +3782,21 @@ static const struct lock_manager_operations nfsd_posix_mng_ops = {
3846static inline void 3782static inline void
3847nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) 3783nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
3848{ 3784{
3849 struct nfs4_stateowner *sop; 3785 struct nfs4_lockowner *lo;
3850 3786
3851 if (fl->fl_lmops == &nfsd_posix_mng_ops) { 3787 if (fl->fl_lmops == &nfsd_posix_mng_ops) {
3852 sop = (struct nfs4_stateowner *) fl->fl_owner; 3788 lo = (struct nfs4_lockowner *) fl->fl_owner;
3853 kref_get(&sop->so_ref); 3789 deny->ld_owner.data = kmemdup(lo->lo_owner.so_owner.data,
3854 deny->ld_sop = sop; 3790 lo->lo_owner.so_owner.len, GFP_KERNEL);
3855 deny->ld_clientid = sop->so_client->cl_clientid; 3791 if (!deny->ld_owner.data)
3792 /* We just don't care that much */
3793 goto nevermind;
3794 deny->ld_owner.len = lo->lo_owner.so_owner.len;
3795 deny->ld_clientid = lo->lo_owner.so_client->cl_clientid;
3856 } else { 3796 } else {
3857 deny->ld_sop = NULL; 3797nevermind:
3798 deny->ld_owner.len = 0;
3799 deny->ld_owner.data = NULL;
3858 deny->ld_clientid.cl_boot = 0; 3800 deny->ld_clientid.cl_boot = 0;
3859 deny->ld_clientid.cl_id = 0; 3801 deny->ld_clientid.cl_id = 0;
3860 } 3802 }
@@ -3867,8 +3809,8 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
3867 deny->ld_type = NFS4_WRITE_LT; 3809 deny->ld_type = NFS4_WRITE_LT;
3868} 3810}
3869 3811
3870static struct nfs4_stateowner * 3812static struct nfs4_lockowner *
3871find_lockstateowner_str(struct inode *inode, clientid_t *clid, 3813find_lockowner_str(struct inode *inode, clientid_t *clid,
3872 struct xdr_netobj *owner) 3814 struct xdr_netobj *owner)
3873{ 3815{
3874 unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner); 3816 unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner);
@@ -3876,11 +3818,17 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid,
3876 3818
3877 list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) { 3819 list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) {
3878 if (same_owner_str(op, owner, clid)) 3820 if (same_owner_str(op, owner, clid))
3879 return op; 3821 return lockowner(op);
3880 } 3822 }
3881 return NULL; 3823 return NULL;
3882} 3824}
3883 3825
3826static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp)
3827{
3828 list_add(&lo->lo_owner.so_strhash, &lock_ownerstr_hashtbl[strhashval]);
3829 list_add(&lo->lo_perstateid, &open_stp->st_lockowners);
3830}
3831
3884/* 3832/*
3885 * Alloc a lock owner structure. 3833 * Alloc a lock owner structure.
3886 * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has 3834 * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has
@@ -3889,67 +3837,40 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid,
3889 * strhashval = lock_ownerstr_hashval 3837 * strhashval = lock_ownerstr_hashval
3890 */ 3838 */
3891 3839
3892static struct nfs4_stateowner * 3840static struct nfs4_lockowner *
3893alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp, struct nfsd4_lock *lock) { 3841alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp, struct nfsd4_lock *lock) {
3894 struct nfs4_stateowner *sop; 3842 struct nfs4_lockowner *lo;
3895 struct nfs4_replay *rp;
3896 unsigned int idhashval;
3897 3843
3898 if (!(sop = alloc_stateowner(&lock->lk_new_owner))) 3844 lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp);
3845 if (!lo)
3899 return NULL; 3846 return NULL;
3900 idhashval = lockownerid_hashval(current_ownerid); 3847 INIT_LIST_HEAD(&lo->lo_owner.so_stateids);
3901 INIT_LIST_HEAD(&sop->so_idhash); 3848 lo->lo_owner.so_is_open_owner = 0;
3902 INIT_LIST_HEAD(&sop->so_strhash);
3903 INIT_LIST_HEAD(&sop->so_perclient);
3904 INIT_LIST_HEAD(&sop->so_stateids);
3905 INIT_LIST_HEAD(&sop->so_perstateid);
3906 INIT_LIST_HEAD(&sop->so_close_lru); /* not used */
3907 sop->so_time = 0;
3908 list_add(&sop->so_idhash, &lock_ownerid_hashtbl[idhashval]);
3909 list_add(&sop->so_strhash, &lock_ownerstr_hashtbl[strhashval]);
3910 list_add(&sop->so_perstateid, &open_stp->st_lockowners);
3911 sop->so_is_open_owner = 0;
3912 sop->so_id = current_ownerid++;
3913 sop->so_client = clp;
3914 /* It is the openowner seqid that will be incremented in encode in the 3849 /* It is the openowner seqid that will be incremented in encode in the
3915 * case of new lockowners; so increment the lock seqid manually: */ 3850 * case of new lockowners; so increment the lock seqid manually: */
3916 sop->so_seqid = lock->lk_new_lock_seqid + 1; 3851 lo->lo_owner.so_seqid = lock->lk_new_lock_seqid + 1;
3917 sop->so_confirmed = 1; 3852 hash_lockowner(lo, strhashval, clp, open_stp);
3918 rp = &sop->so_replay; 3853 return lo;
3919 rp->rp_status = nfserr_serverfault;
3920 rp->rp_buflen = 0;
3921 rp->rp_buf = rp->rp_ibuf;
3922 return sop;
3923} 3854}
3924 3855
3925static struct nfs4_stateid * 3856static struct nfs4_ol_stateid *
3926alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struct nfs4_stateid *open_stp) 3857alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_ol_stateid *open_stp)
3927{ 3858{
3928 struct nfs4_stateid *stp; 3859 struct nfs4_ol_stateid *stp;
3929 unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id); 3860 struct nfs4_client *clp = lo->lo_owner.so_client;
3930 3861
3931 stp = nfs4_alloc_stateid(); 3862 stp = nfs4_alloc_stateid(clp);
3932 if (stp == NULL) 3863 if (stp == NULL)
3933 goto out; 3864 return NULL;
3934 INIT_LIST_HEAD(&stp->st_hash); 3865 init_stid(&stp->st_stid, clp, NFS4_LOCK_STID);
3935 INIT_LIST_HEAD(&stp->st_perfile);
3936 INIT_LIST_HEAD(&stp->st_perstateowner);
3937 INIT_LIST_HEAD(&stp->st_lockowners); /* not used */
3938 list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]);
3939 list_add(&stp->st_perfile, &fp->fi_stateids); 3866 list_add(&stp->st_perfile, &fp->fi_stateids);
3940 list_add(&stp->st_perstateowner, &sop->so_stateids); 3867 list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
3941 stp->st_stateowner = sop; 3868 stp->st_stateowner = &lo->lo_owner;
3942 get_nfs4_file(fp); 3869 get_nfs4_file(fp);
3943 stp->st_file = fp; 3870 stp->st_file = fp;
3944 stp->st_stateid.si_boot = boot_time;
3945 stp->st_stateid.si_stateownerid = sop->so_id;
3946 stp->st_stateid.si_fileid = fp->fi_id;
3947 stp->st_stateid.si_generation = 0;
3948 stp->st_access_bmap = 0; 3871 stp->st_access_bmap = 0;
3949 stp->st_deny_bmap = open_stp->st_deny_bmap; 3872 stp->st_deny_bmap = open_stp->st_deny_bmap;
3950 stp->st_openstp = open_stp; 3873 stp->st_openstp = open_stp;
3951
3952out:
3953 return stp; 3874 return stp;
3954} 3875}
3955 3876
@@ -3960,7 +3881,7 @@ check_lock_length(u64 offset, u64 length)
3960 LOFF_OVERFLOW(offset, length))); 3881 LOFF_OVERFLOW(offset, length)));
3961} 3882}
3962 3883
3963static void get_lock_access(struct nfs4_stateid *lock_stp, u32 access) 3884static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access)
3964{ 3885{
3965 struct nfs4_file *fp = lock_stp->st_file; 3886 struct nfs4_file *fp = lock_stp->st_file;
3966 int oflag = nfs4_access_to_omode(access); 3887 int oflag = nfs4_access_to_omode(access);
@@ -3978,15 +3899,16 @@ __be32
3978nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 3899nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3979 struct nfsd4_lock *lock) 3900 struct nfsd4_lock *lock)
3980{ 3901{
3981 struct nfs4_stateowner *open_sop = NULL; 3902 struct nfs4_openowner *open_sop = NULL;
3982 struct nfs4_stateowner *lock_sop = NULL; 3903 struct nfs4_lockowner *lock_sop = NULL;
3983 struct nfs4_stateid *lock_stp; 3904 struct nfs4_ol_stateid *lock_stp;
3984 struct nfs4_file *fp; 3905 struct nfs4_file *fp;
3985 struct file *filp = NULL; 3906 struct file *filp = NULL;
3986 struct file_lock file_lock; 3907 struct file_lock file_lock;
3987 struct file_lock conflock; 3908 struct file_lock conflock;
3988 __be32 status = 0; 3909 __be32 status = 0;
3989 unsigned int strhashval; 3910 unsigned int strhashval;
3911 int lkflg;
3990 int err; 3912 int err;
3991 3913
3992 dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", 3914 dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
@@ -4010,7 +3932,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4010 * Use open owner and open stateid to create lock owner and 3932 * Use open owner and open stateid to create lock owner and
4011 * lock stateid. 3933 * lock stateid.
4012 */ 3934 */
4013 struct nfs4_stateid *open_stp = NULL; 3935 struct nfs4_ol_stateid *open_stp = NULL;
4014 3936
4015 status = nfserr_stale_clientid; 3937 status = nfserr_stale_clientid;
4016 if (!nfsd4_has_session(cstate) && 3938 if (!nfsd4_has_session(cstate) &&
@@ -4018,26 +3940,29 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4018 goto out; 3940 goto out;
4019 3941
4020 /* validate and update open stateid and open seqid */ 3942 /* validate and update open stateid and open seqid */
4021 status = nfs4_preprocess_seqid_op(cstate, 3943 status = nfs4_preprocess_confirmed_seqid_op(cstate,
4022 lock->lk_new_open_seqid, 3944 lock->lk_new_open_seqid,
4023 &lock->lk_new_open_stateid, 3945 &lock->lk_new_open_stateid,
4024 OPEN_STATE, 3946 &open_stp);
4025 &lock->lk_replay_owner, &open_stp,
4026 lock);
4027 if (status) 3947 if (status)
4028 goto out; 3948 goto out;
4029 open_sop = lock->lk_replay_owner; 3949 open_sop = openowner(open_stp->st_stateowner);
3950 status = nfserr_bad_stateid;
3951 if (!nfsd4_has_session(cstate) &&
3952 !same_clid(&open_sop->oo_owner.so_client->cl_clientid,
3953 &lock->v.new.clientid))
3954 goto out;
4030 /* create lockowner and lock stateid */ 3955 /* create lockowner and lock stateid */
4031 fp = open_stp->st_file; 3956 fp = open_stp->st_file;
4032 strhashval = lock_ownerstr_hashval(fp->fi_inode, 3957 strhashval = lock_ownerstr_hashval(fp->fi_inode,
4033 open_sop->so_client->cl_clientid.cl_id, 3958 open_sop->oo_owner.so_client->cl_clientid.cl_id,
4034 &lock->v.new.owner); 3959 &lock->v.new.owner);
4035 /* XXX: Do we need to check for duplicate stateowners on 3960 /* XXX: Do we need to check for duplicate stateowners on
4036 * the same file, or should they just be allowed (and 3961 * the same file, or should they just be allowed (and
4037 * create new stateids)? */ 3962 * create new stateids)? */
4038 status = nfserr_resource; 3963 status = nfserr_jukebox;
4039 lock_sop = alloc_init_lock_stateowner(strhashval, 3964 lock_sop = alloc_init_lock_stateowner(strhashval,
4040 open_sop->so_client, open_stp, lock); 3965 open_sop->oo_owner.so_client, open_stp, lock);
4041 if (lock_sop == NULL) 3966 if (lock_sop == NULL)
4042 goto out; 3967 goto out;
4043 lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp); 3968 lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp);
@@ -4046,16 +3971,20 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4046 } else { 3971 } else {
4047 /* lock (lock owner + lock stateid) already exists */ 3972 /* lock (lock owner + lock stateid) already exists */
4048 status = nfs4_preprocess_seqid_op(cstate, 3973 status = nfs4_preprocess_seqid_op(cstate,
4049 lock->lk_old_lock_seqid, 3974 lock->lk_old_lock_seqid,
4050 &lock->lk_old_lock_stateid, 3975 &lock->lk_old_lock_stateid,
4051 LOCK_STATE, 3976 NFS4_LOCK_STID, &lock_stp);
4052 &lock->lk_replay_owner, &lock_stp, lock);
4053 if (status) 3977 if (status)
4054 goto out; 3978 goto out;
4055 lock_sop = lock->lk_replay_owner; 3979 lock_sop = lockowner(lock_stp->st_stateowner);
4056 fp = lock_stp->st_file; 3980 fp = lock_stp->st_file;
4057 } 3981 }
4058 /* lock->lk_replay_owner and lock_stp have been created or found */ 3982 /* lock_sop and lock_stp have been created or found */
3983
3984 lkflg = setlkflg(lock->lk_type);
3985 status = nfs4_check_openmode(lock_stp, lkflg);
3986 if (status)
3987 goto out;
4059 3988
4060 status = nfserr_grace; 3989 status = nfserr_grace;
4061 if (locks_in_grace() && !lock->lk_reclaim) 3990 if (locks_in_grace() && !lock->lk_reclaim)
@@ -4106,8 +4035,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4106 err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock); 4035 err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock);
4107 switch (-err) { 4036 switch (-err) {
4108 case 0: /* success! */ 4037 case 0: /* success! */
4109 update_stateid(&lock_stp->st_stateid); 4038 update_stateid(&lock_stp->st_stid.sc_stateid);
4110 memcpy(&lock->lk_resp_stateid, &lock_stp->st_stateid, 4039 memcpy(&lock->lk_resp_stateid, &lock_stp->st_stid.sc_stateid,
4111 sizeof(stateid_t)); 4040 sizeof(stateid_t));
4112 status = 0; 4041 status = 0;
4113 break; 4042 break;
@@ -4119,19 +4048,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4119 case (EDEADLK): 4048 case (EDEADLK):
4120 status = nfserr_deadlock; 4049 status = nfserr_deadlock;
4121 break; 4050 break;
4122 default: 4051 default:
4123 dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err); 4052 dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err);
4124 status = nfserr_resource; 4053 status = nfserrno(err);
4125 break; 4054 break;
4126 } 4055 }
4127out: 4056out:
4128 if (status && lock->lk_is_new && lock_sop) 4057 if (status && lock->lk_is_new && lock_sop)
4129 release_lockowner(lock_sop); 4058 release_lockowner(lock_sop);
4130 if (lock->lk_replay_owner) { 4059 if (!cstate->replay_owner)
4131 nfs4_get_stateowner(lock->lk_replay_owner); 4060 nfs4_unlock_state();
4132 cstate->replay_owner = lock->lk_replay_owner;
4133 }
4134 nfs4_unlock_state();
4135 return status; 4061 return status;
4136} 4062}
4137 4063
@@ -4163,6 +4089,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4163{ 4089{
4164 struct inode *inode; 4090 struct inode *inode;
4165 struct file_lock file_lock; 4091 struct file_lock file_lock;
4092 struct nfs4_lockowner *lo;
4166 int error; 4093 int error;
4167 __be32 status; 4094 __be32 status;
4168 4095
@@ -4172,19 +4099,14 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4172 if (check_lock_length(lockt->lt_offset, lockt->lt_length)) 4099 if (check_lock_length(lockt->lt_offset, lockt->lt_length))
4173 return nfserr_inval; 4100 return nfserr_inval;
4174 4101
4175 lockt->lt_stateowner = NULL;
4176 nfs4_lock_state(); 4102 nfs4_lock_state();
4177 4103
4178 status = nfserr_stale_clientid; 4104 status = nfserr_stale_clientid;
4179 if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid)) 4105 if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid))
4180 goto out; 4106 goto out;
4181 4107
4182 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) { 4108 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
4183 dprintk("NFSD: nfsd4_lockt: fh_verify() failed!\n");
4184 if (status == nfserr_symlink)
4185 status = nfserr_inval;
4186 goto out; 4109 goto out;
4187 }
4188 4110
4189 inode = cstate->current_fh.fh_dentry->d_inode; 4111 inode = cstate->current_fh.fh_dentry->d_inode;
4190 locks_init_lock(&file_lock); 4112 locks_init_lock(&file_lock);
@@ -4203,10 +4125,9 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4203 goto out; 4125 goto out;
4204 } 4126 }
4205 4127
4206 lockt->lt_stateowner = find_lockstateowner_str(inode, 4128 lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner);
4207 &lockt->lt_clientid, &lockt->lt_owner); 4129 if (lo)
4208 if (lockt->lt_stateowner) 4130 file_lock.fl_owner = (fl_owner_t)lo;
4209 file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner;
4210 file_lock.fl_pid = current->tgid; 4131 file_lock.fl_pid = current->tgid;
4211 file_lock.fl_flags = FL_POSIX; 4132 file_lock.fl_flags = FL_POSIX;
4212 4133
@@ -4234,7 +4155,7 @@ __be32
4234nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 4155nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4235 struct nfsd4_locku *locku) 4156 struct nfsd4_locku *locku)
4236{ 4157{
4237 struct nfs4_stateid *stp; 4158 struct nfs4_ol_stateid *stp;
4238 struct file *filp = NULL; 4159 struct file *filp = NULL;
4239 struct file_lock file_lock; 4160 struct file_lock file_lock;
4240 __be32 status; 4161 __be32 status;
@@ -4249,13 +4170,10 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4249 4170
4250 nfs4_lock_state(); 4171 nfs4_lock_state();
4251 4172
4252 if ((status = nfs4_preprocess_seqid_op(cstate, 4173 status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid,
4253 locku->lu_seqid, 4174 &locku->lu_stateid, NFS4_LOCK_STID, &stp);
4254 &locku->lu_stateid, 4175 if (status)
4255 LOCK_STATE,
4256 &locku->lu_stateowner, &stp, NULL)))
4257 goto out; 4176 goto out;
4258
4259 filp = find_any_file(stp->st_file); 4177 filp = find_any_file(stp->st_file);
4260 if (!filp) { 4178 if (!filp) {
4261 status = nfserr_lock_range; 4179 status = nfserr_lock_range;
@@ -4264,7 +4182,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4264 BUG_ON(!filp); 4182 BUG_ON(!filp);
4265 locks_init_lock(&file_lock); 4183 locks_init_lock(&file_lock);
4266 file_lock.fl_type = F_UNLCK; 4184 file_lock.fl_type = F_UNLCK;
4267 file_lock.fl_owner = (fl_owner_t) locku->lu_stateowner; 4185 file_lock.fl_owner = (fl_owner_t)lockowner(stp->st_stateowner);
4268 file_lock.fl_pid = current->tgid; 4186 file_lock.fl_pid = current->tgid;
4269 file_lock.fl_file = filp; 4187 file_lock.fl_file = filp;
4270 file_lock.fl_flags = FL_POSIX; 4188 file_lock.fl_flags = FL_POSIX;
@@ -4285,15 +4203,12 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4285 /* 4203 /*
4286 * OK, unlock succeeded; the only thing left to do is update the stateid. 4204 * OK, unlock succeeded; the only thing left to do is update the stateid.
4287 */ 4205 */
4288 update_stateid(&stp->st_stateid); 4206 update_stateid(&stp->st_stid.sc_stateid);
4289 memcpy(&locku->lu_stateid, &stp->st_stateid, sizeof(stateid_t)); 4207 memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
4290 4208
4291out: 4209out:
4292 if (locku->lu_stateowner) { 4210 if (!cstate->replay_owner)
4293 nfs4_get_stateowner(locku->lu_stateowner); 4211 nfs4_unlock_state();
4294 cstate->replay_owner = locku->lu_stateowner;
4295 }
4296 nfs4_unlock_state();
4297 return status; 4212 return status;
4298 4213
4299out_nfserr: 4214out_nfserr:
@@ -4307,7 +4222,7 @@ out_nfserr:
4307 * 0: no locks held by lockowner 4222 * 0: no locks held by lockowner
4308 */ 4223 */
4309static int 4224static int
4310check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner) 4225check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner)
4311{ 4226{
4312 struct file_lock **flpp; 4227 struct file_lock **flpp;
4313 struct inode *inode = filp->fi_inode; 4228 struct inode *inode = filp->fi_inode;
@@ -4332,7 +4247,8 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
4332{ 4247{
4333 clientid_t *clid = &rlockowner->rl_clientid; 4248 clientid_t *clid = &rlockowner->rl_clientid;
4334 struct nfs4_stateowner *sop; 4249 struct nfs4_stateowner *sop;
4335 struct nfs4_stateid *stp; 4250 struct nfs4_lockowner *lo;
4251 struct nfs4_ol_stateid *stp;
4336 struct xdr_netobj *owner = &rlockowner->rl_owner; 4252 struct xdr_netobj *owner = &rlockowner->rl_owner;
4337 struct list_head matches; 4253 struct list_head matches;
4338 int i; 4254 int i;
@@ -4356,16 +4272,15 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
4356 * data structures. */ 4272 * data structures. */
4357 INIT_LIST_HEAD(&matches); 4273 INIT_LIST_HEAD(&matches);
4358 for (i = 0; i < LOCK_HASH_SIZE; i++) { 4274 for (i = 0; i < LOCK_HASH_SIZE; i++) {
4359 list_for_each_entry(sop, &lock_ownerid_hashtbl[i], so_idhash) { 4275 list_for_each_entry(sop, &lock_ownerstr_hashtbl[i], so_strhash) {
4360 if (!same_owner_str(sop, owner, clid)) 4276 if (!same_owner_str(sop, owner, clid))
4361 continue; 4277 continue;
4362 list_for_each_entry(stp, &sop->so_stateids, 4278 list_for_each_entry(stp, &sop->so_stateids,
4363 st_perstateowner) { 4279 st_perstateowner) {
4364 if (check_for_locks(stp->st_file, sop)) 4280 lo = lockowner(sop);
4281 if (check_for_locks(stp->st_file, lo))
4365 goto out; 4282 goto out;
4366 /* Note: so_perclient unused for lockowners, 4283 list_add(&lo->lo_list, &matches);
4367 * so it's OK to fool with here. */
4368 list_add(&sop->so_perclient, &matches);
4369 } 4284 }
4370 } 4285 }
4371 } 4286 }
@@ -4374,12 +4289,12 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
4374 * have been checked. */ 4289 * have been checked. */
4375 status = nfs_ok; 4290 status = nfs_ok;
4376 while (!list_empty(&matches)) { 4291 while (!list_empty(&matches)) {
4377 sop = list_entry(matches.next, struct nfs4_stateowner, 4292 lo = list_entry(matches.next, struct nfs4_lockowner,
4378 so_perclient); 4293 lo_list);
4379 /* unhash_stateowner deletes so_perclient only 4294 /* unhash_stateowner deletes so_perclient only
4380 * for openowners. */ 4295 * for openowners. */
4381 list_del(&sop->so_perclient); 4296 list_del(&lo->lo_list);
4382 release_lockowner(sop); 4297 release_lockowner(lo);
4383 } 4298 }
4384out: 4299out:
4385 nfs4_unlock_state(); 4300 nfs4_unlock_state();
@@ -4501,16 +4416,10 @@ nfs4_state_init(void)
4501 for (i = 0; i < FILE_HASH_SIZE; i++) { 4416 for (i = 0; i < FILE_HASH_SIZE; i++) {
4502 INIT_LIST_HEAD(&file_hashtbl[i]); 4417 INIT_LIST_HEAD(&file_hashtbl[i]);
4503 } 4418 }
4504 for (i = 0; i < OWNER_HASH_SIZE; i++) { 4419 for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) {
4505 INIT_LIST_HEAD(&ownerstr_hashtbl[i]); 4420 INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]);
4506 INIT_LIST_HEAD(&ownerid_hashtbl[i]);
4507 }
4508 for (i = 0; i < STATEID_HASH_SIZE; i++) {
4509 INIT_LIST_HEAD(&stateid_hashtbl[i]);
4510 INIT_LIST_HEAD(&lockstateid_hashtbl[i]);
4511 } 4421 }
4512 for (i = 0; i < LOCK_HASH_SIZE; i++) { 4422 for (i = 0; i < LOCK_HASH_SIZE; i++) {
4513 INIT_LIST_HEAD(&lock_ownerid_hashtbl[i]);
4514 INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]); 4423 INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]);
4515 } 4424 }
4516 memset(&onestateid, ~0, sizeof(stateid_t)); 4425 memset(&onestateid, ~0, sizeof(stateid_t));
@@ -4527,7 +4436,7 @@ nfsd4_load_reboot_recovery_data(void)
4527 int status; 4436 int status;
4528 4437
4529 nfs4_lock_state(); 4438 nfs4_lock_state();
4530 nfsd4_init_recdir(user_recovery_dirname); 4439 nfsd4_init_recdir();
4531 status = nfsd4_recdir_load(); 4440 status = nfsd4_recdir_load();
4532 nfs4_unlock_state(); 4441 nfs4_unlock_state();
4533 if (status) 4442 if (status)
@@ -4636,40 +4545,3 @@ nfs4_state_shutdown(void)
4636 nfs4_unlock_state(); 4545 nfs4_unlock_state();
4637 nfsd4_destroy_callback_queue(); 4546 nfsd4_destroy_callback_queue();
4638} 4547}
4639
4640/*
4641 * user_recovery_dirname is protected by the nfsd_mutex since it's only
4642 * accessed when nfsd is starting.
4643 */
4644static void
4645nfs4_set_recdir(char *recdir)
4646{
4647 strcpy(user_recovery_dirname, recdir);
4648}
4649
4650/*
4651 * Change the NFSv4 recovery directory to recdir.
4652 */
4653int
4654nfs4_reset_recoverydir(char *recdir)
4655{
4656 int status;
4657 struct path path;
4658
4659 status = kern_path(recdir, LOOKUP_FOLLOW, &path);
4660 if (status)
4661 return status;
4662 status = -ENOTDIR;
4663 if (S_ISDIR(path.dentry->d_inode->i_mode)) {
4664 nfs4_set_recdir(recdir);
4665 status = 0;
4666 }
4667 path_put(&path);
4668 return status;
4669}
4670
4671char *
4672nfs4_recoverydir(void)
4673{
4674 return user_recovery_dirname;
4675}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c8bf405d19d..b6fa792d6b8 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -456,7 +456,6 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
456{ 456{
457 DECODE_HEAD; 457 DECODE_HEAD;
458 458
459 close->cl_stateowner = NULL;
460 READ_BUF(4); 459 READ_BUF(4);
461 READ32(close->cl_seqid); 460 READ32(close->cl_seqid);
462 return nfsd4_decode_stateid(argp, &close->cl_stateid); 461 return nfsd4_decode_stateid(argp, &close->cl_stateid);
@@ -551,7 +550,6 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
551{ 550{
552 DECODE_HEAD; 551 DECODE_HEAD;
553 552
554 lock->lk_replay_owner = NULL;
555 /* 553 /*
556 * type, reclaim(boolean), offset, length, new_lock_owner(boolean) 554 * type, reclaim(boolean), offset, length, new_lock_owner(boolean)
557 */ 555 */
@@ -611,7 +609,6 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
611{ 609{
612 DECODE_HEAD; 610 DECODE_HEAD;
613 611
614 locku->lu_stateowner = NULL;
615 READ_BUF(8); 612 READ_BUF(8);
616 READ32(locku->lu_type); 613 READ32(locku->lu_type);
617 if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) 614 if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT))
@@ -642,6 +639,83 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup
642 DECODE_TAIL; 639 DECODE_TAIL;
643} 640}
644 641
642static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x)
643{
644 __be32 *p;
645 u32 w;
646
647 READ_BUF(4);
648 READ32(w);
649 *x = w;
650 switch (w & NFS4_SHARE_ACCESS_MASK) {
651 case NFS4_SHARE_ACCESS_READ:
652 case NFS4_SHARE_ACCESS_WRITE:
653 case NFS4_SHARE_ACCESS_BOTH:
654 break;
655 default:
656 return nfserr_bad_xdr;
657 }
658 w &= ~NFS4_SHARE_ACCESS_MASK;
659 if (!w)
660 return nfs_ok;
661 if (!argp->minorversion)
662 return nfserr_bad_xdr;
663 switch (w & NFS4_SHARE_WANT_MASK) {
664 case NFS4_SHARE_WANT_NO_PREFERENCE:
665 case NFS4_SHARE_WANT_READ_DELEG:
666 case NFS4_SHARE_WANT_WRITE_DELEG:
667 case NFS4_SHARE_WANT_ANY_DELEG:
668 case NFS4_SHARE_WANT_NO_DELEG:
669 case NFS4_SHARE_WANT_CANCEL:
670 break;
671 default:
672 return nfserr_bad_xdr;
673 }
674 w &= ~NFS4_SHARE_WANT_MASK;
675 if (!w)
676 return nfs_ok;
677 switch (w) {
678 case NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL:
679 case NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED:
680 case (NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL |
681 NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED):
682 return nfs_ok;
683 }
684xdr_error:
685 return nfserr_bad_xdr;
686}
687
688static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x)
689{
690 __be32 *p;
691
692 READ_BUF(4);
693 READ32(*x);
694 /* Note: unlinke access bits, deny bits may be zero. */
695 if (*x & ~NFS4_SHARE_DENY_BOTH)
696 return nfserr_bad_xdr;
697 return nfs_ok;
698xdr_error:
699 return nfserr_bad_xdr;
700}
701
702static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o)
703{
704 __be32 *p;
705
706 READ_BUF(4);
707 READ32(o->len);
708
709 if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT)
710 return nfserr_bad_xdr;
711
712 READ_BUF(o->len);
713 SAVEMEM(o->data, o->len);
714 return nfs_ok;
715xdr_error:
716 return nfserr_bad_xdr;
717}
718
645static __be32 719static __be32
646nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) 720nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
647{ 721{
@@ -649,19 +723,23 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
649 723
650 memset(open->op_bmval, 0, sizeof(open->op_bmval)); 724 memset(open->op_bmval, 0, sizeof(open->op_bmval));
651 open->op_iattr.ia_valid = 0; 725 open->op_iattr.ia_valid = 0;
652 open->op_stateowner = NULL; 726 open->op_openowner = NULL;
653 727
654 /* seqid, share_access, share_deny, clientid, ownerlen */ 728 /* seqid, share_access, share_deny, clientid, ownerlen */
655 READ_BUF(16 + sizeof(clientid_t)); 729 READ_BUF(4);
656 READ32(open->op_seqid); 730 READ32(open->op_seqid);
657 READ32(open->op_share_access); 731 status = nfsd4_decode_share_access(argp, &open->op_share_access);
658 READ32(open->op_share_deny); 732 if (status)
733 goto xdr_error;
734 status = nfsd4_decode_share_deny(argp, &open->op_share_deny);
735 if (status)
736 goto xdr_error;
737 READ_BUF(sizeof(clientid_t));
659 COPYMEM(&open->op_clientid, sizeof(clientid_t)); 738 COPYMEM(&open->op_clientid, sizeof(clientid_t));
660 READ32(open->op_owner.len); 739 status = nfsd4_decode_opaque(argp, &open->op_owner);
661 740 if (status)
662 /* owner, open_flag */ 741 goto xdr_error;
663 READ_BUF(open->op_owner.len + 4); 742 READ_BUF(4);
664 SAVEMEM(open->op_owner.data, open->op_owner.len);
665 READ32(open->op_create); 743 READ32(open->op_create);
666 switch (open->op_create) { 744 switch (open->op_create) {
667 case NFS4_OPEN_NOCREATE: 745 case NFS4_OPEN_NOCREATE:
@@ -727,6 +805,19 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
727 if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval))) 805 if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval)))
728 return status; 806 return status;
729 break; 807 break;
808 case NFS4_OPEN_CLAIM_FH:
809 case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
810 if (argp->minorversion < 1)
811 goto xdr_error;
812 /* void */
813 break;
814 case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
815 if (argp->minorversion < 1)
816 goto xdr_error;
817 status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
818 if (status)
819 return status;
820 break;
730 default: 821 default:
731 goto xdr_error; 822 goto xdr_error;
732 } 823 }
@@ -739,7 +830,6 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
739{ 830{
740 DECODE_HEAD; 831 DECODE_HEAD;
741 832
742 open_conf->oc_stateowner = NULL;
743 status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); 833 status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid);
744 if (status) 834 if (status)
745 return status; 835 return status;
@@ -754,15 +844,17 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
754{ 844{
755 DECODE_HEAD; 845 DECODE_HEAD;
756 846
757 open_down->od_stateowner = NULL;
758 status = nfsd4_decode_stateid(argp, &open_down->od_stateid); 847 status = nfsd4_decode_stateid(argp, &open_down->od_stateid);
759 if (status) 848 if (status)
760 return status; 849 return status;
761 READ_BUF(12); 850 READ_BUF(4);
762 READ32(open_down->od_seqid); 851 READ32(open_down->od_seqid);
763 READ32(open_down->od_share_access); 852 status = nfsd4_decode_share_access(argp, &open_down->od_share_access);
764 READ32(open_down->od_share_deny); 853 if (status)
765 854 return status;
855 status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny);
856 if (status)
857 return status;
766 DECODE_TAIL; 858 DECODE_TAIL;
767} 859}
768 860
@@ -903,12 +995,13 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
903{ 995{
904 DECODE_HEAD; 996 DECODE_HEAD;
905 997
906 READ_BUF(12); 998 READ_BUF(8);
907 COPYMEM(setclientid->se_verf.data, 8); 999 COPYMEM(setclientid->se_verf.data, 8);
908 READ32(setclientid->se_namelen);
909 1000
910 READ_BUF(setclientid->se_namelen + 8); 1001 status = nfsd4_decode_opaque(argp, &setclientid->se_name);
911 SAVEMEM(setclientid->se_name, setclientid->se_namelen); 1002 if (status)
1003 return nfserr_bad_xdr;
1004 READ_BUF(8);
912 READ32(setclientid->se_callback_prog); 1005 READ32(setclientid->se_callback_prog);
913 READ32(setclientid->se_callback_netid_len); 1006 READ32(setclientid->se_callback_netid_len);
914 1007
@@ -1051,11 +1144,9 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1051 READ_BUF(NFS4_VERIFIER_SIZE); 1144 READ_BUF(NFS4_VERIFIER_SIZE);
1052 COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE); 1145 COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE);
1053 1146
1054 READ_BUF(4); 1147 status = nfsd4_decode_opaque(argp, &exid->clname);
1055 READ32(exid->clname.len); 1148 if (status)
1056 1149 return nfserr_bad_xdr;
1057 READ_BUF(exid->clname.len);
1058 SAVEMEM(exid->clname.data, exid->clname.len);
1059 1150
1060 READ_BUF(4); 1151 READ_BUF(4);
1061 READ32(exid->flags); 1152 READ32(exid->flags);
@@ -1326,6 +1417,16 @@ xdr_error:
1326 goto out; 1417 goto out;
1327} 1418}
1328 1419
1420static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_clientid *dc)
1421{
1422 DECODE_HEAD;
1423
1424 READ_BUF(8);
1425 COPYMEM(&dc->clientid, 8);
1426
1427 DECODE_TAIL;
1428}
1429
1329static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc) 1430static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc)
1330{ 1431{
1331 DECODE_HEAD; 1432 DECODE_HEAD;
@@ -1447,7 +1548,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
1447 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, 1548 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
1448 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid, 1549 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid,
1449 [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, 1550 [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
1450 [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp, 1551 [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid,
1451 [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, 1552 [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete,
1452}; 1553};
1453 1554
@@ -1630,15 +1731,20 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
1630 * we know whether the error to be returned is a sequence id mutating error. 1731 * we know whether the error to be returned is a sequence id mutating error.
1631 */ 1732 */
1632 1733
1633#define ENCODE_SEQID_OP_TAIL(stateowner) do { \ 1734static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, __be32 nfserr)
1634 if (seqid_mutating_err(nfserr) && stateowner) { \ 1735{
1635 stateowner->so_seqid++; \ 1736 struct nfs4_stateowner *stateowner = resp->cstate.replay_owner;
1636 stateowner->so_replay.rp_status = nfserr; \ 1737
1637 stateowner->so_replay.rp_buflen = \ 1738 if (seqid_mutating_err(ntohl(nfserr)) && stateowner) {
1638 (((char *)(resp)->p - (char *)save)); \ 1739 stateowner->so_seqid++;
1639 memcpy(stateowner->so_replay.rp_buf, save, \ 1740 stateowner->so_replay.rp_status = nfserr;
1640 stateowner->so_replay.rp_buflen); \ 1741 stateowner->so_replay.rp_buflen =
1641 } } while (0); 1742 (char *)resp->p - (char *)save;
1743 memcpy(stateowner->so_replay.rp_buf, save,
1744 stateowner->so_replay.rp_buflen);
1745 nfsd4_purge_closed_stateid(stateowner);
1746 }
1747}
1642 1748
1643/* Encode as an array of strings the string given with components 1749/* Encode as an array of strings the string given with components
1644 * separated @sep. 1750 * separated @sep.
@@ -1697,36 +1803,89 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
1697} 1803}
1698 1804
1699/* 1805/*
1700 * Return the path to an export point in the pseudo filesystem namespace 1806 * Encode a path in RFC3530 'pathname4' format
1701 * Returned string is safe to use as long as the caller holds a reference
1702 * to @exp.
1703 */ 1807 */
1704static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat) 1808static __be32 nfsd4_encode_path(const struct path *root,
1809 const struct path *path, __be32 **pp, int *buflen)
1705{ 1810{
1706 struct svc_fh tmp_fh; 1811 struct path cur = {
1707 char *path = NULL, *rootpath; 1812 .mnt = path->mnt,
1708 size_t rootlen; 1813 .dentry = path->dentry,
1814 };
1815 __be32 *p = *pp;
1816 struct dentry **components = NULL;
1817 unsigned int ncomponents = 0;
1818 __be32 err = nfserr_jukebox;
1709 1819
1710 fh_init(&tmp_fh, NFS4_FHSIZE); 1820 dprintk("nfsd4_encode_components(");
1711 *stat = exp_pseudoroot(rqstp, &tmp_fh);
1712 if (*stat)
1713 return NULL;
1714 rootpath = tmp_fh.fh_export->ex_pathname;
1715 1821
1716 path = exp->ex_pathname; 1822 path_get(&cur);
1823 /* First walk the path up to the nfsd root, and store the
1824 * dentries/path components in an array.
1825 */
1826 for (;;) {
1827 if (cur.dentry == root->dentry && cur.mnt == root->mnt)
1828 break;
1829 if (cur.dentry == cur.mnt->mnt_root) {
1830 if (follow_up(&cur))
1831 continue;
1832 goto out_free;
1833 }
1834 if ((ncomponents & 15) == 0) {
1835 struct dentry **new;
1836 new = krealloc(components,
1837 sizeof(*new) * (ncomponents + 16),
1838 GFP_KERNEL);
1839 if (!new)
1840 goto out_free;
1841 components = new;
1842 }
1843 components[ncomponents++] = cur.dentry;
1844 cur.dentry = dget_parent(cur.dentry);
1845 }
1717 1846
1718 rootlen = strlen(rootpath); 1847 *buflen -= 4;
1719 if (strncmp(path, rootpath, rootlen)) { 1848 if (*buflen < 0)
1720 dprintk("nfsd: fs_locations failed;" 1849 goto out_free;
1721 "%s is not contained in %s\n", path, rootpath); 1850 WRITE32(ncomponents);
1722 *stat = nfserr_notsupp; 1851
1723 path = NULL; 1852 while (ncomponents) {
1724 goto out; 1853 struct dentry *dentry = components[ncomponents - 1];
1854 unsigned int len = dentry->d_name.len;
1855
1856 *buflen -= 4 + (XDR_QUADLEN(len) << 2);
1857 if (*buflen < 0)
1858 goto out_free;
1859 WRITE32(len);
1860 WRITEMEM(dentry->d_name.name, len);
1861 dprintk("/%s", dentry->d_name.name);
1862 dput(dentry);
1863 ncomponents--;
1725 } 1864 }
1726 path += rootlen; 1865
1727out: 1866 *pp = p;
1728 fh_put(&tmp_fh); 1867 err = 0;
1729 return path; 1868out_free:
1869 dprintk(")\n");
1870 while (ncomponents)
1871 dput(components[--ncomponents]);
1872 kfree(components);
1873 path_put(&cur);
1874 return err;
1875}
1876
1877static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp,
1878 const struct path *path, __be32 **pp, int *buflen)
1879{
1880 struct svc_export *exp_ps;
1881 __be32 res;
1882
1883 exp_ps = rqst_find_fsidzero_export(rqstp);
1884 if (IS_ERR(exp_ps))
1885 return nfserrno(PTR_ERR(exp_ps));
1886 res = nfsd4_encode_path(&exp_ps->ex_path, path, pp, buflen);
1887 exp_put(exp_ps);
1888 return res;
1730} 1889}
1731 1890
1732/* 1891/*
@@ -1740,11 +1899,8 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp,
1740 int i; 1899 int i;
1741 __be32 *p = *pp; 1900 __be32 *p = *pp;
1742 struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; 1901 struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs;
1743 char *root = nfsd4_path(rqstp, exp, &status);
1744 1902
1745 if (status) 1903 status = nfsd4_encode_fsloc_fsroot(rqstp, &exp->ex_path, &p, buflen);
1746 return status;
1747 status = nfsd4_encode_components('/', root, &p, buflen);
1748 if (status) 1904 if (status)
1749 return status; 1905 return status;
1750 if ((*buflen -= 4) < 0) 1906 if ((*buflen -= 4) < 0)
@@ -1760,12 +1916,19 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp,
1760 return 0; 1916 return 0;
1761} 1917}
1762 1918
1763static u32 nfs4_ftypes[16] = { 1919static u32 nfs4_file_type(umode_t mode)
1764 NF4BAD, NF4FIFO, NF4CHR, NF4BAD, 1920{
1765 NF4DIR, NF4BAD, NF4BLK, NF4BAD, 1921 switch (mode & S_IFMT) {
1766 NF4REG, NF4BAD, NF4LNK, NF4BAD, 1922 case S_IFIFO: return NF4FIFO;
1767 NF4SOCK, NF4BAD, NF4LNK, NF4BAD, 1923 case S_IFCHR: return NF4CHR;
1768}; 1924 case S_IFDIR: return NF4DIR;
1925 case S_IFBLK: return NF4BLK;
1926 case S_IFLNK: return NF4LNK;
1927 case S_IFREG: return NF4REG;
1928 case S_IFSOCK: return NF4SOCK;
1929 default: return NF4BAD;
1930 };
1931}
1769 1932
1770static __be32 1933static __be32
1771nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group, 1934nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
@@ -1954,7 +2117,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1954 if (bmval0 & FATTR4_WORD0_TYPE) { 2117 if (bmval0 & FATTR4_WORD0_TYPE) {
1955 if ((buflen -= 4) < 0) 2118 if ((buflen -= 4) < 0)
1956 goto out_resource; 2119 goto out_resource;
1957 dummy = nfs4_ftypes[(stat.mode & S_IFMT) >> 12]; 2120 dummy = nfs4_file_type(stat.mode);
1958 if (dummy == NF4BAD) 2121 if (dummy == NF4BAD)
1959 goto out_serverfault; 2122 goto out_serverfault;
1960 WRITE32(dummy); 2123 WRITE32(dummy);
@@ -2488,7 +2651,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
2488 if (!nfserr) 2651 if (!nfserr)
2489 nfsd4_encode_stateid(resp, &close->cl_stateid); 2652 nfsd4_encode_stateid(resp, &close->cl_stateid);
2490 2653
2491 ENCODE_SEQID_OP_TAIL(close->cl_stateowner); 2654 encode_seqid_op_tail(resp, save, nfserr);
2492 return nfserr; 2655 return nfserr;
2493} 2656}
2494 2657
@@ -2564,17 +2727,18 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
2564static void 2727static void
2565nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld) 2728nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld)
2566{ 2729{
2730 struct xdr_netobj *conf = &ld->ld_owner;
2567 __be32 *p; 2731 __be32 *p;
2568 2732
2569 RESERVE_SPACE(32 + XDR_LEN(ld->ld_sop ? ld->ld_sop->so_owner.len : 0)); 2733 RESERVE_SPACE(32 + XDR_LEN(conf->len));
2570 WRITE64(ld->ld_start); 2734 WRITE64(ld->ld_start);
2571 WRITE64(ld->ld_length); 2735 WRITE64(ld->ld_length);
2572 WRITE32(ld->ld_type); 2736 WRITE32(ld->ld_type);
2573 if (ld->ld_sop) { 2737 if (conf->len) {
2574 WRITEMEM(&ld->ld_clientid, 8); 2738 WRITEMEM(&ld->ld_clientid, 8);
2575 WRITE32(ld->ld_sop->so_owner.len); 2739 WRITE32(conf->len);
2576 WRITEMEM(ld->ld_sop->so_owner.data, ld->ld_sop->so_owner.len); 2740 WRITEMEM(conf->data, conf->len);
2577 kref_put(&ld->ld_sop->so_ref, nfs4_free_stateowner); 2741 kfree(conf->data);
2578 } else { /* non - nfsv4 lock in conflict, no clientid nor owner */ 2742 } else { /* non - nfsv4 lock in conflict, no clientid nor owner */
2579 WRITE64((u64)0); /* clientid */ 2743 WRITE64((u64)0); /* clientid */
2580 WRITE32(0); /* length of owner name */ 2744 WRITE32(0); /* length of owner name */
@@ -2592,7 +2756,7 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo
2592 else if (nfserr == nfserr_denied) 2756 else if (nfserr == nfserr_denied)
2593 nfsd4_encode_lock_denied(resp, &lock->lk_denied); 2757 nfsd4_encode_lock_denied(resp, &lock->lk_denied);
2594 2758
2595 ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner); 2759 encode_seqid_op_tail(resp, save, nfserr);
2596 return nfserr; 2760 return nfserr;
2597} 2761}
2598 2762
@@ -2612,7 +2776,7 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
2612 if (!nfserr) 2776 if (!nfserr)
2613 nfsd4_encode_stateid(resp, &locku->lu_stateid); 2777 nfsd4_encode_stateid(resp, &locku->lu_stateid);
2614 2778
2615 ENCODE_SEQID_OP_TAIL(locku->lu_stateowner); 2779 encode_seqid_op_tail(resp, save, nfserr);
2616 return nfserr; 2780 return nfserr;
2617} 2781}
2618 2782
@@ -2693,7 +2857,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
2693 } 2857 }
2694 /* XXX save filehandle here */ 2858 /* XXX save filehandle here */
2695out: 2859out:
2696 ENCODE_SEQID_OP_TAIL(open->op_stateowner); 2860 encode_seqid_op_tail(resp, save, nfserr);
2697 return nfserr; 2861 return nfserr;
2698} 2862}
2699 2863
@@ -2705,7 +2869,7 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct
2705 if (!nfserr) 2869 if (!nfserr)
2706 nfsd4_encode_stateid(resp, &oc->oc_resp_stateid); 2870 nfsd4_encode_stateid(resp, &oc->oc_resp_stateid);
2707 2871
2708 ENCODE_SEQID_OP_TAIL(oc->oc_stateowner); 2872 encode_seqid_op_tail(resp, save, nfserr);
2709 return nfserr; 2873 return nfserr;
2710} 2874}
2711 2875
@@ -2717,7 +2881,7 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struc
2717 if (!nfserr) 2881 if (!nfserr)
2718 nfsd4_encode_stateid(resp, &od->od_stateid); 2882 nfsd4_encode_stateid(resp, &od->od_stateid);
2719 2883
2720 ENCODE_SEQID_OP_TAIL(od->od_stateowner); 2884 encode_seqid_op_tail(resp, save, nfserr);
2721 return nfserr; 2885 return nfserr;
2722} 2886}
2723 2887
@@ -2759,8 +2923,6 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
2759 read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, 2923 read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
2760 &maxcount); 2924 &maxcount);
2761 2925
2762 if (nfserr == nfserr_symlink)
2763 nfserr = nfserr_inval;
2764 if (nfserr) 2926 if (nfserr)
2765 return nfserr; 2927 return nfserr;
2766 eof = (read->rd_offset + maxcount >= 2928 eof = (read->rd_offset + maxcount >=
@@ -2886,8 +3048,6 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
2886 readdir->common.err == nfserr_toosmall && 3048 readdir->common.err == nfserr_toosmall &&
2887 readdir->buffer == page) 3049 readdir->buffer == page)
2888 nfserr = nfserr_toosmall; 3050 nfserr = nfserr_toosmall;
2889 if (nfserr == nfserr_symlink)
2890 nfserr = nfserr_notdir;
2891 if (nfserr) 3051 if (nfserr)
2892 goto err_no_verf; 3052 goto err_no_verf;
2893 3053
@@ -3218,9 +3378,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3218 WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); 3378 WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
3219 WRITE32(seq->seqid); 3379 WRITE32(seq->seqid);
3220 WRITE32(seq->slotid); 3380 WRITE32(seq->slotid);
3221 WRITE32(seq->maxslots); 3381 /* Note slotid's are numbered from zero: */
3222 /* For now: target_maxslots = maxslots */ 3382 WRITE32(seq->maxslots - 1); /* sr_highest_slotid */
3223 WRITE32(seq->maxslots); 3383 WRITE32(seq->maxslots - 1); /* sr_target_highest_slotid */
3224 WRITE32(seq->status_flags); 3384 WRITE32(seq->status_flags);
3225 3385
3226 ADJUST_ARGS(); 3386 ADJUST_ARGS();
@@ -3233,6 +3393,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr,
3233 struct nfsd4_test_stateid *test_stateid) 3393 struct nfsd4_test_stateid *test_stateid)
3234{ 3394{
3235 struct nfsd4_compoundargs *argp; 3395 struct nfsd4_compoundargs *argp;
3396 struct nfs4_client *cl = resp->cstate.session->se_client;
3236 stateid_t si; 3397 stateid_t si;
3237 __be32 *p; 3398 __be32 *p;
3238 int i; 3399 int i;
@@ -3248,7 +3409,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr,
3248 nfs4_lock_state(); 3409 nfs4_lock_state();
3249 for (i = 0; i < test_stateid->ts_num_ids; i++) { 3410 for (i = 0; i < test_stateid->ts_num_ids; i++) {
3250 nfsd4_decode_stateid(argp, &si); 3411 nfsd4_decode_stateid(argp, &si);
3251 valid = nfs4_validate_stateid(&si, test_stateid->ts_has_session); 3412 valid = nfs4_validate_stateid(cl, &si);
3252 RESERVE_SPACE(4); 3413 RESERVE_SPACE(4);
3253 *p++ = htonl(valid); 3414 *p++ = htonl(valid);
3254 resp->p = p; 3415 resp->p = p;
@@ -3334,34 +3495,29 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3334 3495
3335/* 3496/*
3336 * Calculate the total amount of memory that the compound response has taken 3497 * Calculate the total amount of memory that the compound response has taken
3337 * after encoding the current operation. 3498 * after encoding the current operation with pad.
3338 * 3499 *
3339 * pad: add on 8 bytes for the next operation's op_code and status so that 3500 * pad: if operation is non-idempotent, pad was calculate by op_rsize_bop()
3340 * there is room to cache a failure on the next operation. 3501 * which was specified at nfsd4_operation, else pad is zero.
3341 * 3502 *
3342 * Compare this length to the session se_fmaxresp_cached. 3503 * Compare this length to the session se_fmaxresp_sz and se_fmaxresp_cached.
3343 * 3504 *
3344 * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so 3505 * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
3345 * will be at least a page and will therefore hold the xdr_buf head. 3506 * will be at least a page and will therefore hold the xdr_buf head.
3346 */ 3507 */
3347static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) 3508int nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad)
3348{ 3509{
3349 int status = 0;
3350 struct xdr_buf *xb = &resp->rqstp->rq_res; 3510 struct xdr_buf *xb = &resp->rqstp->rq_res;
3351 struct nfsd4_compoundargs *args = resp->rqstp->rq_argp;
3352 struct nfsd4_session *session = NULL; 3511 struct nfsd4_session *session = NULL;
3353 struct nfsd4_slot *slot = resp->cstate.slot; 3512 struct nfsd4_slot *slot = resp->cstate.slot;
3354 u32 length, tlen = 0, pad = 8; 3513 u32 length, tlen = 0;
3355 3514
3356 if (!nfsd4_has_session(&resp->cstate)) 3515 if (!nfsd4_has_session(&resp->cstate))
3357 return status; 3516 return 0;
3358 3517
3359 session = resp->cstate.session; 3518 session = resp->cstate.session;
3360 if (session == NULL || slot->sl_cachethis == 0) 3519 if (session == NULL)
3361 return status; 3520 return 0;
3362
3363 if (resp->opcnt >= args->opcnt)
3364 pad = 0; /* this is the last operation */
3365 3521
3366 if (xb->page_len == 0) { 3522 if (xb->page_len == 0) {
3367 length = (char *)resp->p - (char *)xb->head[0].iov_base + pad; 3523 length = (char *)resp->p - (char *)xb->head[0].iov_base + pad;
@@ -3374,10 +3530,14 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
3374 dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__, 3530 dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
3375 length, xb->page_len, tlen, pad); 3531 length, xb->page_len, tlen, pad);
3376 3532
3377 if (length <= session->se_fchannel.maxresp_cached) 3533 if (length > session->se_fchannel.maxresp_sz)
3378 return status; 3534 return nfserr_rep_too_big;
3379 else 3535
3536 if (slot->sl_cachethis == 1 &&
3537 length > session->se_fchannel.maxresp_cached)
3380 return nfserr_rep_too_big_to_cache; 3538 return nfserr_rep_too_big_to_cache;
3539
3540 return 0;
3381} 3541}
3382 3542
3383void 3543void
@@ -3397,8 +3557,8 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
3397 !nfsd4_enc_ops[op->opnum]); 3557 !nfsd4_enc_ops[op->opnum]);
3398 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); 3558 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
3399 /* nfsd4_check_drc_limit guarantees enough room for error status */ 3559 /* nfsd4_check_drc_limit guarantees enough room for error status */
3400 if (!op->status && nfsd4_check_drc_limit(resp)) 3560 if (!op->status)
3401 op->status = nfserr_rep_too_big_to_cache; 3561 op->status = nfsd4_check_resp_size(resp, 0);
3402status: 3562status:
3403 /* 3563 /*
3404 * Note: We write the status directly, instead of using WRITE32(), 3564 * Note: We write the status directly, instead of using WRITE32(),
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index c7716143cbd..c45a2ea4a09 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -9,11 +9,11 @@
9#include <linux/ctype.h> 9#include <linux/ctype.h>
10 10
11#include <linux/sunrpc/svcsock.h> 11#include <linux/sunrpc/svcsock.h>
12#include <linux/nfsd/syscall.h>
13#include <linux/lockd/lockd.h> 12#include <linux/lockd/lockd.h>
14#include <linux/sunrpc/clnt.h> 13#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/gss_api.h> 14#include <linux/sunrpc/gss_api.h>
16#include <linux/sunrpc/gss_krb5_enctypes.h> 15#include <linux/sunrpc/gss_krb5_enctypes.h>
16#include <linux/module.h>
17 17
18#include "idmap.h" 18#include "idmap.h"
19#include "nfsd.h" 19#include "nfsd.h"
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 7ecfa242030..58134a23fdf 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -11,13 +11,39 @@
11#include <linux/types.h> 11#include <linux/types.h>
12#include <linux/mount.h> 12#include <linux/mount.h>
13 13
14#include <linux/nfs.h>
15#include <linux/nfs2.h>
16#include <linux/nfs3.h>
17#include <linux/nfs4.h>
18#include <linux/sunrpc/msg_prot.h>
19
14#include <linux/nfsd/debug.h> 20#include <linux/nfsd/debug.h>
15#include <linux/nfsd/export.h> 21#include <linux/nfsd/export.h>
16#include <linux/nfsd/stats.h> 22#include <linux/nfsd/stats.h>
23
17/* 24/*
18 * nfsd version 25 * nfsd version
19 */ 26 */
20#define NFSD_SUPPORTED_MINOR_VERSION 1 27#define NFSD_SUPPORTED_MINOR_VERSION 1
28/*
29 * Maximum blocksizes supported by daemon under various circumstances.
30 */
31#define NFSSVC_MAXBLKSIZE RPCSVC_MAXPAYLOAD
32/* NFSv2 is limited by the protocol specification, see RFC 1094 */
33#define NFSSVC_MAXBLKSIZE_V2 (8*1024)
34
35
36/*
37 * Largest number of bytes we need to allocate for an NFS
38 * call or reply. Used to control buffer sizes. We use
39 * the length of v3 WRITE, READDIR and READDIR replies
40 * which are an RPC header, up to 26 XDR units of reply
41 * data, and some page data.
42 *
43 * Note that accuracy here doesn't matter too much as the
44 * size is rounded up to a page size when allocating space.
45 */
46#define NFSD_BUFSIZE ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE)
21 47
22struct readdir_cd { 48struct readdir_cd {
23 __be32 err; /* 0, nfserr, or nfserr_eof */ 49 __be32 err; /* 0, nfserr, or nfserr_eof */
@@ -335,6 +361,13 @@ static inline u32 nfsd_suppattrs2(u32 minorversion)
335#define NFSD_SUPPATTR_EXCLCREAT_WORD2 \ 361#define NFSD_SUPPATTR_EXCLCREAT_WORD2 \
336 NFSD_WRITEABLE_ATTRS_WORD2 362 NFSD_WRITEABLE_ATTRS_WORD2
337 363
364extern int nfsd4_is_junction(struct dentry *dentry);
365#else
366static inline int nfsd4_is_junction(struct dentry *dentry)
367{
368 return 0;
369}
370
338#endif /* CONFIG_NFSD_V4 */ 371#endif /* CONFIG_NFSD_V4 */
339 372
340#endif /* LINUX_NFSD_NFSD_H */ 373#endif /* LINUX_NFSD_NFSD_H */
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 90c6aa6d5e0..c763de5c115 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -59,28 +59,25 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry)
59 * the write call). 59 * the write call).
60 */ 60 */
61static inline __be32 61static inline __be32
62nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type) 62nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int requested)
63{ 63{
64 /* Type can be negative when creating hardlinks - not to a dir */ 64 mode &= S_IFMT;
65 if (type > 0 && (mode & S_IFMT) != type) { 65
66 if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) 66 if (requested == 0) /* the caller doesn't care */
67 return nfserr_symlink; 67 return nfs_ok;
68 else if (type == S_IFDIR) 68 if (mode == requested)
69 return nfserr_notdir; 69 return nfs_ok;
70 else if ((mode & S_IFMT) == S_IFDIR) 70 /*
71 return nfserr_isdir; 71 * v4 has an error more specific than err_notdir which we should
72 else 72 * return in preference to err_notdir:
73 return nfserr_inval; 73 */
74 } 74 if (rqstp->rq_vers == 4 && mode == S_IFLNK)
75 if (type < 0 && (mode & S_IFMT) == -type) { 75 return nfserr_symlink;
76 if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) 76 if (requested == S_IFDIR)
77 return nfserr_symlink; 77 return nfserr_notdir;
78 else if (type == -S_IFDIR) 78 if (mode == S_IFDIR)
79 return nfserr_isdir; 79 return nfserr_isdir;
80 else 80 return nfserr_inval;
81 return nfserr_notdir;
82 }
83 return 0;
84} 81}
85 82
86static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, 83static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index dc5a1bf476b..eda7d7e55e0 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/freezer.h> 10#include <linux/freezer.h>
11#include <linux/module.h>
11#include <linux/fs_struct.h> 12#include <linux/fs_struct.h>
12#include <linux/swap.h> 13#include <linux/swap.h>
13 14
@@ -256,6 +257,8 @@ static void nfsd_last_thread(struct svc_serv *serv)
256 nfsd_serv = NULL; 257 nfsd_serv = NULL;
257 nfsd_shutdown(); 258 nfsd_shutdown();
258 259
260 svc_rpcb_cleanup(serv);
261
259 printk(KERN_WARNING "nfsd: last server has exited, flushing export " 262 printk(KERN_WARNING "nfsd: last server has exited, flushing export "
260 "cache\n"); 263 "cache\n");
261 nfsd_export_flush(); 264 nfsd_export_flush();
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 4eefaf1b42e..a3cf38476a1 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -35,6 +35,7 @@
35#ifndef _NFSD4_STATE_H 35#ifndef _NFSD4_STATE_H
36#define _NFSD4_STATE_H 36#define _NFSD4_STATE_H
37 37
38#include <linux/idr.h>
38#include <linux/sunrpc/svc_xprt.h> 39#include <linux/sunrpc/svc_xprt.h>
39#include <linux/nfsd/nfsfh.h> 40#include <linux/nfsd/nfsfh.h>
40#include "nfsfh.h" 41#include "nfsfh.h"
@@ -45,24 +46,20 @@ typedef struct {
45} clientid_t; 46} clientid_t;
46 47
47typedef struct { 48typedef struct {
48 u32 so_boot; 49 clientid_t so_clid;
49 u32 so_stateownerid; 50 u32 so_id;
50 u32 so_fileid;
51} stateid_opaque_t; 51} stateid_opaque_t;
52 52
53typedef struct { 53typedef struct {
54 u32 si_generation; 54 u32 si_generation;
55 stateid_opaque_t si_opaque; 55 stateid_opaque_t si_opaque;
56} stateid_t; 56} stateid_t;
57#define si_boot si_opaque.so_boot
58#define si_stateownerid si_opaque.so_stateownerid
59#define si_fileid si_opaque.so_fileid
60 57
61#define STATEID_FMT "(%08x/%08x/%08x/%08x)" 58#define STATEID_FMT "(%08x/%08x/%08x/%08x)"
62#define STATEID_VAL(s) \ 59#define STATEID_VAL(s) \
63 (s)->si_boot, \ 60 (s)->si_opaque.so_clid.cl_boot, \
64 (s)->si_stateownerid, \ 61 (s)->si_opaque.so_clid.cl_id, \
65 (s)->si_fileid, \ 62 (s)->si_opaque.so_id, \
66 (s)->si_generation 63 (s)->si_generation
67 64
68struct nfsd4_callback { 65struct nfsd4_callback {
@@ -76,17 +73,27 @@ struct nfsd4_callback {
76 bool cb_done; 73 bool cb_done;
77}; 74};
78 75
76struct nfs4_stid {
77#define NFS4_OPEN_STID 1
78#define NFS4_LOCK_STID 2
79#define NFS4_DELEG_STID 4
80/* For an open stateid kept around *only* to process close replays: */
81#define NFS4_CLOSED_STID 8
82 unsigned char sc_type;
83 stateid_t sc_stateid;
84 struct nfs4_client *sc_client;
85};
86
79struct nfs4_delegation { 87struct nfs4_delegation {
88 struct nfs4_stid dl_stid; /* must be first field */
80 struct list_head dl_perfile; 89 struct list_head dl_perfile;
81 struct list_head dl_perclnt; 90 struct list_head dl_perclnt;
82 struct list_head dl_recall_lru; /* delegation recalled */ 91 struct list_head dl_recall_lru; /* delegation recalled */
83 atomic_t dl_count; /* ref count */ 92 atomic_t dl_count; /* ref count */
84 struct nfs4_client *dl_client;
85 struct nfs4_file *dl_file; 93 struct nfs4_file *dl_file;
86 u32 dl_type; 94 u32 dl_type;
87 time_t dl_time; 95 time_t dl_time;
88/* For recall: */ 96/* For recall: */
89 stateid_t dl_stateid;
90 struct knfsd_fh dl_fh; 97 struct knfsd_fh dl_fh;
91 int dl_retries; 98 int dl_retries;
92 struct nfsd4_callback dl_recall; 99 struct nfsd4_callback dl_recall;
@@ -104,6 +111,11 @@ struct nfs4_cb_conn {
104 struct svc_xprt *cb_xprt; /* minorversion 1 only */ 111 struct svc_xprt *cb_xprt; /* minorversion 1 only */
105}; 112};
106 113
114static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
115{
116 return container_of(s, struct nfs4_delegation, dl_stid);
117}
118
107/* Maximum number of slots per session. 160 is useful for long haul TCP */ 119/* Maximum number of slots per session. 160 is useful for long haul TCP */
108#define NFSD_MAX_SLOTS_PER_SESSION 160 120#define NFSD_MAX_SLOTS_PER_SESSION 160
109/* Maximum number of operations per session compound */ 121/* Maximum number of operations per session compound */
@@ -220,6 +232,7 @@ struct nfs4_client {
220 struct list_head cl_idhash; /* hash by cl_clientid.id */ 232 struct list_head cl_idhash; /* hash by cl_clientid.id */
221 struct list_head cl_strhash; /* hash by cl_name */ 233 struct list_head cl_strhash; /* hash by cl_name */
222 struct list_head cl_openowners; 234 struct list_head cl_openowners;
235 struct idr cl_stateids; /* stateid lookup */
223 struct list_head cl_delegations; 236 struct list_head cl_delegations;
224 struct list_head cl_lru; /* tail queue */ 237 struct list_head cl_lru; /* tail queue */
225 struct xdr_netobj cl_name; /* id generated by client */ 238 struct xdr_netobj cl_name; /* id generated by client */
@@ -245,6 +258,7 @@ struct nfs4_client {
245#define NFSD4_CB_UP 0 258#define NFSD4_CB_UP 0
246#define NFSD4_CB_UNKNOWN 1 259#define NFSD4_CB_UNKNOWN 1
247#define NFSD4_CB_DOWN 2 260#define NFSD4_CB_DOWN 2
261#define NFSD4_CB_FAULT 3
248 int cl_cb_state; 262 int cl_cb_state;
249 struct nfsd4_callback cl_cb_null; 263 struct nfsd4_callback cl_cb_null;
250 struct nfsd4_session *cl_cb_session; 264 struct nfsd4_session *cl_cb_session;
@@ -293,6 +307,9 @@ static inline void
293update_stateid(stateid_t *stateid) 307update_stateid(stateid_t *stateid)
294{ 308{
295 stateid->si_generation++; 309 stateid->si_generation++;
310 /* Wraparound recommendation from 3530bis-13 9.1.3.2: */
311 if (stateid->si_generation == 0)
312 stateid->si_generation = 1;
296} 313}
297 314
298/* A reasonable value for REPLAY_ISIZE was estimated as follows: 315/* A reasonable value for REPLAY_ISIZE was estimated as follows:
@@ -312,49 +329,57 @@ struct nfs4_replay {
312 __be32 rp_status; 329 __be32 rp_status;
313 unsigned int rp_buflen; 330 unsigned int rp_buflen;
314 char *rp_buf; 331 char *rp_buf;
315 unsigned intrp_allocated;
316 struct knfsd_fh rp_openfh; 332 struct knfsd_fh rp_openfh;
317 char rp_ibuf[NFSD4_REPLAY_ISIZE]; 333 char rp_ibuf[NFSD4_REPLAY_ISIZE];
318}; 334};
319 335
320/*
321* nfs4_stateowner can either be an open_owner, or a lock_owner
322*
323* so_idhash: stateid_hashtbl[] for open owner, lockstateid_hashtbl[]
324* for lock_owner
325* so_strhash: ownerstr_hashtbl[] for open_owner, lock_ownerstr_hashtbl[]
326* for lock_owner
327* so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client
328* struct is reaped.
329* so_perfilestate: heads the list of nfs4_stateid (either open or lock)
330* and is used to ensure no dangling nfs4_stateid references when we
331* release a stateowner.
332* so_perlockowner: (open) nfs4_stateid->st_perlockowner entry - used when
333* close is called to reap associated byte-range locks
334* so_close_lru: (open) stateowner is placed on this list instead of being
335* reaped (when so_perfilestate is empty) to hold the last close replay.
336* reaped by laundramat thread after lease period.
337*/
338struct nfs4_stateowner { 336struct nfs4_stateowner {
339 struct kref so_ref;
340 struct list_head so_idhash; /* hash by so_id */
341 struct list_head so_strhash; /* hash by op_name */ 337 struct list_head so_strhash; /* hash by op_name */
342 struct list_head so_perclient;
343 struct list_head so_stateids; 338 struct list_head so_stateids;
344 struct list_head so_perstateid; /* for lockowners only */
345 struct list_head so_close_lru; /* tail queue */
346 time_t so_time; /* time of placement on so_close_lru */
347 int so_is_open_owner; /* 1=openowner,0=lockowner */
348 u32 so_id;
349 struct nfs4_client * so_client; 339 struct nfs4_client * so_client;
350 /* after increment in ENCODE_SEQID_OP_TAIL, represents the next 340 /* after increment in ENCODE_SEQID_OP_TAIL, represents the next
351 * sequence id expected from the client: */ 341 * sequence id expected from the client: */
352 u32 so_seqid; 342 u32 so_seqid;
353 struct xdr_netobj so_owner; /* open owner name */ 343 struct xdr_netobj so_owner; /* open owner name */
354 int so_confirmed; /* successful OPEN_CONFIRM? */
355 struct nfs4_replay so_replay; 344 struct nfs4_replay so_replay;
345 bool so_is_open_owner;
356}; 346};
357 347
348struct nfs4_openowner {
349 struct nfs4_stateowner oo_owner; /* must be first field */
350 struct list_head oo_perclient;
351 /*
352 * We keep around openowners a little while after last close,
353 * which saves clients from having to confirm, and allows us to
354 * handle close replays if they come soon enough. The close_lru
355 * is a list of such openowners, to be reaped by the laundromat
356 * thread eventually if they remain unused:
357 */
358 struct list_head oo_close_lru;
359 struct nfs4_ol_stateid *oo_last_closed_stid;
360 time_t oo_time; /* time of placement on so_close_lru */
361#define NFS4_OO_CONFIRMED 1
362#define NFS4_OO_PURGE_CLOSE 2
363#define NFS4_OO_NEW 4
364 unsigned char oo_flags;
365};
366
367struct nfs4_lockowner {
368 struct nfs4_stateowner lo_owner; /* must be first element */
369 struct list_head lo_perstateid; /* for lockowners only */
370 struct list_head lo_list; /* for temporary uses */
371};
372
373static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so)
374{
375 return container_of(so, struct nfs4_openowner, oo_owner);
376}
377
378static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so)
379{
380 return container_of(so, struct nfs4_lockowner, lo_owner);
381}
382
358/* 383/*
359* nfs4_file: a file opened by some number of (open) nfs4_stateowners. 384* nfs4_file: a file opened by some number of (open) nfs4_stateowners.
360* o fi_perfile list is used to search for conflicting 385* o fi_perfile list is used to search for conflicting
@@ -368,17 +393,17 @@ struct nfs4_file {
368 /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ 393 /* One each for O_RDONLY, O_WRONLY, O_RDWR: */
369 struct file * fi_fds[3]; 394 struct file * fi_fds[3];
370 /* 395 /*
371 * Each open or lock stateid contributes 1 to either 396 * Each open or lock stateid contributes 0-4 to the counts
372 * fi_access[O_RDONLY], fi_access[O_WRONLY], or both, depending 397 * below depending on which bits are set in st_access_bitmap:
373 * on open or lock mode: 398 * 1 to fi_access[O_RDONLY] if NFS4_SHARE_ACCES_READ is set
399 * + 1 to fi_access[O_WRONLY] if NFS4_SHARE_ACCESS_WRITE is set
400 * + 1 to both of the above if NFS4_SHARE_ACCESS_BOTH is set.
374 */ 401 */
375 atomic_t fi_access[2]; 402 atomic_t fi_access[2];
376 struct file *fi_deleg_file; 403 struct file *fi_deleg_file;
377 struct file_lock *fi_lease; 404 struct file_lock *fi_lease;
378 atomic_t fi_delegees; 405 atomic_t fi_delegees;
379 struct inode *fi_inode; 406 struct inode *fi_inode;
380 u32 fi_id; /* used with stateowner->so_id
381 * for stateid_hashtbl hash */
382 bool fi_had_conflict; 407 bool fi_had_conflict;
383}; 408};
384 409
@@ -408,50 +433,27 @@ static inline struct file *find_any_file(struct nfs4_file *f)
408 return f->fi_fds[O_RDONLY]; 433 return f->fi_fds[O_RDONLY];
409} 434}
410 435
411/* 436/* "ol" stands for "Open or Lock". Better suggestions welcome. */
412* nfs4_stateid can either be an open stateid or (eventually) a lock stateid 437struct nfs4_ol_stateid {
413* 438 struct nfs4_stid st_stid; /* must be first field */
414* (open)nfs4_stateid: one per (open)nfs4_stateowner, nfs4_file
415*
416* st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry
417* st_perfile: file_hashtbl[] entry.
418* st_perfile_state: nfs4_stateowner->so_perfilestate
419* st_perlockowner: (open stateid) list of lock nfs4_stateowners
420* st_access_bmap: used only for open stateid
421* st_deny_bmap: used only for open stateid
422* st_openstp: open stateid lock stateid was derived from
423*
424* XXX: open stateids and lock stateids have diverged sufficiently that
425* we should consider defining separate structs for the two cases.
426*/
427
428struct nfs4_stateid {
429 struct list_head st_hash;
430 struct list_head st_perfile; 439 struct list_head st_perfile;
431 struct list_head st_perstateowner; 440 struct list_head st_perstateowner;
432 struct list_head st_lockowners; 441 struct list_head st_lockowners;
433 struct nfs4_stateowner * st_stateowner; 442 struct nfs4_stateowner * st_stateowner;
434 struct nfs4_file * st_file; 443 struct nfs4_file * st_file;
435 stateid_t st_stateid;
436 unsigned long st_access_bmap; 444 unsigned long st_access_bmap;
437 unsigned long st_deny_bmap; 445 unsigned long st_deny_bmap;
438 struct nfs4_stateid * st_openstp; 446 struct nfs4_ol_stateid * st_openstp;
439}; 447};
440 448
449static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
450{
451 return container_of(s, struct nfs4_ol_stateid, st_stid);
452}
453
441/* flags for preprocess_seqid_op() */ 454/* flags for preprocess_seqid_op() */
442#define HAS_SESSION 0x00000001
443#define CONFIRM 0x00000002
444#define OPEN_STATE 0x00000004
445#define LOCK_STATE 0x00000008
446#define RD_STATE 0x00000010 455#define RD_STATE 0x00000010
447#define WR_STATE 0x00000020 456#define WR_STATE 0x00000020
448#define CLOSE_STATE 0x00000040
449
450#define seqid_mutating_err(err) \
451 (((err) != nfserr_stale_clientid) && \
452 ((err) != nfserr_bad_seqid) && \
453 ((err) != nfserr_stale_stateid) && \
454 ((err) != nfserr_bad_stateid))
455 457
456struct nfsd4_compound_state; 458struct nfsd4_compound_state;
457 459
@@ -461,7 +463,8 @@ extern void nfs4_lock_state(void);
461extern void nfs4_unlock_state(void); 463extern void nfs4_unlock_state(void);
462extern int nfs4_in_grace(void); 464extern int nfs4_in_grace(void);
463extern __be32 nfs4_check_open_reclaim(clientid_t *clid); 465extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
464extern void nfs4_free_stateowner(struct kref *kref); 466extern void nfs4_free_openowner(struct nfs4_openowner *);
467extern void nfs4_free_lockowner(struct nfs4_lockowner *);
465extern int set_callback_cred(void); 468extern int set_callback_cred(void);
466extern void nfsd4_probe_callback(struct nfs4_client *clp); 469extern void nfsd4_probe_callback(struct nfs4_client *clp);
467extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); 470extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
@@ -473,7 +476,7 @@ extern void nfsd4_destroy_callback_queue(void);
473extern void nfsd4_shutdown_callback(struct nfs4_client *); 476extern void nfsd4_shutdown_callback(struct nfs4_client *);
474extern void nfs4_put_delegation(struct nfs4_delegation *dp); 477extern void nfs4_put_delegation(struct nfs4_delegation *dp);
475extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); 478extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
476extern void nfsd4_init_recdir(char *recdir_name); 479extern void nfsd4_init_recdir(void);
477extern int nfsd4_recdir_load(void); 480extern int nfsd4_recdir_load(void);
478extern void nfsd4_shutdown_recdir(void); 481extern void nfsd4_shutdown_recdir(void);
479extern int nfs4_client_to_reclaim(const char *name); 482extern int nfs4_client_to_reclaim(const char *name);
@@ -482,18 +485,7 @@ extern void nfsd4_recdir_purge_old(void);
482extern int nfsd4_create_clid_dir(struct nfs4_client *clp); 485extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
483extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); 486extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
484extern void release_session_client(struct nfsd4_session *); 487extern void release_session_client(struct nfsd4_session *);
485extern __be32 nfs4_validate_stateid(stateid_t *, int); 488extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *);
486 489extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *);
487static inline void
488nfs4_put_stateowner(struct nfs4_stateowner *so)
489{
490 kref_put(&so->so_ref, nfs4_free_stateowner);
491}
492
493static inline void
494nfs4_get_stateowner(struct nfs4_stateowner *so)
495{
496 kref_get(&so->so_ref);
497}
498 490
499#endif /* NFSD4_STATE_H */ 491#endif /* NFSD4_STATE_H */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index fd0acca5370..7a2e442623c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -168,6 +168,8 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
168{ 168{
169 if (d_mountpoint(dentry)) 169 if (d_mountpoint(dentry))
170 return 1; 170 return 1;
171 if (nfsd4_is_junction(dentry))
172 return 1;
171 if (!(exp->ex_flags & NFSEXP_V4ROOT)) 173 if (!(exp->ex_flags & NFSEXP_V4ROOT))
172 return 0; 174 return 0;
173 return dentry->d_inode != NULL; 175 return dentry->d_inode != NULL;
@@ -502,7 +504,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
502 unsigned int flags = 0; 504 unsigned int flags = 0;
503 505
504 /* Get inode */ 506 /* Get inode */
505 error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR); 507 error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR);
506 if (error) 508 if (error)
507 return error; 509 return error;
508 510
@@ -592,6 +594,22 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac
592 return error; 594 return error;
593} 595}
594 596
597#define NFSD_XATTR_JUNCTION_PREFIX XATTR_TRUSTED_PREFIX "junction."
598#define NFSD_XATTR_JUNCTION_TYPE NFSD_XATTR_JUNCTION_PREFIX "type"
599int nfsd4_is_junction(struct dentry *dentry)
600{
601 struct inode *inode = dentry->d_inode;
602
603 if (inode == NULL)
604 return 0;
605 if (inode->i_mode & S_IXUGO)
606 return 0;
607 if (!(inode->i_mode & S_ISVTX))
608 return 0;
609 if (vfs_getxattr(dentry, NFSD_XATTR_JUNCTION_TYPE, NULL, 0) <= 0)
610 return 0;
611 return 1;
612}
595#endif /* defined(CONFIG_NFSD_V4) */ 613#endif /* defined(CONFIG_NFSD_V4) */
596 614
597#ifdef CONFIG_NFSD_V3 615#ifdef CONFIG_NFSD_V3
@@ -1352,7 +1370,7 @@ __be32
1352do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, 1370do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1353 char *fname, int flen, struct iattr *iap, 1371 char *fname, int flen, struct iattr *iap,
1354 struct svc_fh *resfhp, int createmode, u32 *verifier, 1372 struct svc_fh *resfhp, int createmode, u32 *verifier,
1355 int *truncp, int *created) 1373 bool *truncp, bool *created)
1356{ 1374{
1357 struct dentry *dentry, *dchild = NULL; 1375 struct dentry *dentry, *dchild = NULL;
1358 struct inode *dirp; 1376 struct inode *dirp;
@@ -1632,10 +1650,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1632 err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE); 1650 err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE);
1633 if (err) 1651 if (err)
1634 goto out; 1652 goto out;
1635 err = fh_verify(rqstp, tfhp, -S_IFDIR, NFSD_MAY_NOP); 1653 err = fh_verify(rqstp, tfhp, 0, NFSD_MAY_NOP);
1636 if (err) 1654 if (err)
1637 goto out; 1655 goto out;
1638 1656 err = nfserr_isdir;
1657 if (S_ISDIR(tfhp->fh_dentry->d_inode->i_mode))
1658 goto out;
1639 err = nfserr_perm; 1659 err = nfserr_perm;
1640 if (!len) 1660 if (!len)
1641 goto out; 1661 goto out;
@@ -2114,7 +2134,8 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
2114 2134
2115 /* Allow read access to binaries even when mode 111 */ 2135 /* Allow read access to binaries even when mode 111 */
2116 if (err == -EACCES && S_ISREG(inode->i_mode) && 2136 if (err == -EACCES && S_ISREG(inode->i_mode) &&
2117 acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) 2137 (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) ||
2138 acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC)))
2118 err = inode_permission(inode, MAY_EXEC); 2139 err = inode_permission(inode, MAY_EXEC);
2119 2140
2120 return err? nfserrno(err) : 0; 2141 return err? nfserrno(err) : 0;
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index e0bbac04d1d..3f54ad03bb2 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -10,21 +10,22 @@
10/* 10/*
11 * Flags for nfsd_permission 11 * Flags for nfsd_permission
12 */ 12 */
13#define NFSD_MAY_NOP 0 13#define NFSD_MAY_NOP 0
14#define NFSD_MAY_EXEC 1 /* == MAY_EXEC */ 14#define NFSD_MAY_EXEC 0x001 /* == MAY_EXEC */
15#define NFSD_MAY_WRITE 2 /* == MAY_WRITE */ 15#define NFSD_MAY_WRITE 0x002 /* == MAY_WRITE */
16#define NFSD_MAY_READ 4 /* == MAY_READ */ 16#define NFSD_MAY_READ 0x004 /* == MAY_READ */
17#define NFSD_MAY_SATTR 8 17#define NFSD_MAY_SATTR 0x008
18#define NFSD_MAY_TRUNC 16 18#define NFSD_MAY_TRUNC 0x010
19#define NFSD_MAY_LOCK 32 19#define NFSD_MAY_LOCK 0x020
20#define NFSD_MAY_MASK 63 20#define NFSD_MAY_MASK 0x03f
21 21
22/* extra hints to permission and open routines: */ 22/* extra hints to permission and open routines: */
23#define NFSD_MAY_OWNER_OVERRIDE 64 23#define NFSD_MAY_OWNER_OVERRIDE 0x040
24#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ 24#define NFSD_MAY_LOCAL_ACCESS 0x080 /* for device special files */
25#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 25#define NFSD_MAY_BYPASS_GSS_ON_ROOT 0x100
26#define NFSD_MAY_NOT_BREAK_LEASE 512 26#define NFSD_MAY_NOT_BREAK_LEASE 0x200
27#define NFSD_MAY_BYPASS_GSS 1024 27#define NFSD_MAY_BYPASS_GSS 0x400
28#define NFSD_MAY_READ_IF_EXEC 0x800
28 29
29#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) 30#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
30#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) 31#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
@@ -61,7 +62,7 @@ __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
61__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, 62__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *,
62 char *name, int len, struct iattr *attrs, 63 char *name, int len, struct iattr *attrs,
63 struct svc_fh *res, int createmode, 64 struct svc_fh *res, int createmode,
64 u32 *verifier, int *truncp, int *created); 65 u32 *verifier, bool *truncp, bool *created);
65__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, 66__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *,
66 loff_t, unsigned long); 67 loff_t, unsigned long);
67#endif /* CONFIG_NFSD_V3 */ 68#endif /* CONFIG_NFSD_V3 */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index d2a8d04428c..2364747ee97 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -81,7 +81,6 @@ struct nfsd4_access {
81struct nfsd4_close { 81struct nfsd4_close {
82 u32 cl_seqid; /* request */ 82 u32 cl_seqid; /* request */
83 stateid_t cl_stateid; /* request+response */ 83 stateid_t cl_stateid; /* request+response */
84 struct nfs4_stateowner * cl_stateowner; /* response */
85}; 84};
86 85
87struct nfsd4_commit { 86struct nfsd4_commit {
@@ -131,7 +130,7 @@ struct nfsd4_link {
131 130
132struct nfsd4_lock_denied { 131struct nfsd4_lock_denied {
133 clientid_t ld_clientid; 132 clientid_t ld_clientid;
134 struct nfs4_stateowner *ld_sop; 133 struct xdr_netobj ld_owner;
135 u64 ld_start; 134 u64 ld_start;
136 u64 ld_length; 135 u64 ld_length;
137 u32 ld_type; 136 u32 ld_type;
@@ -165,9 +164,6 @@ struct nfsd4_lock {
165 } ok; 164 } ok;
166 struct nfsd4_lock_denied denied; 165 struct nfsd4_lock_denied denied;
167 } u; 166 } u;
168 /* The lk_replay_owner is the open owner in the open_to_lock_owner
169 * case and the lock owner otherwise: */
170 struct nfs4_stateowner *lk_replay_owner;
171}; 167};
172#define lk_new_open_seqid v.new.open_seqid 168#define lk_new_open_seqid v.new.open_seqid
173#define lk_new_open_stateid v.new.open_stateid 169#define lk_new_open_stateid v.new.open_stateid
@@ -188,7 +184,6 @@ struct nfsd4_lockt {
188 struct xdr_netobj lt_owner; 184 struct xdr_netobj lt_owner;
189 u64 lt_offset; 185 u64 lt_offset;
190 u64 lt_length; 186 u64 lt_length;
191 struct nfs4_stateowner * lt_stateowner;
192 struct nfsd4_lock_denied lt_denied; 187 struct nfsd4_lock_denied lt_denied;
193}; 188};
194 189
@@ -199,7 +194,6 @@ struct nfsd4_locku {
199 stateid_t lu_stateid; 194 stateid_t lu_stateid;
200 u64 lu_offset; 195 u64 lu_offset;
201 u64 lu_length; 196 u64 lu_length;
202 struct nfs4_stateowner *lu_stateowner;
203}; 197};
204 198
205 199
@@ -232,8 +226,11 @@ struct nfsd4_open {
232 u32 op_recall; /* recall */ 226 u32 op_recall; /* recall */
233 struct nfsd4_change_info op_cinfo; /* response */ 227 struct nfsd4_change_info op_cinfo; /* response */
234 u32 op_rflags; /* response */ 228 u32 op_rflags; /* response */
235 int op_truncate; /* used during processing */ 229 bool op_truncate; /* used during processing */
236 struct nfs4_stateowner *op_stateowner; /* used during processing */ 230 bool op_created; /* used during processing */
231 struct nfs4_openowner *op_openowner; /* used during processing */
232 struct nfs4_file *op_file; /* used during processing */
233 struct nfs4_ol_stateid *op_stp; /* used during processing */
237 struct nfs4_acl *op_acl; 234 struct nfs4_acl *op_acl;
238}; 235};
239#define op_iattr iattr 236#define op_iattr iattr
@@ -243,7 +240,6 @@ struct nfsd4_open_confirm {
243 stateid_t oc_req_stateid /* request */; 240 stateid_t oc_req_stateid /* request */;
244 u32 oc_seqid /* request */; 241 u32 oc_seqid /* request */;
245 stateid_t oc_resp_stateid /* response */; 242 stateid_t oc_resp_stateid /* response */;
246 struct nfs4_stateowner * oc_stateowner; /* response */
247}; 243};
248 244
249struct nfsd4_open_downgrade { 245struct nfsd4_open_downgrade {
@@ -251,7 +247,6 @@ struct nfsd4_open_downgrade {
251 u32 od_seqid; 247 u32 od_seqid;
252 u32 od_share_access; 248 u32 od_share_access;
253 u32 od_share_deny; 249 u32 od_share_deny;
254 struct nfs4_stateowner *od_stateowner;
255}; 250};
256 251
257 252
@@ -325,8 +320,7 @@ struct nfsd4_setattr {
325 320
326struct nfsd4_setclientid { 321struct nfsd4_setclientid {
327 nfs4_verifier se_verf; /* request */ 322 nfs4_verifier se_verf; /* request */
328 u32 se_namelen; /* request */ 323 struct xdr_netobj se_name;
329 char * se_name; /* request */
330 u32 se_callback_prog; /* request */ 324 u32 se_callback_prog; /* request */
331 u32 se_callback_netid_len; /* request */ 325 u32 se_callback_netid_len; /* request */
332 char * se_callback_netid_val; /* request */ 326 char * se_callback_netid_val; /* request */
@@ -351,7 +345,6 @@ struct nfsd4_saved_compoundargs {
351 345
352struct nfsd4_test_stateid { 346struct nfsd4_test_stateid {
353 __be32 ts_num_ids; 347 __be32 ts_num_ids;
354 __be32 ts_has_session;
355 struct nfsd4_compoundargs *ts_saved_args; 348 struct nfsd4_compoundargs *ts_saved_args;
356 struct nfsd4_saved_compoundargs ts_savedp; 349 struct nfsd4_saved_compoundargs ts_savedp;
357}; 350};
@@ -405,6 +398,10 @@ struct nfsd4_destroy_session {
405 struct nfs4_sessionid sessionid; 398 struct nfs4_sessionid sessionid;
406}; 399};
407 400
401struct nfsd4_destroy_clientid {
402 clientid_t clientid;
403};
404
408struct nfsd4_reclaim_complete { 405struct nfsd4_reclaim_complete {
409 u32 rca_one_fs; 406 u32 rca_one_fs;
410}; 407};
@@ -532,6 +529,7 @@ int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *,
532 struct nfsd4_compoundargs *); 529 struct nfsd4_compoundargs *);
533int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *, 530int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *,
534 struct nfsd4_compoundres *); 531 struct nfsd4_compoundres *);
532int nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
535void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); 533void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
536void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); 534void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op);
537__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, 535__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
@@ -558,11 +556,13 @@ extern __be32 nfsd4_sequence(struct svc_rqst *,
558extern __be32 nfsd4_destroy_session(struct svc_rqst *, 556extern __be32 nfsd4_destroy_session(struct svc_rqst *,
559 struct nfsd4_compound_state *, 557 struct nfsd4_compound_state *,
560 struct nfsd4_destroy_session *); 558 struct nfsd4_destroy_session *);
559extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_clientid *);
561__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *); 560__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *);
562extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, 561extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
563 struct nfsd4_open *open); 562 struct nfsd4_open *open);
564extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, 563extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
565 struct svc_fh *current_fh, struct nfsd4_open *open); 564 struct svc_fh *current_fh, struct nfsd4_open *open);
565extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status);
566extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, 566extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp,
567 struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); 567 struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc);
568extern __be32 nfsd4_close(struct svc_rqst *rqstp, 568extern __be32 nfsd4_close(struct svc_rqst *rqstp,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 666628b395f..b50ffb72e5b 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -354,7 +354,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
354 354
355 failed_acl: 355 failed_acl:
356 failed_bmap: 356 failed_bmap:
357 inode->i_nlink = 0; 357 clear_nlink(inode);
358 iput(inode); /* raw_inode will be deleted through 358 iput(inode); /* raw_inode will be deleted through
359 generic_delete_inode() */ 359 generic_delete_inode() */
360 goto failed; 360 goto failed;
@@ -396,7 +396,7 @@ int nilfs_read_inode_common(struct inode *inode,
396 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 396 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
397 inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid); 397 inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid);
398 inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid); 398 inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid);
399 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 399 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
400 inode->i_size = le64_to_cpu(raw_inode->i_size); 400 inode->i_size = le64_to_cpu(raw_inode->i_size);
401 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); 401 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
402 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); 402 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index a3141990061..768982de10e 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -289,7 +289,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
289 nilfs_warning(inode->i_sb, __func__, 289 nilfs_warning(inode->i_sb, __func__,
290 "deleting nonexistent file (%lu), %d\n", 290 "deleting nonexistent file (%lu), %d\n",
291 inode->i_ino, inode->i_nlink); 291 inode->i_ino, inode->i_nlink);
292 inode->i_nlink = 1; 292 set_nlink(inode, 1);
293 } 293 }
294 err = nilfs_delete_entry(de, page); 294 err = nilfs_delete_entry(de, page);
295 if (err) 295 if (err)
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 255d5e1c03b..3777d138f89 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -276,10 +276,10 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
276/* super.c */ 276/* super.c */
277extern struct inode *nilfs_alloc_inode(struct super_block *); 277extern struct inode *nilfs_alloc_inode(struct super_block *);
278extern void nilfs_destroy_inode(struct inode *); 278extern void nilfs_destroy_inode(struct inode *);
279extern void nilfs_error(struct super_block *, const char *, const char *, ...) 279extern __printf(3, 4)
280 __attribute__ ((format (printf, 3, 4))); 280void nilfs_error(struct super_block *, const char *, const char *, ...);
281extern void nilfs_warning(struct super_block *, const char *, const char *, ...) 281extern __printf(3, 4)
282 __attribute__ ((format (printf, 3, 4))); 282void nilfs_warning(struct super_block *, const char *, const char *, ...);
283extern struct nilfs_super_block * 283extern struct nilfs_super_block *
284nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); 284nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
285extern int nilfs_store_magic_and_option(struct super_block *, 285extern int nilfs_store_magic_and_option(struct super_block *,
diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h
index 2142b1c68b6..53c27eaf230 100644
--- a/fs/ntfs/debug.h
+++ b/fs/ntfs/debug.h
@@ -30,8 +30,9 @@
30 30
31extern int debug_msgs; 31extern int debug_msgs;
32 32
33extern void __ntfs_debug(const char *file, int line, const char *function, 33extern __printf(4, 5)
34 const char *format, ...) __attribute__ ((format (printf, 4, 5))); 34void __ntfs_debug(const char *file, int line, const char *function,
35 const char *format, ...);
35/** 36/**
36 * ntfs_debug - write a debug level message to syslog 37 * ntfs_debug - write a debug level message to syslog
37 * @f: a printf format string containing the message 38 * @f: a printf format string containing the message
@@ -52,12 +53,14 @@ extern void ntfs_debug_dump_runlist(const runlist_element *rl);
52 53
53#endif /* !DEBUG */ 54#endif /* !DEBUG */
54 55
55extern void __ntfs_warning(const char *function, const struct super_block *sb, 56extern __printf(3, 4)
56 const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); 57void __ntfs_warning(const char *function, const struct super_block *sb,
58 const char *fmt, ...);
57#define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a) 59#define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a)
58 60
59extern void __ntfs_error(const char *function, const struct super_block *sb, 61extern __printf(3, 4)
60 const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); 62void __ntfs_error(const char *function, const struct super_block *sb,
63 const char *fmt, ...);
61#define ntfs_error(sb, f, a...) __ntfs_error(__func__, sb, f, ##a) 64#define ntfs_error(sb, f, a...) __ntfs_error(__func__, sb, f, ##a)
62 65
63#endif /* _LINUX_NTFS_DEBUG_H */ 66#endif /* _LINUX_NTFS_DEBUG_H */
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 1371487da95..97e2dacbc86 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -612,7 +612,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
612 * might be tricky due to vfs interactions. Need to think about this 612 * might be tricky due to vfs interactions. Need to think about this
613 * some more when implementing the unlink command. 613 * some more when implementing the unlink command.
614 */ 614 */
615 vi->i_nlink = le16_to_cpu(m->link_count); 615 set_nlink(vi, le16_to_cpu(m->link_count));
616 /* 616 /*
617 * FIXME: Reparse points can have the directory bit set even though 617 * FIXME: Reparse points can have the directory bit set even though
618 * they would be S_IFLNK. Need to deal with this further below when we 618 * they would be S_IFLNK. Need to deal with this further below when we
@@ -634,7 +634,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
634 vi->i_mode &= ~vol->dmask; 634 vi->i_mode &= ~vol->dmask;
635 /* Things break without this kludge! */ 635 /* Things break without this kludge! */
636 if (vi->i_nlink > 1) 636 if (vi->i_nlink > 1)
637 vi->i_nlink = 1; 637 set_nlink(vi, 1);
638 } else { 638 } else {
639 vi->i_mode |= S_IFREG; 639 vi->i_mode |= S_IFREG;
640 /* Apply the file permissions mask set in the mount options. */ 640 /* Apply the file permissions mask set in the mount options. */
@@ -1242,7 +1242,7 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
1242 vi->i_version = base_vi->i_version; 1242 vi->i_version = base_vi->i_version;
1243 vi->i_uid = base_vi->i_uid; 1243 vi->i_uid = base_vi->i_uid;
1244 vi->i_gid = base_vi->i_gid; 1244 vi->i_gid = base_vi->i_gid;
1245 vi->i_nlink = base_vi->i_nlink; 1245 set_nlink(vi, base_vi->i_nlink);
1246 vi->i_mtime = base_vi->i_mtime; 1246 vi->i_mtime = base_vi->i_mtime;
1247 vi->i_ctime = base_vi->i_ctime; 1247 vi->i_ctime = base_vi->i_ctime;
1248 vi->i_atime = base_vi->i_atime; 1248 vi->i_atime = base_vi->i_atime;
@@ -1508,7 +1508,7 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
1508 vi->i_version = base_vi->i_version; 1508 vi->i_version = base_vi->i_version;
1509 vi->i_uid = base_vi->i_uid; 1509 vi->i_uid = base_vi->i_uid;
1510 vi->i_gid = base_vi->i_gid; 1510 vi->i_gid = base_vi->i_gid;
1511 vi->i_nlink = base_vi->i_nlink; 1511 set_nlink(vi, base_vi->i_nlink);
1512 vi->i_mtime = base_vi->i_mtime; 1512 vi->i_mtime = base_vi->i_mtime;
1513 vi->i_ctime = base_vi->i_ctime; 1513 vi->i_ctime = base_vi->i_ctime;
1514 vi->i_atime = base_vi->i_atime; 1514 vi->i_atime = base_vi->i_atime;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index db5ee4b4f47..ad7d0c155de 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -59,6 +59,7 @@
59#include <linux/idr.h> 59#include <linux/idr.h>
60#include <linux/kref.h> 60#include <linux/kref.h>
61#include <linux/net.h> 61#include <linux/net.h>
62#include <linux/export.h>
62#include <net/tcp.h> 63#include <net/tcp.h>
63 64
64#include <asm/uaccess.h> 65#include <asm/uaccess.h>
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8582e3f4f12..e2878b5895f 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2292,7 +2292,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
2292 ocfs2_journal_dirty(handle, di_bh); 2292 ocfs2_journal_dirty(handle, di_bh);
2293 2293
2294 i_size_write(inode, size); 2294 i_size_write(inode, size);
2295 inode->i_nlink = 2; 2295 set_nlink(inode, 2);
2296 inode->i_blocks = ocfs2_inode_sector_count(inode); 2296 inode->i_blocks = ocfs2_inode_sector_count(inode);
2297 2297
2298 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); 2298 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
@@ -2354,7 +2354,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
2354 ocfs2_journal_dirty(handle, new_bh); 2354 ocfs2_journal_dirty(handle, new_bh);
2355 2355
2356 i_size_write(inode, inode->i_sb->s_blocksize); 2356 i_size_write(inode, inode->i_sb->s_blocksize);
2357 inode->i_nlink = 2; 2357 set_nlink(inode, 2);
2358 inode->i_blocks = ocfs2_inode_sector_count(inode); 2358 inode->i_blocks = ocfs2_inode_sector_count(inode);
2359 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 2359 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
2360 if (status < 0) { 2360 if (status < 0) {
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 56f82cb912e..0e28e242226 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -30,6 +30,7 @@
30#include <linux/sysctl.h> 30#include <linux/sysctl.h>
31#include <linux/spinlock.h> 31#include <linux/spinlock.h>
32#include <linux/debugfs.h> 32#include <linux/debugfs.h>
33#include <linux/export.h>
33 34
34#include "cluster/heartbeat.h" 35#include "cluster/heartbeat.h"
35#include "cluster/nodemanager.h" 36#include "cluster/nodemanager.h"
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 7642d7ca73e..e1ed5e502ff 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2092,7 +2092,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
2092 inode->i_uid = be32_to_cpu(lvb->lvb_iuid); 2092 inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
2093 inode->i_gid = be32_to_cpu(lvb->lvb_igid); 2093 inode->i_gid = be32_to_cpu(lvb->lvb_igid);
2094 inode->i_mode = be16_to_cpu(lvb->lvb_imode); 2094 inode->i_mode = be16_to_cpu(lvb->lvb_imode);
2095 inode->i_nlink = be16_to_cpu(lvb->lvb_inlink); 2095 set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
2096 ocfs2_unpack_timespec(&inode->i_atime, 2096 ocfs2_unpack_timespec(&inode->i_atime,
2097 be64_to_cpu(lvb->lvb_iatime_packed)); 2097 be64_to_cpu(lvb->lvb_iatime_packed));
2098 ocfs2_unpack_timespec(&inode->i_mtime, 2098 ocfs2_unpack_timespec(&inode->i_mtime,
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index b4c8bb6b8d2..a22d2c09889 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -291,7 +291,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
291 (unsigned long long)OCFS2_I(inode)->ip_blkno, 291 (unsigned long long)OCFS2_I(inode)->ip_blkno,
292 (unsigned long long)le64_to_cpu(fe->i_blkno)); 292 (unsigned long long)le64_to_cpu(fe->i_blkno));
293 293
294 inode->i_nlink = ocfs2_read_links_count(fe); 294 set_nlink(inode, ocfs2_read_links_count(fe));
295 295
296 trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno, 296 trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno,
297 le32_to_cpu(fe->i_flags)); 297 le32_to_cpu(fe->i_flags));
@@ -1290,7 +1290,7 @@ void ocfs2_refresh_inode(struct inode *inode,
1290 OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); 1290 OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
1291 ocfs2_set_inode_flags(inode); 1291 ocfs2_set_inode_flags(inode);
1292 i_size_write(inode, le64_to_cpu(fe->i_size)); 1292 i_size_write(inode, le64_to_cpu(fe->i_size));
1293 inode->i_nlink = ocfs2_read_links_count(fe); 1293 set_nlink(inode, ocfs2_read_links_count(fe));
1294 inode->i_uid = le32_to_cpu(fe->i_uid); 1294 inode->i_uid = le32_to_cpu(fe->i_uid);
1295 inode->i_gid = le32_to_cpu(fe->i_gid); 1295 inode->i_gid = le32_to_cpu(fe->i_gid);
1296 inode->i_mode = le16_to_cpu(fe->i_mode); 1296 inode->i_mode = le16_to_cpu(fe->i_mode);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 53aa41ed7bf..a8b2bfea574 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -199,9 +199,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
199 * these are used by the support functions here and in 199 * these are used by the support functions here and in
200 * callers. */ 200 * callers. */
201 if (S_ISDIR(mode)) 201 if (S_ISDIR(mode))
202 inode->i_nlink = 2; 202 set_nlink(inode, 2);
203 else
204 inode->i_nlink = 1;
205 inode_init_owner(inode, dir, mode); 203 inode_init_owner(inode, dir, mode);
206 dquot_initialize(inode); 204 dquot_initialize(inode);
207 return inode; 205 return inode;
@@ -1379,7 +1377,7 @@ static int ocfs2_rename(struct inode *old_dir,
1379 } 1377 }
1380 1378
1381 if (new_inode) { 1379 if (new_inode) {
1382 new_inode->i_nlink--; 1380 drop_nlink(new_inode);
1383 new_inode->i_ctime = CURRENT_TIME; 1381 new_inode->i_ctime = CURRENT_TIME;
1384 } 1382 }
1385 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; 1383 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
@@ -1387,9 +1385,9 @@ static int ocfs2_rename(struct inode *old_dir,
1387 if (update_dot_dot) { 1385 if (update_dot_dot) {
1388 status = ocfs2_update_entry(old_inode, handle, 1386 status = ocfs2_update_entry(old_inode, handle,
1389 &old_inode_dot_dot_res, new_dir); 1387 &old_inode_dot_dot_res, new_dir);
1390 old_dir->i_nlink--; 1388 drop_nlink(old_dir);
1391 if (new_inode) { 1389 if (new_inode) {
1392 new_inode->i_nlink--; 1390 drop_nlink(new_inode);
1393 } else { 1391 } else {
1394 inc_nlink(new_dir); 1392 inc_nlink(new_dir);
1395 mark_inode_dirty(new_dir); 1393 mark_inode_dirty(new_dir);
@@ -2018,7 +2016,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2018 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 2016 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
2019 if (S_ISDIR(inode->i_mode)) 2017 if (S_ISDIR(inode->i_mode))
2020 ocfs2_add_links_count(orphan_fe, 1); 2018 ocfs2_add_links_count(orphan_fe, 1);
2021 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); 2019 set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
2022 ocfs2_journal_dirty(handle, orphan_dir_bh); 2020 ocfs2_journal_dirty(handle, orphan_dir_bh);
2023 2021
2024 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 2022 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
@@ -2116,7 +2114,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2116 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 2114 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
2117 if (S_ISDIR(inode->i_mode)) 2115 if (S_ISDIR(inode->i_mode))
2118 ocfs2_add_links_count(orphan_fe, -1); 2116 ocfs2_add_links_count(orphan_fe, -1);
2119 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); 2117 set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
2120 ocfs2_journal_dirty(handle, orphan_dir_bh); 2118 ocfs2_journal_dirty(handle, orphan_dir_bh);
2121 2119
2122leave: 2120leave:
@@ -2282,7 +2280,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2282 goto leave; 2280 goto leave;
2283 } 2281 }
2284 2282
2285 inode->i_nlink = 0; 2283 clear_nlink(inode);
2286 /* do the real work now. */ 2284 /* do the real work now. */
2287 status = __ocfs2_mknod_locked(dir, inode, 2285 status = __ocfs2_mknod_locked(dir, inode,
2288 0, &new_di_bh, parent_di_bh, handle, 2286 0, &new_di_bh, parent_di_bh, handle,
@@ -2437,7 +2435,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2437 di = (struct ocfs2_dinode *)di_bh->b_data; 2435 di = (struct ocfs2_dinode *)di_bh->b_data;
2438 le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); 2436 le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL);
2439 di->i_orphaned_slot = 0; 2437 di->i_orphaned_slot = 0;
2440 inode->i_nlink = 1; 2438 set_nlink(inode, 1);
2441 ocfs2_set_links_count(di, inode->i_nlink); 2439 ocfs2_set_links_count(di, inode->i_nlink);
2442 ocfs2_journal_dirty(handle, di_bh); 2440 ocfs2_journal_dirty(handle, di_bh);
2443 2441
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 40c7de084c1..74ff74cf78f 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -31,17 +31,15 @@ extern struct workqueue_struct *ocfs2_wq;
31int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, 31int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
32 int node_num); 32 int node_num);
33 33
34void __ocfs2_error(struct super_block *sb, 34__printf(3, 4)
35 const char *function, 35void __ocfs2_error(struct super_block *sb, const char *function,
36 const char *fmt, ...) 36 const char *fmt, ...);
37 __attribute__ ((format (printf, 3, 4)));
38 37
39#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) 38#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
40 39
41void __ocfs2_abort(struct super_block *sb, 40__printf(3, 4)
42 const char *function, 41void __ocfs2_abort(struct super_block *sb, const char *function,
43 const char *fmt, ...) 42 const char *fmt, ...);
44 __attribute__ ((format (printf, 3, 4)));
45 43
46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) 44#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
47 45
diff --git a/fs/open.c b/fs/open.c
index f7119210945..22c41b543f2 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -685,6 +685,10 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
685 if (error) 685 if (error)
686 goto cleanup_all; 686 goto cleanup_all;
687 687
688 error = break_lease(inode, f->f_flags);
689 if (error)
690 goto cleanup_all;
691
688 if (!open && f->f_op) 692 if (!open && f->f_op)
689 open = f->f_op->open; 693 open = f->f_op->open;
690 if (open) { 694 if (open) {
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index a2a5bff774e..e4e0ff7962e 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -242,7 +242,7 @@ found:
242 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; 242 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
243 inode->i_op = &openprom_inode_operations; 243 inode->i_op = &openprom_inode_operations;
244 inode->i_fop = &openprom_operations; 244 inode->i_fop = &openprom_operations;
245 inode->i_nlink = 2; 245 set_nlink(inode, 2);
246 break; 246 break;
247 case op_inode_prop: 247 case op_inode_prop:
248 if (!strcmp(dp->name, "options") && (len == 17) && 248 if (!strcmp(dp->name, "options") && (len == 17) &&
@@ -251,7 +251,7 @@ found:
251 else 251 else
252 inode->i_mode = S_IFREG | S_IRUGO; 252 inode->i_mode = S_IFREG | S_IRUGO;
253 inode->i_fop = &openpromfs_prop_ops; 253 inode->i_fop = &openpromfs_prop_ops;
254 inode->i_nlink = 1; 254 set_nlink(inode, 1);
255 inode->i_size = ent_oi->u.prop->length; 255 inode->i_size = ent_oi->u.prop->length;
256 break; 256 break;
257 } 257 }
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index af9fdf04676..bd8ae788f68 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -49,18 +49,20 @@
49#define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a) 49#define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a)
50#define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a) 50#define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a)
51 51
52__attribute__ ((format (printf, 3, 4))) 52static __printf(3, 4)
53static void _ldm_printk (const char *level, const char *function, 53void _ldm_printk(const char *level, const char *function, const char *fmt, ...)
54 const char *fmt, ...)
55{ 54{
56 static char buf[128]; 55 struct va_format vaf;
57 va_list args; 56 va_list args;
58 57
59 va_start (args, fmt); 58 va_start (args, fmt);
60 vsnprintf (buf, sizeof (buf), fmt, args);
61 va_end (args);
62 59
63 printk ("%s%s(): %s\n", level, function, buf); 60 vaf.fmt = fmt;
61 vaf.va = &args;
62
63 printk("%s%s(): %pV\n", level, function, &vaf);
64
65 va_end(args);
64} 66}
65 67
66/** 68/**
diff --git a/fs/pipe.c b/fs/pipe.c
index 0e0be1dc0f8..4065f07366b 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1254,6 +1254,7 @@ out:
1254 1254
1255static const struct super_operations pipefs_ops = { 1255static const struct super_operations pipefs_ops = {
1256 .destroy_inode = free_inode_nonrcu, 1256 .destroy_inode = free_inode_nonrcu,
1257 .statfs = simple_statfs,
1257}; 1258};
1258 1259
1259/* 1260/*
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 10027b42b7e..cea4623f1ed 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -218,6 +218,8 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want)
218 const struct posix_acl_entry *pa, *pe, *mask_obj; 218 const struct posix_acl_entry *pa, *pe, *mask_obj;
219 int found = 0; 219 int found = 0;
220 220
221 want &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK;
222
221 FOREACH_ACL_ENTRY(pa, acl, pe) { 223 FOREACH_ACL_ENTRY(pa, acl, pe) {
222 switch(pa->e_tag) { 224 switch(pa->e_tag) {
223 case ACL_USER_OBJ: 225 case ACL_USER_OBJ:
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5eb02069e1b..851ba3dcdc2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1107,13 +1107,6 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1107 goto err_sighand; 1107 goto err_sighand;
1108 } 1108 }
1109 1109
1110 if (oom_adjust != task->signal->oom_adj) {
1111 if (oom_adjust == OOM_DISABLE)
1112 atomic_inc(&task->mm->oom_disable_count);
1113 if (task->signal->oom_adj == OOM_DISABLE)
1114 atomic_dec(&task->mm->oom_disable_count);
1115 }
1116
1117 /* 1110 /*
1118 * Warn that /proc/pid/oom_adj is deprecated, see 1111 * Warn that /proc/pid/oom_adj is deprecated, see
1119 * Documentation/feature-removal-schedule.txt. 1112 * Documentation/feature-removal-schedule.txt.
@@ -1215,12 +1208,6 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1215 goto err_sighand; 1208 goto err_sighand;
1216 } 1209 }
1217 1210
1218 if (oom_score_adj != task->signal->oom_score_adj) {
1219 if (oom_score_adj == OOM_SCORE_ADJ_MIN)
1220 atomic_inc(&task->mm->oom_disable_count);
1221 if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1222 atomic_dec(&task->mm->oom_disable_count);
1223 }
1224 task->signal->oom_score_adj = oom_score_adj; 1211 task->signal->oom_score_adj = oom_score_adj;
1225 if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) 1212 if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
1226 task->signal->oom_score_adj_min = oom_score_adj; 1213 task->signal->oom_score_adj_min = oom_score_adj;
@@ -2261,7 +2248,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
2261 ei = PROC_I(inode); 2248 ei = PROC_I(inode);
2262 inode->i_mode = p->mode; 2249 inode->i_mode = p->mode;
2263 if (S_ISDIR(inode->i_mode)) 2250 if (S_ISDIR(inode->i_mode))
2264 inode->i_nlink = 2; /* Use getattr to fix if necessary */ 2251 set_nlink(inode, 2); /* Use getattr to fix if necessary */
2265 if (p->iop) 2252 if (p->iop)
2266 inode->i_op = p->iop; 2253 inode->i_op = p->iop;
2267 if (p->fop) 2254 if (p->fop)
@@ -2655,7 +2642,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2655 2642
2656 inode->i_mode = p->mode; 2643 inode->i_mode = p->mode;
2657 if (S_ISDIR(inode->i_mode)) 2644 if (S_ISDIR(inode->i_mode))
2658 inode->i_nlink = 2; 2645 set_nlink(inode, 2);
2659 if (S_ISLNK(inode->i_mode)) 2646 if (S_ISLNK(inode->i_mode))
2660 inode->i_size = 64; 2647 inode->i_size = 64;
2661 if (p->iop) 2648 if (p->iop)
@@ -2994,8 +2981,8 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
2994 inode->i_fop = &proc_tgid_base_operations; 2981 inode->i_fop = &proc_tgid_base_operations;
2995 inode->i_flags|=S_IMMUTABLE; 2982 inode->i_flags|=S_IMMUTABLE;
2996 2983
2997 inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff, 2984 set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff,
2998 ARRAY_SIZE(tgid_base_stuff)); 2985 ARRAY_SIZE(tgid_base_stuff)));
2999 2986
3000 d_set_d_op(dentry, &pid_dentry_operations); 2987 d_set_d_op(dentry, &pid_dentry_operations);
3001 2988
@@ -3246,8 +3233,8 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
3246 inode->i_fop = &proc_tid_base_operations; 3233 inode->i_fop = &proc_tid_base_operations;
3247 inode->i_flags|=S_IMMUTABLE; 3234 inode->i_flags|=S_IMMUTABLE;
3248 3235
3249 inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff, 3236 set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff,
3250 ARRAY_SIZE(tid_base_stuff)); 3237 ARRAY_SIZE(tid_base_stuff)));
3251 3238
3252 d_set_d_op(dentry, &pid_dentry_operations); 3239 d_set_d_op(dentry, &pid_dentry_operations);
3253 3240
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 9d99131d0d6..10090d9c7ad 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -283,7 +283,7 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
283 struct inode *inode = dentry->d_inode; 283 struct inode *inode = dentry->d_inode;
284 struct proc_dir_entry *de = PROC_I(inode)->pde; 284 struct proc_dir_entry *de = PROC_I(inode)->pde;
285 if (de && de->nlink) 285 if (de && de->nlink)
286 inode->i_nlink = de->nlink; 286 set_nlink(inode, de->nlink);
287 287
288 generic_fillattr(inode, stat); 288 generic_fillattr(inode, stat);
289 return 0; 289 return 0;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 7ed72d6c1c6..7737c5468a4 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -445,7 +445,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
445 if (de->size) 445 if (de->size)
446 inode->i_size = de->size; 446 inode->i_size = de->size;
447 if (de->nlink) 447 if (de->nlink)
448 inode->i_nlink = de->nlink; 448 set_nlink(inode, de->nlink);
449 if (de->proc_iops) 449 if (de->proc_iops)
450 inode->i_op = de->proc_iops; 450 inode->i_op = de->proc_iops;
451 if (de->proc_fops) { 451 if (de->proc_fops) {
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 1a77dbef226..a6b62173d4c 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -3,6 +3,7 @@
3 */ 3 */
4#include <linux/init.h> 4#include <linux/init.h>
5#include <linux/sysctl.h> 5#include <linux/sysctl.h>
6#include <linux/poll.h>
6#include <linux/proc_fs.h> 7#include <linux/proc_fs.h>
7#include <linux/security.h> 8#include <linux/security.h>
8#include <linux/namei.h> 9#include <linux/namei.h>
@@ -14,6 +15,15 @@ static const struct inode_operations proc_sys_inode_operations;
14static const struct file_operations proc_sys_dir_file_operations; 15static const struct file_operations proc_sys_dir_file_operations;
15static const struct inode_operations proc_sys_dir_operations; 16static const struct inode_operations proc_sys_dir_operations;
16 17
18void proc_sys_poll_notify(struct ctl_table_poll *poll)
19{
20 if (!poll)
21 return;
22
23 atomic_inc(&poll->event);
24 wake_up_interruptible(&poll->wait);
25}
26
17static struct inode *proc_sys_make_inode(struct super_block *sb, 27static struct inode *proc_sys_make_inode(struct super_block *sb,
18 struct ctl_table_header *head, struct ctl_table *table) 28 struct ctl_table_header *head, struct ctl_table *table)
19{ 29{
@@ -39,7 +49,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
39 inode->i_fop = &proc_sys_file_operations; 49 inode->i_fop = &proc_sys_file_operations;
40 } else { 50 } else {
41 inode->i_mode |= S_IFDIR; 51 inode->i_mode |= S_IFDIR;
42 inode->i_nlink = 0; 52 clear_nlink(inode);
43 inode->i_op = &proc_sys_dir_operations; 53 inode->i_op = &proc_sys_dir_operations;
44 inode->i_fop = &proc_sys_dir_file_operations; 54 inode->i_fop = &proc_sys_dir_file_operations;
45 } 55 }
@@ -176,6 +186,39 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
176 return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1); 186 return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
177} 187}
178 188
189static int proc_sys_open(struct inode *inode, struct file *filp)
190{
191 struct ctl_table *table = PROC_I(inode)->sysctl_entry;
192
193 if (table->poll)
194 filp->private_data = proc_sys_poll_event(table->poll);
195
196 return 0;
197}
198
199static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
200{
201 struct inode *inode = filp->f_path.dentry->d_inode;
202 struct ctl_table *table = PROC_I(inode)->sysctl_entry;
203 unsigned long event = (unsigned long)filp->private_data;
204 unsigned int ret = DEFAULT_POLLMASK;
205
206 if (!table->proc_handler)
207 goto out;
208
209 if (!table->poll)
210 goto out;
211
212 poll_wait(filp, &table->poll->wait, wait);
213
214 if (event != atomic_read(&table->poll->event)) {
215 filp->private_data = proc_sys_poll_event(table->poll);
216 ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
217 }
218
219out:
220 return ret;
221}
179 222
180static int proc_sys_fill_cache(struct file *filp, void *dirent, 223static int proc_sys_fill_cache(struct file *filp, void *dirent,
181 filldir_t filldir, 224 filldir_t filldir,
@@ -364,12 +407,15 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
364} 407}
365 408
366static const struct file_operations proc_sys_file_operations = { 409static const struct file_operations proc_sys_file_operations = {
410 .open = proc_sys_open,
411 .poll = proc_sys_poll,
367 .read = proc_sys_read, 412 .read = proc_sys_read,
368 .write = proc_sys_write, 413 .write = proc_sys_write,
369 .llseek = default_llseek, 414 .llseek = default_llseek,
370}; 415};
371 416
372static const struct file_operations proc_sys_dir_file_operations = { 417static const struct file_operations proc_sys_dir_file_operations = {
418 .read = generic_read_dir,
373 .readdir = proc_sys_readdir, 419 .readdir = proc_sys_readdir,
374 .llseek = generic_file_llseek, 420 .llseek = generic_file_llseek,
375}; 421};
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 9758b654a1b..42b274da92c 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -10,6 +10,7 @@
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/irqnr.h> 11#include <linux/irqnr.h>
12#include <asm/cputime.h> 12#include <asm/cputime.h>
13#include <linux/tick.h>
13 14
14#ifndef arch_irq_stat_cpu 15#ifndef arch_irq_stat_cpu
15#define arch_irq_stat_cpu(cpu) 0 16#define arch_irq_stat_cpu(cpu) 0
@@ -21,6 +22,35 @@
21#define arch_idle_time(cpu) 0 22#define arch_idle_time(cpu) 0
22#endif 23#endif
23 24
25static cputime64_t get_idle_time(int cpu)
26{
27 u64 idle_time = get_cpu_idle_time_us(cpu, NULL);
28 cputime64_t idle;
29
30 if (idle_time == -1ULL) {
31 /* !NO_HZ so we can rely on cpustat.idle */
32 idle = kstat_cpu(cpu).cpustat.idle;
33 idle = cputime64_add(idle, arch_idle_time(cpu));
34 } else
35 idle = usecs_to_cputime(idle_time);
36
37 return idle;
38}
39
40static cputime64_t get_iowait_time(int cpu)
41{
42 u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL);
43 cputime64_t iowait;
44
45 if (iowait_time == -1ULL)
46 /* !NO_HZ so we can rely on cpustat.iowait */
47 iowait = kstat_cpu(cpu).cpustat.iowait;
48 else
49 iowait = usecs_to_cputime(iowait_time);
50
51 return iowait;
52}
53
24static int show_stat(struct seq_file *p, void *v) 54static int show_stat(struct seq_file *p, void *v)
25{ 55{
26 int i, j; 56 int i, j;
@@ -42,9 +72,8 @@ static int show_stat(struct seq_file *p, void *v)
42 user = cputime64_add(user, kstat_cpu(i).cpustat.user); 72 user = cputime64_add(user, kstat_cpu(i).cpustat.user);
43 nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); 73 nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
44 system = cputime64_add(system, kstat_cpu(i).cpustat.system); 74 system = cputime64_add(system, kstat_cpu(i).cpustat.system);
45 idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle); 75 idle = cputime64_add(idle, get_idle_time(i));
46 idle = cputime64_add(idle, arch_idle_time(i)); 76 iowait = cputime64_add(iowait, get_iowait_time(i));
47 iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
48 irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); 77 irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
49 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); 78 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
50 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); 79 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
@@ -76,14 +105,12 @@ static int show_stat(struct seq_file *p, void *v)
76 (unsigned long long)cputime64_to_clock_t(guest), 105 (unsigned long long)cputime64_to_clock_t(guest),
77 (unsigned long long)cputime64_to_clock_t(guest_nice)); 106 (unsigned long long)cputime64_to_clock_t(guest_nice));
78 for_each_online_cpu(i) { 107 for_each_online_cpu(i) {
79
80 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ 108 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
81 user = kstat_cpu(i).cpustat.user; 109 user = kstat_cpu(i).cpustat.user;
82 nice = kstat_cpu(i).cpustat.nice; 110 nice = kstat_cpu(i).cpustat.nice;
83 system = kstat_cpu(i).cpustat.system; 111 system = kstat_cpu(i).cpustat.system;
84 idle = kstat_cpu(i).cpustat.idle; 112 idle = get_idle_time(i);
85 idle = cputime64_add(idle, arch_idle_time(i)); 113 iowait = get_iowait_time(i);
86 iowait = kstat_cpu(i).cpustat.iowait;
87 irq = kstat_cpu(i).cpustat.irq; 114 irq = kstat_cpu(i).cpustat.irq;
88 softirq = kstat_cpu(i).cpustat.softirq; 115 softirq = kstat_cpu(i).cpustat.softirq;
89 steal = kstat_cpu(i).cpustat.steal; 116 steal = kstat_cpu(i).cpustat.steal;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5afaa58a863..e418c5abdb0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -44,6 +44,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
44 "VmPeak:\t%8lu kB\n" 44 "VmPeak:\t%8lu kB\n"
45 "VmSize:\t%8lu kB\n" 45 "VmSize:\t%8lu kB\n"
46 "VmLck:\t%8lu kB\n" 46 "VmLck:\t%8lu kB\n"
47 "VmPin:\t%8lu kB\n"
47 "VmHWM:\t%8lu kB\n" 48 "VmHWM:\t%8lu kB\n"
48 "VmRSS:\t%8lu kB\n" 49 "VmRSS:\t%8lu kB\n"
49 "VmData:\t%8lu kB\n" 50 "VmData:\t%8lu kB\n"
@@ -55,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
55 hiwater_vm << (PAGE_SHIFT-10), 56 hiwater_vm << (PAGE_SHIFT-10),
56 (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), 57 (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
57 mm->locked_vm << (PAGE_SHIFT-10), 58 mm->locked_vm << (PAGE_SHIFT-10),
59 mm->pinned_vm << (PAGE_SHIFT-10),
58 hiwater_rss << (PAGE_SHIFT-10), 60 hiwater_rss << (PAGE_SHIFT-10),
59 total_rss << (PAGE_SHIFT-10), 61 total_rss << (PAGE_SHIFT-10),
60 data << (PAGE_SHIFT-10), 62 data << (PAGE_SHIFT-10),
@@ -1039,6 +1041,9 @@ static int show_numa_map(struct seq_file *m, void *v)
1039 seq_printf(m, " stack"); 1041 seq_printf(m, " stack");
1040 } 1042 }
1041 1043
1044 if (is_vm_hugetlb_page(vma))
1045 seq_printf(m, " huge");
1046
1042 walk_page_range(vma->vm_start, vma->vm_end, &walk); 1047 walk_page_range(vma->vm_start, vma->vm_end, &walk);
1043 1048
1044 if (!md->pages) 1049 if (!md->pages)
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cd99bf55765..b0f450a2bb7 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -12,6 +12,7 @@
12#include <linux/user.h> 12#include <linux/user.h>
13#include <linux/elf.h> 13#include <linux/elf.h>
14#include <linux/elfcore.h> 14#include <linux/elfcore.h>
15#include <linux/export.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16#include <linux/highmem.h> 17#include <linux/highmem.h>
17#include <linux/bootmem.h> 18#include <linux/bootmem.h>
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 893b961dcfd..379a02dc121 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -24,6 +24,7 @@
24#include <linux/highmem.h> 24#include <linux/highmem.h>
25#include <linux/time.h> 25#include <linux/time.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/list.h>
27#include <linux/string.h> 28#include <linux/string.h>
28#include <linux/mount.h> 29#include <linux/mount.h>
29#include <linux/ramfs.h> 30#include <linux/ramfs.h>
@@ -32,13 +33,18 @@
32#include <linux/magic.h> 33#include <linux/magic.h>
33#include <linux/pstore.h> 34#include <linux/pstore.h>
34#include <linux/slab.h> 35#include <linux/slab.h>
36#include <linux/spinlock.h>
35#include <linux/uaccess.h> 37#include <linux/uaccess.h>
36 38
37#include "internal.h" 39#include "internal.h"
38 40
39#define PSTORE_NAMELEN 64 41#define PSTORE_NAMELEN 64
40 42
43static DEFINE_SPINLOCK(allpstore_lock);
44static LIST_HEAD(allpstore);
45
41struct pstore_private { 46struct pstore_private {
47 struct list_head list;
42 struct pstore_info *psi; 48 struct pstore_info *psi;
43 enum pstore_type_id type; 49 enum pstore_type_id type;
44 u64 id; 50 u64 id;
@@ -81,8 +87,16 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
81 87
82static void pstore_evict_inode(struct inode *inode) 88static void pstore_evict_inode(struct inode *inode)
83{ 89{
90 struct pstore_private *p = inode->i_private;
91 unsigned long flags;
92
84 end_writeback(inode); 93 end_writeback(inode);
85 kfree(inode->i_private); 94 if (p) {
95 spin_lock_irqsave(&allpstore_lock, flags);
96 list_del(&p->list);
97 spin_unlock_irqrestore(&allpstore_lock, flags);
98 kfree(p);
99 }
86} 100}
87 101
88static const struct inode_operations pstore_dir_inode_operations = { 102static const struct inode_operations pstore_dir_inode_operations = {
@@ -182,9 +196,23 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
182 struct dentry *root = pstore_sb->s_root; 196 struct dentry *root = pstore_sb->s_root;
183 struct dentry *dentry; 197 struct dentry *dentry;
184 struct inode *inode; 198 struct inode *inode;
185 int rc; 199 int rc = 0;
186 char name[PSTORE_NAMELEN]; 200 char name[PSTORE_NAMELEN];
187 struct pstore_private *private; 201 struct pstore_private *private, *pos;
202 unsigned long flags;
203
204 spin_lock_irqsave(&allpstore_lock, flags);
205 list_for_each_entry(pos, &allpstore, list) {
206 if (pos->type == type &&
207 pos->id == id &&
208 pos->psi == psi) {
209 rc = -EEXIST;
210 break;
211 }
212 }
213 spin_unlock_irqrestore(&allpstore_lock, flags);
214 if (rc)
215 return rc;
188 216
189 rc = -ENOMEM; 217 rc = -ENOMEM;
190 inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0); 218 inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0);
@@ -229,6 +257,10 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
229 257
230 d_add(dentry, inode); 258 d_add(dentry, inode);
231 259
260 spin_lock_irqsave(&allpstore_lock, flags);
261 list_add(&private->list, &allpstore);
262 spin_unlock_irqrestore(&allpstore_lock, flags);
263
232 mutex_unlock(&root->d_inode->i_mutex); 264 mutex_unlock(&root->d_inode->i_mutex);
233 265
234 return 0; 266 return 0;
@@ -277,7 +309,7 @@ int pstore_fill_super(struct super_block *sb, void *data, int silent)
277 goto fail; 309 goto fail;
278 } 310 }
279 311
280 pstore_get_records(); 312 pstore_get_records(0);
281 313
282 return 0; 314 return 0;
283fail: 315fail:
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 611c1b3c46f..3bde461c3f3 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -1,5 +1,5 @@
1extern void pstore_set_kmsg_bytes(int); 1extern void pstore_set_kmsg_bytes(int);
2extern void pstore_get_records(void); 2extern void pstore_get_records(int);
3extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, 3extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
4 char *data, size_t size, 4 char *data, size_t size,
5 struct timespec time, struct pstore_info *psi); 5 struct timespec time, struct pstore_info *psi);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index c5300ec3169..2bd620f0d79 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -25,12 +25,30 @@
25#include <linux/module.h> 25#include <linux/module.h>
26#include <linux/pstore.h> 26#include <linux/pstore.h>
27#include <linux/string.h> 27#include <linux/string.h>
28#include <linux/timer.h>
28#include <linux/slab.h> 29#include <linux/slab.h>
29#include <linux/uaccess.h> 30#include <linux/uaccess.h>
31#include <linux/hardirq.h>
32#include <linux/workqueue.h>
30 33
31#include "internal.h" 34#include "internal.h"
32 35
33/* 36/*
37 * We defer making "oops" entries appear in pstore - see
38 * whether the system is actually still running well enough
39 * to let someone see the entry
40 */
41#define PSTORE_INTERVAL (60 * HZ)
42
43static int pstore_new_entry;
44
45static void pstore_timefunc(unsigned long);
46static DEFINE_TIMER(pstore_timer, pstore_timefunc, 0, 0);
47
48static void pstore_dowork(struct work_struct *);
49static DECLARE_WORK(pstore_work, pstore_dowork);
50
51/*
34 * pstore_lock just protects "psinfo" during 52 * pstore_lock just protects "psinfo" during
35 * calls to pstore_register() 53 * calls to pstore_register()
36 */ 54 */
@@ -69,15 +87,22 @@ static void pstore_dump(struct kmsg_dumper *dumper,
69 unsigned long size, total = 0; 87 unsigned long size, total = 0;
70 char *dst, *why; 88 char *dst, *why;
71 u64 id; 89 u64 id;
72 int hsize; 90 int hsize, ret;
73 unsigned int part = 1; 91 unsigned int part = 1;
92 unsigned long flags = 0;
93 int is_locked = 0;
74 94
75 if (reason < ARRAY_SIZE(reason_str)) 95 if (reason < ARRAY_SIZE(reason_str))
76 why = reason_str[reason]; 96 why = reason_str[reason];
77 else 97 else
78 why = "Unknown"; 98 why = "Unknown";
79 99
80 mutex_lock(&psinfo->buf_mutex); 100 if (in_nmi()) {
101 is_locked = spin_trylock(&psinfo->buf_lock);
102 if (!is_locked)
103 pr_err("pstore dump routine blocked in NMI, may corrupt error record\n");
104 } else
105 spin_lock_irqsave(&psinfo->buf_lock, flags);
81 oopscount++; 106 oopscount++;
82 while (total < kmsg_bytes) { 107 while (total < kmsg_bytes) {
83 dst = psinfo->buf; 108 dst = psinfo->buf;
@@ -97,18 +122,20 @@ static void pstore_dump(struct kmsg_dumper *dumper,
97 memcpy(dst, s1 + s1_start, l1_cpy); 122 memcpy(dst, s1 + s1_start, l1_cpy);
98 memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); 123 memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy);
99 124
100 id = psinfo->write(PSTORE_TYPE_DMESG, part, 125 ret = psinfo->write(PSTORE_TYPE_DMESG, &id, part,
101 hsize + l1_cpy + l2_cpy, psinfo); 126 hsize + l1_cpy + l2_cpy, psinfo);
102 if (reason == KMSG_DUMP_OOPS && pstore_is_mounted()) 127 if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
103 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, 128 pstore_new_entry = 1;
104 psinfo->buf, hsize + l1_cpy + l2_cpy,
105 CURRENT_TIME, psinfo);
106 l1 -= l1_cpy; 129 l1 -= l1_cpy;
107 l2 -= l2_cpy; 130 l2 -= l2_cpy;
108 total += l1_cpy + l2_cpy; 131 total += l1_cpy + l2_cpy;
109 part++; 132 part++;
110 } 133 }
111 mutex_unlock(&psinfo->buf_mutex); 134 if (in_nmi()) {
135 if (is_locked)
136 spin_unlock(&psinfo->buf_lock);
137 } else
138 spin_unlock_irqrestore(&psinfo->buf_lock, flags);
112} 139}
113 140
114static struct kmsg_dumper pstore_dumper = { 141static struct kmsg_dumper pstore_dumper = {
@@ -148,19 +175,24 @@ int pstore_register(struct pstore_info *psi)
148 } 175 }
149 176
150 if (pstore_is_mounted()) 177 if (pstore_is_mounted())
151 pstore_get_records(); 178 pstore_get_records(0);
152 179
153 kmsg_dump_register(&pstore_dumper); 180 kmsg_dump_register(&pstore_dumper);
154 181
182 pstore_timer.expires = jiffies + PSTORE_INTERVAL;
183 add_timer(&pstore_timer);
184
155 return 0; 185 return 0;
156} 186}
157EXPORT_SYMBOL_GPL(pstore_register); 187EXPORT_SYMBOL_GPL(pstore_register);
158 188
159/* 189/*
160 * Read all the records from the persistent store. Create and 190 * Read all the records from the persistent store. Create
161 * file files in our filesystem. 191 * files in our filesystem. Don't warn about -EEXIST errors
192 * when we are re-scanning the backing store looking to add new
193 * error records.
162 */ 194 */
163void pstore_get_records(void) 195void pstore_get_records(int quiet)
164{ 196{
165 struct pstore_info *psi = psinfo; 197 struct pstore_info *psi = psinfo;
166 ssize_t size; 198 ssize_t size;
@@ -168,36 +200,55 @@ void pstore_get_records(void)
168 enum pstore_type_id type; 200 enum pstore_type_id type;
169 struct timespec time; 201 struct timespec time;
170 int failed = 0, rc; 202 int failed = 0, rc;
203 unsigned long flags;
171 204
172 if (!psi) 205 if (!psi)
173 return; 206 return;
174 207
175 mutex_lock(&psinfo->buf_mutex); 208 spin_lock_irqsave(&psinfo->buf_lock, flags);
176 rc = psi->open(psi); 209 rc = psi->open(psi);
177 if (rc) 210 if (rc)
178 goto out; 211 goto out;
179 212
180 while ((size = psi->read(&id, &type, &time, psi)) > 0) { 213 while ((size = psi->read(&id, &type, &time, psi)) > 0) {
181 if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, 214 rc = pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size,
182 time, psi)) 215 time, psi);
216 if (rc && (rc != -EEXIST || !quiet))
183 failed++; 217 failed++;
184 } 218 }
185 psi->close(psi); 219 psi->close(psi);
186out: 220out:
187 mutex_unlock(&psinfo->buf_mutex); 221 spin_unlock_irqrestore(&psinfo->buf_lock, flags);
188 222
189 if (failed) 223 if (failed)
190 printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n", 224 printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n",
191 failed, psi->name); 225 failed, psi->name);
192} 226}
193 227
228static void pstore_dowork(struct work_struct *work)
229{
230 pstore_get_records(1);
231}
232
233static void pstore_timefunc(unsigned long dummy)
234{
235 if (pstore_new_entry) {
236 pstore_new_entry = 0;
237 schedule_work(&pstore_work);
238 }
239
240 mod_timer(&pstore_timer, jiffies + PSTORE_INTERVAL);
241}
242
194/* 243/*
195 * Call platform driver to write a record to the 244 * Call platform driver to write a record to the
196 * persistent store. 245 * persistent store.
197 */ 246 */
198int pstore_write(enum pstore_type_id type, char *buf, size_t size) 247int pstore_write(enum pstore_type_id type, char *buf, size_t size)
199{ 248{
200 u64 id; 249 u64 id;
250 int ret;
251 unsigned long flags;
201 252
202 if (!psinfo) 253 if (!psinfo)
203 return -ENODEV; 254 return -ENODEV;
@@ -205,13 +256,13 @@ int pstore_write(enum pstore_type_id type, char *buf, size_t size)
205 if (size > psinfo->bufsize) 256 if (size > psinfo->bufsize)
206 return -EFBIG; 257 return -EFBIG;
207 258
208 mutex_lock(&psinfo->buf_mutex); 259 spin_lock_irqsave(&psinfo->buf_lock, flags);
209 memcpy(psinfo->buf, buf, size); 260 memcpy(psinfo->buf, buf, size);
210 id = psinfo->write(type, 0, size, psinfo); 261 ret = psinfo->write(type, &id, 0, size, psinfo);
211 if (pstore_is_mounted()) 262 if (ret == 0 && pstore_is_mounted())
212 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, 263 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf,
213 size, CURRENT_TIME, psinfo); 264 size, CURRENT_TIME, psinfo);
214 mutex_unlock(&psinfo->buf_mutex); 265 spin_unlock_irqrestore(&psinfo->buf_lock, flags);
215 266
216 return 0; 267 return 0;
217} 268}
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2b0646613f5..3bdd2141843 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -379,7 +379,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
379 inode->i_mode = le16_to_cpu(raw_inode->di_mode); 379 inode->i_mode = le16_to_cpu(raw_inode->di_mode);
380 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->di_uid); 380 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->di_uid);
381 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->di_gid); 381 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->di_gid);
382 inode->i_nlink = le16_to_cpu(raw_inode->di_nlink); 382 set_nlink(inode, le16_to_cpu(raw_inode->di_nlink));
383 inode->i_size = le32_to_cpu(raw_inode->di_size); 383 inode->i_size = le32_to_cpu(raw_inode->di_size);
384 inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->di_mtime); 384 inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->di_mtime);
385 inode->i_mtime.tv_nsec = 0; 385 inode->i_mtime.tv_nsec = 0;
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 10b6be3ca28..35f4b0ecdeb 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -286,7 +286,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
286 /* caller already holds s_umount */ 286 /* caller already holds s_umount */
287 if (sb->s_flags & MS_RDONLY) 287 if (sb->s_flags & MS_RDONLY)
288 return -EROFS; 288 return -EROFS;
289 writeback_inodes_sb(sb); 289 writeback_inodes_sb(sb, WB_REASON_SYNC);
290 return 0; 290 return 0;
291 default: 291 default:
292 return -EINVAL; 292 return -EINVAL;
@@ -363,12 +363,15 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
363 } 363 }
364 364
365 sb = quotactl_block(special); 365 sb = quotactl_block(special);
366 if (IS_ERR(sb)) 366 if (IS_ERR(sb)) {
367 return PTR_ERR(sb); 367 ret = PTR_ERR(sb);
368 goto out;
369 }
368 370
369 ret = do_quotactl(sb, type, cmds, id, addr, pathp); 371 ret = do_quotactl(sb, type, cmds, id, addr, pathp);
370 372
371 drop_super(sb); 373 drop_super(sb);
374out:
372 if (pathp && !IS_ERR(pathp)) 375 if (pathp && !IS_ERR(pathp))
373 path_put(pathp); 376 path_put(pathp);
374 return ret; 377 return ret;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index eacb166fb25..462ceb38fec 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -23,7 +23,6 @@
23 * caches is sufficient. 23 * caches is sufficient.
24 */ 24 */
25 25
26#include <linux/module.h>
27#include <linux/fs.h> 26#include <linux/fs.h>
28#include <linux/pagemap.h> 27#include <linux/pagemap.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
@@ -288,14 +287,7 @@ static int __init init_ramfs_fs(void)
288{ 287{
289 return register_filesystem(&ramfs_fs_type); 288 return register_filesystem(&ramfs_fs_type);
290} 289}
291
292static void __exit exit_ramfs_fs(void)
293{
294 unregister_filesystem(&ramfs_fs_type);
295}
296
297module_init(init_ramfs_fs) 290module_init(init_ramfs_fs)
298module_exit(exit_ramfs_fs)
299 291
300int __init init_rootfs(void) 292int __init init_rootfs(void)
301{ 293{
@@ -311,5 +303,3 @@ int __init init_rootfs(void)
311 303
312 return err; 304 return err;
313} 305}
314
315MODULE_LICENSE("GPL");
diff --git a/fs/read_write.c b/fs/read_write.c
index 179f1c33ea5..5ad4248b0cd 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -35,23 +35,45 @@ static inline int unsigned_offsets(struct file *file)
35 return file->f_mode & FMODE_UNSIGNED_OFFSET; 35 return file->f_mode & FMODE_UNSIGNED_OFFSET;
36} 36}
37 37
38static loff_t lseek_execute(struct file *file, struct inode *inode,
39 loff_t offset, loff_t maxsize)
40{
41 if (offset < 0 && !unsigned_offsets(file))
42 return -EINVAL;
43 if (offset > maxsize)
44 return -EINVAL;
45
46 if (offset != file->f_pos) {
47 file->f_pos = offset;
48 file->f_version = 0;
49 }
50 return offset;
51}
52
38/** 53/**
39 * generic_file_llseek_unlocked - lockless generic llseek implementation 54 * generic_file_llseek_size - generic llseek implementation for regular files
40 * @file: file structure to seek on 55 * @file: file structure to seek on
41 * @offset: file offset to seek to 56 * @offset: file offset to seek to
42 * @origin: type of seek 57 * @origin: type of seek
58 * @size: max size of file system
59 *
60 * This is a variant of generic_file_llseek that allows passing in a custom
61 * file size.
43 * 62 *
44 * Updates the file offset to the value specified by @offset and @origin. 63 * Synchronization:
45 * Locking must be provided by the caller. 64 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
65 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
66 * read/writes behave like SEEK_SET against seeks.
46 */ 67 */
47loff_t 68loff_t
48generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) 69generic_file_llseek_size(struct file *file, loff_t offset, int origin,
70 loff_t maxsize)
49{ 71{
50 struct inode *inode = file->f_mapping->host; 72 struct inode *inode = file->f_mapping->host;
51 73
52 switch (origin) { 74 switch (origin) {
53 case SEEK_END: 75 case SEEK_END:
54 offset += inode->i_size; 76 offset += i_size_read(inode);
55 break; 77 break;
56 case SEEK_CUR: 78 case SEEK_CUR:
57 /* 79 /*
@@ -62,14 +84,22 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
62 */ 84 */
63 if (offset == 0) 85 if (offset == 0)
64 return file->f_pos; 86 return file->f_pos;
65 offset += file->f_pos; 87 /*
66 break; 88 * f_lock protects against read/modify/write race with other
89 * SEEK_CURs. Note that parallel writes and reads behave
90 * like SEEK_SET.
91 */
92 spin_lock(&file->f_lock);
93 offset = lseek_execute(file, inode, file->f_pos + offset,
94 maxsize);
95 spin_unlock(&file->f_lock);
96 return offset;
67 case SEEK_DATA: 97 case SEEK_DATA:
68 /* 98 /*
69 * In the generic case the entire file is data, so as long as 99 * In the generic case the entire file is data, so as long as
70 * offset isn't at the end of the file then the offset is data. 100 * offset isn't at the end of the file then the offset is data.
71 */ 101 */
72 if (offset >= inode->i_size) 102 if (offset >= i_size_read(inode))
73 return -ENXIO; 103 return -ENXIO;
74 break; 104 break;
75 case SEEK_HOLE: 105 case SEEK_HOLE:
@@ -77,26 +107,15 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
77 * There is a virtual hole at the end of the file, so as long as 107 * There is a virtual hole at the end of the file, so as long as
78 * offset isn't i_size or larger, return i_size. 108 * offset isn't i_size or larger, return i_size.
79 */ 109 */
80 if (offset >= inode->i_size) 110 if (offset >= i_size_read(inode))
81 return -ENXIO; 111 return -ENXIO;
82 offset = inode->i_size; 112 offset = i_size_read(inode);
83 break; 113 break;
84 } 114 }
85 115
86 if (offset < 0 && !unsigned_offsets(file)) 116 return lseek_execute(file, inode, offset, maxsize);
87 return -EINVAL;
88 if (offset > inode->i_sb->s_maxbytes)
89 return -EINVAL;
90
91 /* Special lock needed here? */
92 if (offset != file->f_pos) {
93 file->f_pos = offset;
94 file->f_version = 0;
95 }
96
97 return offset;
98} 117}
99EXPORT_SYMBOL(generic_file_llseek_unlocked); 118EXPORT_SYMBOL(generic_file_llseek_size);
100 119
101/** 120/**
102 * generic_file_llseek - generic llseek implementation for regular files 121 * generic_file_llseek - generic llseek implementation for regular files
@@ -110,13 +129,10 @@ EXPORT_SYMBOL(generic_file_llseek_unlocked);
110 */ 129 */
111loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) 130loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
112{ 131{
113 loff_t rval; 132 struct inode *inode = file->f_mapping->host;
114
115 mutex_lock(&file->f_dentry->d_inode->i_mutex);
116 rval = generic_file_llseek_unlocked(file, offset, origin);
117 mutex_unlock(&file->f_dentry->d_inode->i_mutex);
118 133
119 return rval; 134 return generic_file_llseek_size(file, offset, origin,
135 inode->i_sb->s_maxbytes);
120} 136}
121EXPORT_SYMBOL(generic_file_llseek); 137EXPORT_SYMBOL(generic_file_llseek);
122 138
@@ -617,7 +633,8 @@ ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
617ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, 633ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
618 unsigned long nr_segs, unsigned long fast_segs, 634 unsigned long nr_segs, unsigned long fast_segs,
619 struct iovec *fast_pointer, 635 struct iovec *fast_pointer,
620 struct iovec **ret_pointer) 636 struct iovec **ret_pointer,
637 int check_access)
621{ 638{
622 unsigned long seg; 639 unsigned long seg;
623 ssize_t ret; 640 ssize_t ret;
@@ -673,7 +690,8 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
673 ret = -EINVAL; 690 ret = -EINVAL;
674 goto out; 691 goto out;
675 } 692 }
676 if (unlikely(!access_ok(vrfy_dir(type), buf, len))) { 693 if (check_access
694 && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
677 ret = -EFAULT; 695 ret = -EFAULT;
678 goto out; 696 goto out;
679 } 697 }
@@ -705,7 +723,7 @@ static ssize_t do_readv_writev(int type, struct file *file,
705 } 723 }
706 724
707 ret = rw_copy_check_uvector(type, uvector, nr_segs, 725 ret = rw_copy_check_uvector(type, uvector, nr_segs,
708 ARRAY_SIZE(iovstack), iovstack, &iov); 726 ARRAY_SIZE(iovstack), iovstack, &iov, 1);
709 if (ret <= 0) 727 if (ret <= 0)
710 goto out; 728 goto out;
711 729
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 9b0d4b78b4f..950f13af095 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1154,7 +1154,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
1154 set_inode_item_key_version(inode, KEY_FORMAT_3_5); 1154 set_inode_item_key_version(inode, KEY_FORMAT_3_5);
1155 set_inode_sd_version(inode, STAT_DATA_V1); 1155 set_inode_sd_version(inode, STAT_DATA_V1);
1156 inode->i_mode = sd_v1_mode(sd); 1156 inode->i_mode = sd_v1_mode(sd);
1157 inode->i_nlink = sd_v1_nlink(sd); 1157 set_nlink(inode, sd_v1_nlink(sd));
1158 inode->i_uid = sd_v1_uid(sd); 1158 inode->i_uid = sd_v1_uid(sd);
1159 inode->i_gid = sd_v1_gid(sd); 1159 inode->i_gid = sd_v1_gid(sd);
1160 inode->i_size = sd_v1_size(sd); 1160 inode->i_size = sd_v1_size(sd);
@@ -1199,7 +1199,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
1199 struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih); 1199 struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih);
1200 1200
1201 inode->i_mode = sd_v2_mode(sd); 1201 inode->i_mode = sd_v2_mode(sd);
1202 inode->i_nlink = sd_v2_nlink(sd); 1202 set_nlink(inode, sd_v2_nlink(sd));
1203 inode->i_uid = sd_v2_uid(sd); 1203 inode->i_uid = sd_v2_uid(sd);
1204 inode->i_size = sd_v2_size(sd); 1204 inode->i_size = sd_v2_size(sd);
1205 inode->i_gid = sd_v2_gid(sd); 1205 inode->i_gid = sd_v2_gid(sd);
@@ -1444,7 +1444,7 @@ void reiserfs_read_locked_inode(struct inode *inode,
1444 /* a stale NFS handle can trigger this without it being an error */ 1444 /* a stale NFS handle can trigger this without it being an error */
1445 pathrelse(&path_to_sd); 1445 pathrelse(&path_to_sd);
1446 reiserfs_make_bad_inode(inode); 1446 reiserfs_make_bad_inode(inode);
1447 inode->i_nlink = 0; 1447 clear_nlink(inode);
1448 return; 1448 return;
1449 } 1449 }
1450 1450
@@ -1832,7 +1832,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1832#endif 1832#endif
1833 1833
1834 /* fill stat data */ 1834 /* fill stat data */
1835 inode->i_nlink = (S_ISDIR(mode) ? 2 : 1); 1835 set_nlink(inode, (S_ISDIR(mode) ? 2 : 1));
1836 1836
1837 /* uid and gid must already be set by the caller for quota init */ 1837 /* uid and gid must already be set by the caller for quota init */
1838 1838
@@ -1987,7 +1987,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1987 make_bad_inode(inode); 1987 make_bad_inode(inode);
1988 1988
1989 out_inserted_sd: 1989 out_inserted_sd:
1990 inode->i_nlink = 0; 1990 clear_nlink(inode);
1991 th->t_trans_id = 0; /* so the caller can't use this handle later */ 1991 th->t_trans_id = 0; /* so the caller can't use this handle later */
1992 unlock_new_inode(inode); /* OK to do even if we hadn't locked it */ 1992 unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
1993 iput(inode); 1993 iput(inode);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ef392324bbf..80058e8ce36 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -19,7 +19,7 @@
19#include <linux/reiserfs_xattr.h> 19#include <linux/reiserfs_xattr.h>
20#include <linux/quotaops.h> 20#include <linux/quotaops.h>
21 21
22#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; } 22#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); }
23#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i); 23#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i);
24 24
25// directory item contains array of entry headers. This performs 25// directory item contains array of entry headers. This performs
@@ -622,7 +622,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
622 dentry->d_name.len, inode, 1 /*visible */ ); 622 dentry->d_name.len, inode, 1 /*visible */ );
623 if (retval) { 623 if (retval) {
624 int err; 624 int err;
625 inode->i_nlink--; 625 drop_nlink(inode);
626 reiserfs_update_sd(&th, inode); 626 reiserfs_update_sd(&th, inode);
627 err = journal_end(&th, dir->i_sb, jbegin_count); 627 err = journal_end(&th, dir->i_sb, jbegin_count);
628 if (err) 628 if (err)
@@ -702,7 +702,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
702 dentry->d_name.len, inode, 1 /*visible */ ); 702 dentry->d_name.len, inode, 1 /*visible */ );
703 if (retval) { 703 if (retval) {
704 int err; 704 int err;
705 inode->i_nlink--; 705 drop_nlink(inode);
706 reiserfs_update_sd(&th, inode); 706 reiserfs_update_sd(&th, inode);
707 err = journal_end(&th, dir->i_sb, jbegin_count); 707 err = journal_end(&th, dir->i_sb, jbegin_count);
708 if (err) 708 if (err)
@@ -787,7 +787,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
787 dentry->d_name.len, inode, 1 /*visible */ ); 787 dentry->d_name.len, inode, 1 /*visible */ );
788 if (retval) { 788 if (retval) {
789 int err; 789 int err;
790 inode->i_nlink = 0; 790 clear_nlink(inode);
791 DEC_DIR_INODE_NLINK(dir); 791 DEC_DIR_INODE_NLINK(dir);
792 reiserfs_update_sd(&th, inode); 792 reiserfs_update_sd(&th, inode);
793 err = journal_end(&th, dir->i_sb, jbegin_count); 793 err = journal_end(&th, dir->i_sb, jbegin_count);
@@ -964,7 +964,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
964 reiserfs_warning(inode->i_sb, "reiserfs-7042", 964 reiserfs_warning(inode->i_sb, "reiserfs-7042",
965 "deleting nonexistent file (%lu), %d", 965 "deleting nonexistent file (%lu), %d",
966 inode->i_ino, inode->i_nlink); 966 inode->i_ino, inode->i_nlink);
967 inode->i_nlink = 1; 967 set_nlink(inode, 1);
968 } 968 }
969 969
970 drop_nlink(inode); 970 drop_nlink(inode);
@@ -1086,7 +1086,7 @@ static int reiserfs_symlink(struct inode *parent_dir,
1086 dentry->d_name.len, inode, 1 /*visible */ ); 1086 dentry->d_name.len, inode, 1 /*visible */ );
1087 if (retval) { 1087 if (retval) {
1088 int err; 1088 int err;
1089 inode->i_nlink--; 1089 drop_nlink(inode);
1090 reiserfs_update_sd(&th, inode); 1090 reiserfs_update_sd(&th, inode);
1091 err = journal_end(&th, parent_dir->i_sb, jbegin_count); 1091 err = journal_end(&th, parent_dir->i_sb, jbegin_count);
1092 if (err) 1092 if (err)
@@ -1129,7 +1129,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
1129 1129
1130 retval = journal_begin(&th, dir->i_sb, jbegin_count); 1130 retval = journal_begin(&th, dir->i_sb, jbegin_count);
1131 if (retval) { 1131 if (retval) {
1132 inode->i_nlink--; 1132 drop_nlink(inode);
1133 reiserfs_write_unlock(dir->i_sb); 1133 reiserfs_write_unlock(dir->i_sb);
1134 return retval; 1134 return retval;
1135 } 1135 }
@@ -1144,7 +1144,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
1144 1144
1145 if (retval) { 1145 if (retval) {
1146 int err; 1146 int err;
1147 inode->i_nlink--; 1147 drop_nlink(inode);
1148 err = journal_end(&th, dir->i_sb, jbegin_count); 1148 err = journal_end(&th, dir->i_sb, jbegin_count);
1149 reiserfs_write_unlock(dir->i_sb); 1149 reiserfs_write_unlock(dir->i_sb);
1150 return err ? err : retval; 1150 return err ? err : retval;
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 2305e3121cb..8b4089f3040 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -337,7 +337,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
337 inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK; 337 inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK;
338 inode->i_dataoffset = pos + inode->i_metasize; 338 inode->i_dataoffset = pos + inode->i_metasize;
339 339
340 i->i_nlink = 1; /* Hard to decide.. */ 340 set_nlink(i, 1); /* Hard to decide.. */
341 i->i_size = be32_to_cpu(ri.size); 341 i->i_size = be32_to_cpu(ri.size);
342 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; 342 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
343 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; 343 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 048b59d5b2f..c70111ebefd 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -78,6 +78,28 @@ config SQUASHFS_XZ
78 78
79 If unsure, say N. 79 If unsure, say N.
80 80
81config SQUASHFS_4K_DEVBLK_SIZE
82 bool "Use 4K device block size?"
83 depends on SQUASHFS
84 help
85 By default Squashfs sets the dev block size (sb_min_blocksize)
86 to 1K or the smallest block size supported by the block device
87 (if larger). This, because blocks are packed together and
88 unaligned in Squashfs, should reduce latency.
89
90 This, however, gives poor performance on MTD NAND devices where
91 the optimal I/O size is 4K (even though the devices can support
92 smaller block sizes).
93
94 Using a 4K device block size may also improve overall I/O
95 performance for some file access patterns (e.g. sequential
96 accesses of files in filesystem order) on all media.
97
98 Setting this option will force Squashfs to use a 4K device block
99 size by default.
100
101 If unsure, say N.
102
81config SQUASHFS_EMBEDDED 103config SQUASHFS_EMBEDDED
82 bool "Additional option for memory-constrained systems" 104 bool "Additional option for memory-constrained systems"
83 depends on SQUASHFS 105 depends on SQUASHFS
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 04bebcaa237..fd7b3b3bda1 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -159,7 +159,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
159 frag_offset = 0; 159 frag_offset = 0;
160 } 160 }
161 161
162 inode->i_nlink = 1; 162 set_nlink(inode, 1);
163 inode->i_size = le32_to_cpu(sqsh_ino->file_size); 163 inode->i_size = le32_to_cpu(sqsh_ino->file_size);
164 inode->i_fop = &generic_ro_fops; 164 inode->i_fop = &generic_ro_fops;
165 inode->i_mode |= S_IFREG; 165 inode->i_mode |= S_IFREG;
@@ -203,7 +203,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
203 } 203 }
204 204
205 xattr_id = le32_to_cpu(sqsh_ino->xattr); 205 xattr_id = le32_to_cpu(sqsh_ino->xattr);
206 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 206 set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
207 inode->i_size = le64_to_cpu(sqsh_ino->file_size); 207 inode->i_size = le64_to_cpu(sqsh_ino->file_size);
208 inode->i_op = &squashfs_inode_ops; 208 inode->i_op = &squashfs_inode_ops;
209 inode->i_fop = &generic_ro_fops; 209 inode->i_fop = &generic_ro_fops;
@@ -232,7 +232,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
232 if (err < 0) 232 if (err < 0)
233 goto failed_read; 233 goto failed_read;
234 234
235 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 235 set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
236 inode->i_size = le16_to_cpu(sqsh_ino->file_size); 236 inode->i_size = le16_to_cpu(sqsh_ino->file_size);
237 inode->i_op = &squashfs_dir_inode_ops; 237 inode->i_op = &squashfs_dir_inode_ops;
238 inode->i_fop = &squashfs_dir_ops; 238 inode->i_fop = &squashfs_dir_ops;
@@ -257,7 +257,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
257 goto failed_read; 257 goto failed_read;
258 258
259 xattr_id = le32_to_cpu(sqsh_ino->xattr); 259 xattr_id = le32_to_cpu(sqsh_ino->xattr);
260 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 260 set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
261 inode->i_size = le32_to_cpu(sqsh_ino->file_size); 261 inode->i_size = le32_to_cpu(sqsh_ino->file_size);
262 inode->i_op = &squashfs_dir_inode_ops; 262 inode->i_op = &squashfs_dir_inode_ops;
263 inode->i_fop = &squashfs_dir_ops; 263 inode->i_fop = &squashfs_dir_ops;
@@ -284,7 +284,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
284 if (err < 0) 284 if (err < 0)
285 goto failed_read; 285 goto failed_read;
286 286
287 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 287 set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
288 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); 288 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
289 inode->i_op = &squashfs_symlink_inode_ops; 289 inode->i_op = &squashfs_symlink_inode_ops;
290 inode->i_data.a_ops = &squashfs_symlink_aops; 290 inode->i_data.a_ops = &squashfs_symlink_aops;
@@ -325,7 +325,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
325 inode->i_mode |= S_IFCHR; 325 inode->i_mode |= S_IFCHR;
326 else 326 else
327 inode->i_mode |= S_IFBLK; 327 inode->i_mode |= S_IFBLK;
328 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 328 set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
329 rdev = le32_to_cpu(sqsh_ino->rdev); 329 rdev = le32_to_cpu(sqsh_ino->rdev);
330 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); 330 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
331 331
@@ -349,7 +349,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
349 inode->i_mode |= S_IFBLK; 349 inode->i_mode |= S_IFBLK;
350 xattr_id = le32_to_cpu(sqsh_ino->xattr); 350 xattr_id = le32_to_cpu(sqsh_ino->xattr);
351 inode->i_op = &squashfs_inode_ops; 351 inode->i_op = &squashfs_inode_ops;
352 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 352 set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
353 rdev = le32_to_cpu(sqsh_ino->rdev); 353 rdev = le32_to_cpu(sqsh_ino->rdev);
354 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); 354 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
355 355
@@ -370,7 +370,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
370 inode->i_mode |= S_IFIFO; 370 inode->i_mode |= S_IFIFO;
371 else 371 else
372 inode->i_mode |= S_IFSOCK; 372 inode->i_mode |= S_IFSOCK;
373 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 373 set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
374 init_special_inode(inode, inode->i_mode, 0); 374 init_special_inode(inode, inode->i_mode, 0);
375 break; 375 break;
376 } 376 }
@@ -389,7 +389,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
389 inode->i_mode |= S_IFSOCK; 389 inode->i_mode |= S_IFSOCK;
390 xattr_id = le32_to_cpu(sqsh_ino->xattr); 390 xattr_id = le32_to_cpu(sqsh_ino->xattr);
391 inode->i_op = &squashfs_inode_ops; 391 inode->i_op = &squashfs_inode_ops;
392 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 392 set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
393 init_special_inode(inode, inode->i_mode, 0); 393 init_special_inode(inode, inode->i_mode, 0);
394 break; 394 break;
395 } 395 }
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index b4a4e539a08..e8e14645de9 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -36,6 +36,13 @@
36#define SQUASHFS_FILE_SIZE 131072 36#define SQUASHFS_FILE_SIZE 131072
37#define SQUASHFS_FILE_LOG 17 37#define SQUASHFS_FILE_LOG 17
38 38
39/* default size of block device I/O */
40#ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE
41#define SQUASHFS_DEVBLK_SIZE 4096
42#else
43#define SQUASHFS_DEVBLK_SIZE 1024
44#endif
45
39#define SQUASHFS_FILE_MAX_SIZE 1048576 46#define SQUASHFS_FILE_MAX_SIZE 1048576
40#define SQUASHFS_FILE_MAX_LOG 20 47#define SQUASHFS_FILE_MAX_LOG 20
41 48
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 7438850c62d..2da1715452a 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -95,7 +95,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
95 } 95 }
96 msblk = sb->s_fs_info; 96 msblk = sb->s_fs_info;
97 97
98 msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE); 98 msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE);
99 msblk->devblksize_log2 = ffz(~msblk->devblksize); 99 msblk->devblksize_log2 = ffz(~msblk->devblksize);
100 100
101 mutex_init(&msblk->read_data_mutex); 101 mutex_init(&msblk->read_data_mutex);
diff --git a/fs/stack.c b/fs/stack.c
index b4f2ab48a61..9c11519245a 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -71,6 +71,6 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src)
71 dest->i_ctime = src->i_ctime; 71 dest->i_ctime = src->i_ctime;
72 dest->i_blkbits = src->i_blkbits; 72 dest->i_blkbits = src->i_blkbits;
73 dest->i_flags = src->i_flags; 73 dest->i_flags = src->i_flags;
74 dest->i_nlink = src->i_nlink; 74 set_nlink(dest, src->i_nlink);
75} 75}
76EXPORT_SYMBOL_GPL(fsstack_copy_attr_all); 76EXPORT_SYMBOL_GPL(fsstack_copy_attr_all);
diff --git a/fs/stat.c b/fs/stat.c
index 78a3aa83c7e..8806b8997d2 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -294,15 +294,16 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
294{ 294{
295 struct path path; 295 struct path path;
296 int error; 296 int error;
297 int empty = 0;
297 298
298 if (bufsiz <= 0) 299 if (bufsiz <= 0)
299 return -EINVAL; 300 return -EINVAL;
300 301
301 error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path); 302 error = user_path_at_empty(dfd, pathname, LOOKUP_EMPTY, &path, &empty);
302 if (!error) { 303 if (!error) {
303 struct inode *inode = path.dentry->d_inode; 304 struct inode *inode = path.dentry->d_inode;
304 305
305 error = -EINVAL; 306 error = empty ? -ENOENT : -EINVAL;
306 if (inode->i_op->readlink) { 307 if (inode->i_op->readlink) {
307 error = security_inode_readlink(path.dentry); 308 error = security_inode_readlink(path.dentry);
308 if (!error) { 309 if (!error) {
diff --git a/fs/statfs.c b/fs/statfs.c
index 8244924dec5..9cf04a11896 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -76,7 +76,7 @@ EXPORT_SYMBOL(vfs_statfs);
76int user_statfs(const char __user *pathname, struct kstatfs *st) 76int user_statfs(const char __user *pathname, struct kstatfs *st)
77{ 77{
78 struct path path; 78 struct path path;
79 int error = user_path(pathname, &path); 79 int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
80 if (!error) { 80 if (!error) {
81 error = vfs_statfs(&path, st); 81 error = vfs_statfs(&path, st);
82 path_put(&path); 82 path_put(&path);
diff --git a/fs/super.c b/fs/super.c
index 3f56a269a4f..afd0f1ad45e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -61,7 +61,7 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
61 return -1; 61 return -1;
62 62
63 if (!grab_super_passive(sb)) 63 if (!grab_super_passive(sb))
64 return -1; 64 return !sc->nr_to_scan ? 0 : -1;
65 65
66 if (sb->s_op && sb->s_op->nr_cached_objects) 66 if (sb->s_op && sb->s_op->nr_cached_objects)
67 fs_objects = sb->s_op->nr_cached_objects(sb); 67 fs_objects = sb->s_op->nr_cached_objects(sb);
@@ -727,8 +727,13 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
727 727
728 if (sb->s_op->remount_fs) { 728 if (sb->s_op->remount_fs) {
729 retval = sb->s_op->remount_fs(sb, &flags, data); 729 retval = sb->s_op->remount_fs(sb, &flags, data);
730 if (retval) 730 if (retval) {
731 return retval; 731 if (!force)
732 return retval;
733 /* If forced remount, go ahead despite any errors */
734 WARN(1, "forced remount of a %s fs returned %i\n",
735 sb->s_type->name, retval);
736 }
732 } 737 }
733 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); 738 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
734 739
diff --git a/fs/sync.c b/fs/sync.c
index c98a7477edf..101b8ef901d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -43,7 +43,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
43 if (wait) 43 if (wait)
44 sync_inodes_sb(sb); 44 sync_inodes_sb(sb);
45 else 45 else
46 writeback_inodes_sb(sb); 46 writeback_inodes_sb(sb, WB_REASON_SYNC);
47 47
48 if (sb->s_op->sync_fs) 48 if (sb->s_op->sync_fs)
49 sb->s_op->sync_fs(sb, wait); 49 sb->s_op->sync_fs(sb, wait);
@@ -98,7 +98,7 @@ static void sync_filesystems(int wait)
98 */ 98 */
99SYSCALL_DEFINE0(sync) 99SYSCALL_DEFINE0(sync)
100{ 100{
101 wakeup_flusher_threads(0); 101 wakeup_flusher_threads(0, WB_REASON_SYNC);
102 sync_filesystems(0); 102 sync_filesystems(0);
103 sync_filesystems(1); 103 sync_filesystems(1);
104 if (unlikely(laptop_mode)) 104 if (unlikely(laptop_mode))
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index ea9120a830d..7fdf6a7b743 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -43,20 +43,48 @@ static DEFINE_IDA(sysfs_ino_ida);
43static void sysfs_link_sibling(struct sysfs_dirent *sd) 43static void sysfs_link_sibling(struct sysfs_dirent *sd)
44{ 44{
45 struct sysfs_dirent *parent_sd = sd->s_parent; 45 struct sysfs_dirent *parent_sd = sd->s_parent;
46 struct sysfs_dirent **pos;
47 46
48 BUG_ON(sd->s_sibling); 47 struct rb_node **p;
49 48 struct rb_node *parent;
50 /* Store directory entries in order by ino. This allows 49
51 * readdir to properly restart without having to add a 50 if (sysfs_type(sd) == SYSFS_DIR)
52 * cursor into the s_dir.children list. 51 parent_sd->s_dir.subdirs++;
53 */ 52
54 for (pos = &parent_sd->s_dir.children; *pos; pos = &(*pos)->s_sibling) { 53 p = &parent_sd->s_dir.inode_tree.rb_node;
55 if (sd->s_ino < (*pos)->s_ino) 54 parent = NULL;
56 break; 55 while (*p) {
56 parent = *p;
57#define node rb_entry(parent, struct sysfs_dirent, inode_node)
58 if (sd->s_ino < node->s_ino) {
59 p = &node->inode_node.rb_left;
60 } else if (sd->s_ino > node->s_ino) {
61 p = &node->inode_node.rb_right;
62 } else {
63 printk(KERN_CRIT "sysfs: inserting duplicate inode '%lx'\n",
64 (unsigned long) sd->s_ino);
65 BUG();
66 }
67#undef node
57 } 68 }
58 sd->s_sibling = *pos; 69 rb_link_node(&sd->inode_node, parent, p);
59 *pos = sd; 70 rb_insert_color(&sd->inode_node, &parent_sd->s_dir.inode_tree);
71
72 p = &parent_sd->s_dir.name_tree.rb_node;
73 parent = NULL;
74 while (*p) {
75 int c;
76 parent = *p;
77#define node rb_entry(parent, struct sysfs_dirent, name_node)
78 c = strcmp(sd->s_name, node->s_name);
79 if (c < 0) {
80 p = &node->name_node.rb_left;
81 } else {
82 p = &node->name_node.rb_right;
83 }
84#undef node
85 }
86 rb_link_node(&sd->name_node, parent, p);
87 rb_insert_color(&sd->name_node, &parent_sd->s_dir.name_tree);
60} 88}
61 89
62/** 90/**
@@ -71,16 +99,11 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd)
71 */ 99 */
72static void sysfs_unlink_sibling(struct sysfs_dirent *sd) 100static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
73{ 101{
74 struct sysfs_dirent **pos; 102 if (sysfs_type(sd) == SYSFS_DIR)
103 sd->s_parent->s_dir.subdirs--;
75 104
76 for (pos = &sd->s_parent->s_dir.children; *pos; 105 rb_erase(&sd->inode_node, &sd->s_parent->s_dir.inode_tree);
77 pos = &(*pos)->s_sibling) { 106 rb_erase(&sd->name_node, &sd->s_parent->s_dir.name_tree);
78 if (*pos == sd) {
79 *pos = sd->s_sibling;
80 sd->s_sibling = NULL;
81 break;
82 }
83 }
84} 107}
85 108
86/** 109/**
@@ -126,7 +149,6 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
126 */ 149 */
127void sysfs_put_active(struct sysfs_dirent *sd) 150void sysfs_put_active(struct sysfs_dirent *sd)
128{ 151{
129 struct completion *cmpl;
130 int v; 152 int v;
131 153
132 if (unlikely(!sd)) 154 if (unlikely(!sd))
@@ -138,10 +160,9 @@ void sysfs_put_active(struct sysfs_dirent *sd)
138 return; 160 return;
139 161
140 /* atomic_dec_return() is a mb(), we'll always see the updated 162 /* atomic_dec_return() is a mb(), we'll always see the updated
141 * sd->s_sibling. 163 * sd->u.completion.
142 */ 164 */
143 cmpl = (void *)sd->s_sibling; 165 complete(sd->u.completion);
144 complete(cmpl);
145} 166}
146 167
147/** 168/**
@@ -155,16 +176,16 @@ static void sysfs_deactivate(struct sysfs_dirent *sd)
155 DECLARE_COMPLETION_ONSTACK(wait); 176 DECLARE_COMPLETION_ONSTACK(wait);
156 int v; 177 int v;
157 178
158 BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED)); 179 BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED));
159 180
160 if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF)) 181 if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
161 return; 182 return;
162 183
163 sd->s_sibling = (void *)&wait; 184 sd->u.completion = (void *)&wait;
164 185
165 rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); 186 rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
166 /* atomic_add_return() is a mb(), put_active() will always see 187 /* atomic_add_return() is a mb(), put_active() will always see
167 * the updated sd->s_sibling. 188 * the updated sd->u.completion.
168 */ 189 */
169 v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); 190 v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active);
170 191
@@ -173,8 +194,6 @@ static void sysfs_deactivate(struct sysfs_dirent *sd)
173 wait_for_completion(&wait); 194 wait_for_completion(&wait);
174 } 195 }
175 196
176 sd->s_sibling = NULL;
177
178 lock_acquired(&sd->dep_map, _RET_IP_); 197 lock_acquired(&sd->dep_map, _RET_IP_);
179 rwsem_release(&sd->dep_map, 1, _RET_IP_); 198 rwsem_release(&sd->dep_map, 1, _RET_IP_);
180} 199}
@@ -384,6 +403,13 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
384{ 403{
385 struct sysfs_inode_attrs *ps_iattr; 404 struct sysfs_inode_attrs *ps_iattr;
386 405
406 if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) {
407 WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
408 sysfs_ns_type(acxt->parent_sd)? "required": "invalid",
409 acxt->parent_sd->s_name, sd->s_name);
410 return -EINVAL;
411 }
412
387 if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name)) 413 if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name))
388 return -EEXIST; 414 return -EEXIST;
389 415
@@ -490,7 +516,7 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
490 } 516 }
491 517
492 sd->s_flags |= SYSFS_FLAG_REMOVED; 518 sd->s_flags |= SYSFS_FLAG_REMOVED;
493 sd->s_sibling = acxt->removed; 519 sd->u.removed_list = acxt->removed;
494 acxt->removed = sd; 520 acxt->removed = sd;
495} 521}
496 522
@@ -514,8 +540,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
514 while (acxt->removed) { 540 while (acxt->removed) {
515 struct sysfs_dirent *sd = acxt->removed; 541 struct sysfs_dirent *sd = acxt->removed;
516 542
517 acxt->removed = sd->s_sibling; 543 acxt->removed = sd->u.removed_list;
518 sd->s_sibling = NULL;
519 544
520 sysfs_deactivate(sd); 545 sysfs_deactivate(sd);
521 unmap_bin_file(sd); 546 unmap_bin_file(sd);
@@ -540,15 +565,43 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
540 const void *ns, 565 const void *ns,
541 const unsigned char *name) 566 const unsigned char *name)
542{ 567{
543 struct sysfs_dirent *sd; 568 struct rb_node *p = parent_sd->s_dir.name_tree.rb_node;
569 struct sysfs_dirent *found = NULL;
544 570
545 for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) { 571 if (!!sysfs_ns_type(parent_sd) != !!ns) {
546 if (ns && sd->s_ns && (sd->s_ns != ns)) 572 WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
547 continue; 573 sysfs_ns_type(parent_sd)? "required": "invalid",
548 if (!strcmp(sd->s_name, name)) 574 parent_sd->s_name, name);
549 return sd; 575 return NULL;
550 } 576 }
551 return NULL; 577
578 while (p) {
579 int c;
580#define node rb_entry(p, struct sysfs_dirent, name_node)
581 c = strcmp(name, node->s_name);
582 if (c < 0) {
583 p = node->name_node.rb_left;
584 } else if (c > 0) {
585 p = node->name_node.rb_right;
586 } else {
587 found = node;
588 p = node->name_node.rb_left;
589 }
590#undef node
591 }
592
593 if (found) {
594 while (found->s_ns != ns) {
595 p = rb_next(&found->name_node);
596 if (!p)
597 return NULL;
598 found = rb_entry(p, struct sysfs_dirent, name_node);
599 if (strcmp(name, found->s_name))
600 return NULL;
601 }
602 }
603
604 return found;
552} 605}
553 606
554/** 607/**
@@ -744,21 +797,19 @@ void sysfs_remove_subdir(struct sysfs_dirent *sd)
744static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) 797static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
745{ 798{
746 struct sysfs_addrm_cxt acxt; 799 struct sysfs_addrm_cxt acxt;
747 struct sysfs_dirent **pos; 800 struct rb_node *pos;
748 801
749 if (!dir_sd) 802 if (!dir_sd)
750 return; 803 return;
751 804
752 pr_debug("sysfs %s: removing dir\n", dir_sd->s_name); 805 pr_debug("sysfs %s: removing dir\n", dir_sd->s_name);
753 sysfs_addrm_start(&acxt, dir_sd); 806 sysfs_addrm_start(&acxt, dir_sd);
754 pos = &dir_sd->s_dir.children; 807 pos = rb_first(&dir_sd->s_dir.inode_tree);
755 while (*pos) { 808 while (pos) {
756 struct sysfs_dirent *sd = *pos; 809 struct sysfs_dirent *sd = rb_entry(pos, struct sysfs_dirent, inode_node);
757 810 pos = rb_next(pos);
758 if (sysfs_type(sd) != SYSFS_DIR) 811 if (sysfs_type(sd) != SYSFS_DIR)
759 sysfs_remove_one(&acxt, sd); 812 sysfs_remove_one(&acxt, sd);
760 else
761 pos = &(*pos)->s_sibling;
762 } 813 }
763 sysfs_addrm_finish(&acxt); 814 sysfs_addrm_finish(&acxt);
764 815
@@ -814,15 +865,13 @@ int sysfs_rename(struct sysfs_dirent *sd,
814 sd->s_name = new_name; 865 sd->s_name = new_name;
815 } 866 }
816 867
817 /* Remove from old parent's list and insert into new parent's list. */ 868 /* Move to the appropriate place in the appropriate directories rbtree. */
818 if (sd->s_parent != new_parent_sd) { 869 sysfs_unlink_sibling(sd);
819 sysfs_unlink_sibling(sd); 870 sysfs_get(new_parent_sd);
820 sysfs_get(new_parent_sd); 871 sysfs_put(sd->s_parent);
821 sysfs_put(sd->s_parent);
822 sd->s_parent = new_parent_sd;
823 sysfs_link_sibling(sd);
824 }
825 sd->s_ns = new_ns; 872 sd->s_ns = new_ns;
873 sd->s_parent = new_parent_sd;
874 sysfs_link_sibling(sd);
826 875
827 error = 0; 876 error = 0;
828 out: 877 out:
@@ -881,12 +930,28 @@ static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
881 pos = NULL; 930 pos = NULL;
882 } 931 }
883 if (!pos && (ino > 1) && (ino < INT_MAX)) { 932 if (!pos && (ino > 1) && (ino < INT_MAX)) {
884 pos = parent_sd->s_dir.children; 933 struct rb_node *p = parent_sd->s_dir.inode_tree.rb_node;
885 while (pos && (ino > pos->s_ino)) 934 while (p) {
886 pos = pos->s_sibling; 935#define node rb_entry(p, struct sysfs_dirent, inode_node)
936 if (ino < node->s_ino) {
937 pos = node;
938 p = node->inode_node.rb_left;
939 } else if (ino > node->s_ino) {
940 p = node->inode_node.rb_right;
941 } else {
942 pos = node;
943 break;
944 }
945#undef node
946 }
947 }
948 while (pos && pos->s_ns != ns) {
949 struct rb_node *p = rb_next(&pos->inode_node);
950 if (!p)
951 pos = NULL;
952 else
953 pos = rb_entry(p, struct sysfs_dirent, inode_node);
887 } 954 }
888 while (pos && pos->s_ns && pos->s_ns != ns)
889 pos = pos->s_sibling;
890 return pos; 955 return pos;
891} 956}
892 957
@@ -894,10 +959,13 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
894 struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) 959 struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
895{ 960{
896 pos = sysfs_dir_pos(ns, parent_sd, ino, pos); 961 pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
897 if (pos) 962 if (pos) do {
898 pos = pos->s_sibling; 963 struct rb_node *p = rb_next(&pos->inode_node);
899 while (pos && pos->s_ns && pos->s_ns != ns) 964 if (!p)
900 pos = pos->s_sibling; 965 pos = NULL;
966 else
967 pos = rb_entry(p, struct sysfs_dirent, inode_node);
968 } while (pos && pos->s_ns != ns);
901 return pos; 969 return pos;
902} 970}
903 971
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1ad8c93c1b8..d4e6080b4b2 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -466,9 +466,6 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
466 mutex_lock(&sysfs_mutex); 466 mutex_lock(&sysfs_mutex);
467 467
468 if (sd && dir) 468 if (sd && dir)
469 /* Only directories are tagged, so no need to pass
470 * a tag explicitly.
471 */
472 sd = sysfs_find_dirent(sd, NULL, dir); 469 sd = sysfs_find_dirent(sd, NULL, dir);
473 if (sd && attr) 470 if (sd && attr)
474 sd = sysfs_find_dirent(sd, NULL, attr); 471 sd = sysfs_find_dirent(sd, NULL, attr);
@@ -488,17 +485,56 @@ const struct file_operations sysfs_file_operations = {
488 .poll = sysfs_poll, 485 .poll = sysfs_poll,
489}; 486};
490 487
488int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
489 const void **pns)
490{
491 struct sysfs_dirent *dir_sd = kobj->sd;
492 const struct sysfs_ops *ops;
493 const void *ns = NULL;
494 int err;
495
496 err = 0;
497 if (!sysfs_ns_type(dir_sd))
498 goto out;
499
500 err = -EINVAL;
501 if (!kobj->ktype)
502 goto out;
503 ops = kobj->ktype->sysfs_ops;
504 if (!ops)
505 goto out;
506 if (!ops->namespace)
507 goto out;
508
509 err = 0;
510 ns = ops->namespace(kobj, attr);
511out:
512 if (err) {
513 WARN(1, KERN_ERR "missing sysfs namespace attribute operation for "
514 "kobject: %s\n", kobject_name(kobj));
515 }
516 *pns = ns;
517 return err;
518}
519
491int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, 520int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
492 const struct attribute *attr, int type, mode_t amode) 521 const struct attribute *attr, int type, mode_t amode)
493{ 522{
494 umode_t mode = (amode & S_IALLUGO) | S_IFREG; 523 umode_t mode = (amode & S_IALLUGO) | S_IFREG;
495 struct sysfs_addrm_cxt acxt; 524 struct sysfs_addrm_cxt acxt;
496 struct sysfs_dirent *sd; 525 struct sysfs_dirent *sd;
526 const void *ns;
497 int rc; 527 int rc;
498 528
529 rc = sysfs_attr_ns(dir_sd->s_dir.kobj, attr, &ns);
530 if (rc)
531 return rc;
532
499 sd = sysfs_new_dirent(attr->name, mode, type); 533 sd = sysfs_new_dirent(attr->name, mode, type);
500 if (!sd) 534 if (!sd)
501 return -ENOMEM; 535 return -ENOMEM;
536
537 sd->s_ns = ns;
502 sd->s_attr.attr = (void *)attr; 538 sd->s_attr.attr = (void *)attr;
503 sysfs_dirent_init_lockdep(sd); 539 sysfs_dirent_init_lockdep(sd);
504 540
@@ -586,12 +622,17 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
586{ 622{
587 struct sysfs_dirent *sd; 623 struct sysfs_dirent *sd;
588 struct iattr newattrs; 624 struct iattr newattrs;
625 const void *ns;
589 int rc; 626 int rc;
590 627
628 rc = sysfs_attr_ns(kobj, attr, &ns);
629 if (rc)
630 return rc;
631
591 mutex_lock(&sysfs_mutex); 632 mutex_lock(&sysfs_mutex);
592 633
593 rc = -ENOENT; 634 rc = -ENOENT;
594 sd = sysfs_find_dirent(kobj->sd, NULL, attr->name); 635 sd = sysfs_find_dirent(kobj->sd, ns, attr->name);
595 if (!sd) 636 if (!sd)
596 goto out; 637 goto out;
597 638
@@ -616,7 +657,12 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
616 657
617void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) 658void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
618{ 659{
619 sysfs_hash_and_remove(kobj->sd, NULL, attr->name); 660 const void *ns;
661
662 if (sysfs_attr_ns(kobj, attr, &ns))
663 return;
664
665 sysfs_hash_and_remove(kobj->sd, ns, attr->name);
620} 666}
621 667
622void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) 668void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index e3f091a81c7..c81b22f3ace 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -202,18 +202,6 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
202 inode->i_ctime = iattr->ia_ctime; 202 inode->i_ctime = iattr->ia_ctime;
203} 203}
204 204
205static int sysfs_count_nlink(struct sysfs_dirent *sd)
206{
207 struct sysfs_dirent *child;
208 int nr = 0;
209
210 for (child = sd->s_dir.children; child; child = child->s_sibling)
211 if (sysfs_type(child) == SYSFS_DIR)
212 nr++;
213
214 return nr + 2;
215}
216
217static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) 205static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
218{ 206{
219 struct sysfs_inode_attrs *iattrs = sd->s_iattr; 207 struct sysfs_inode_attrs *iattrs = sd->s_iattr;
@@ -230,7 +218,7 @@ static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
230 } 218 }
231 219
232 if (sysfs_type(sd) == SYSFS_DIR) 220 if (sysfs_type(sd) == SYSFS_DIR)
233 inode->i_nlink = sysfs_count_nlink(sd); 221 set_nlink(inode, sd->s_dir.subdirs + 2);
234} 222}
235 223
236int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) 224int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
@@ -336,8 +324,6 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha
336 sysfs_addrm_start(&acxt, dir_sd); 324 sysfs_addrm_start(&acxt, dir_sd);
337 325
338 sd = sysfs_find_dirent(dir_sd, ns, name); 326 sd = sysfs_find_dirent(dir_sd, ns, name);
339 if (sd && (sd->s_ns != ns))
340 sd = NULL;
341 if (sd) 327 if (sd)
342 sysfs_remove_one(&acxt, sd); 328 sysfs_remove_one(&acxt, sd);
343 329
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 845ab3ad229..ce29e28b766 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -11,14 +11,18 @@
11#include <linux/lockdep.h> 11#include <linux/lockdep.h>
12#include <linux/kobject_ns.h> 12#include <linux/kobject_ns.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/rbtree.h>
14 15
15struct sysfs_open_dirent; 16struct sysfs_open_dirent;
16 17
17/* type-specific structures for sysfs_dirent->s_* union members */ 18/* type-specific structures for sysfs_dirent->s_* union members */
18struct sysfs_elem_dir { 19struct sysfs_elem_dir {
19 struct kobject *kobj; 20 struct kobject *kobj;
20 /* children list starts here and goes through sd->s_sibling */ 21
21 struct sysfs_dirent *children; 22 unsigned long subdirs;
23
24 struct rb_root inode_tree;
25 struct rb_root name_tree;
22}; 26};
23 27
24struct sysfs_elem_symlink { 28struct sysfs_elem_symlink {
@@ -56,9 +60,16 @@ struct sysfs_dirent {
56 struct lockdep_map dep_map; 60 struct lockdep_map dep_map;
57#endif 61#endif
58 struct sysfs_dirent *s_parent; 62 struct sysfs_dirent *s_parent;
59 struct sysfs_dirent *s_sibling;
60 const char *s_name; 63 const char *s_name;
61 64
65 struct rb_node inode_node;
66 struct rb_node name_node;
67
68 union {
69 struct completion *completion;
70 struct sysfs_dirent *removed_list;
71 } u;
72
62 const void *s_ns; /* namespace tag */ 73 const void *s_ns; /* namespace tag */
63 union { 74 union {
64 struct sysfs_elem_dir s_dir; 75 struct sysfs_elem_dir s_dir;
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 0630eb969a2..25ffb3e9a3f 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -219,7 +219,7 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino)
219 inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode); 219 inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode);
220 inode->i_uid = (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid); 220 inode->i_uid = (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid);
221 inode->i_gid = (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid); 221 inode->i_gid = (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid);
222 inode->i_nlink = fs16_to_cpu(sbi, raw_inode->i_nlink); 222 set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink));
223 inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size); 223 inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size);
224 inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime); 224 inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime);
225 inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime); 225 inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime);
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 315de66e52b..bc4f94b2870 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -63,7 +63,7 @@
63static void shrink_liability(struct ubifs_info *c, int nr_to_write) 63static void shrink_liability(struct ubifs_info *c, int nr_to_write)
64{ 64{
65 down_read(&c->vfs_sb->s_umount); 65 down_read(&c->vfs_sb->s_umount);
66 writeback_inodes_sb(c->vfs_sb); 66 writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE);
67 up_read(&c->vfs_sb->s_umount); 67 up_read(&c->vfs_sb->s_umount);
68} 68}
69 69
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index eef109a1a92..b09ba2dd8b6 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -870,6 +870,22 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
870 spin_unlock(&dbg_lock); 870 spin_unlock(&dbg_lock);
871} 871}
872 872
873void dbg_dump_sleb(const struct ubifs_info *c,
874 const struct ubifs_scan_leb *sleb, int offs)
875{
876 struct ubifs_scan_node *snod;
877
878 printk(KERN_DEBUG "(pid %d) start dumping scanned data from LEB %d:%d\n",
879 current->pid, sleb->lnum, offs);
880
881 list_for_each_entry(snod, &sleb->nodes, list) {
882 cond_resched();
883 printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", sleb->lnum,
884 snod->offs, snod->len);
885 dbg_dump_node(c, snod->node);
886 }
887}
888
873void dbg_dump_leb(const struct ubifs_info *c, int lnum) 889void dbg_dump_leb(const struct ubifs_info *c, int lnum)
874{ 890{
875 struct ubifs_scan_leb *sleb; 891 struct ubifs_scan_leb *sleb;
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index feb361e252a..8d9c4681018 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -269,6 +269,8 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
269void dbg_dump_lprops(struct ubifs_info *c); 269void dbg_dump_lprops(struct ubifs_info *c);
270void dbg_dump_lpt_info(struct ubifs_info *c); 270void dbg_dump_lpt_info(struct ubifs_info *c);
271void dbg_dump_leb(const struct ubifs_info *c, int lnum); 271void dbg_dump_leb(const struct ubifs_info *c, int lnum);
272void dbg_dump_sleb(const struct ubifs_info *c,
273 const struct ubifs_scan_leb *sleb, int offs);
272void dbg_dump_znode(const struct ubifs_info *c, 274void dbg_dump_znode(const struct ubifs_info *c,
273 const struct ubifs_znode *znode); 275 const struct ubifs_znode *znode);
274void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); 276void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat);
@@ -387,6 +389,9 @@ static inline void dbg_dump_lpt_info(struct ubifs_info *c) { return; }
387static inline void dbg_dump_leb(const struct ubifs_info *c, 389static inline void dbg_dump_leb(const struct ubifs_info *c,
388 int lnum) { return; } 390 int lnum) { return; }
389static inline void 391static inline void
392dbg_dump_sleb(const struct ubifs_info *c,
393 const struct ubifs_scan_leb *sleb, int offs) { return; }
394static inline void
390dbg_dump_znode(const struct ubifs_info *c, 395dbg_dump_znode(const struct ubifs_info *c,
391 const struct ubifs_znode *znode) { return; } 396 const struct ubifs_znode *znode) { return; }
392static inline void dbg_dump_heap(struct ubifs_info *c, 397static inline void dbg_dump_heap(struct ubifs_info *c,
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index af02790d932..ee4f43f4bb9 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -983,7 +983,7 @@ int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf)
983} 983}
984 984
985/** 985/**
986 * clean_an_unclean_leb - read and write a LEB to remove corruption. 986 * clean_an_unclean_leb - read and write a LEB to remove corruption.
987 * @c: UBIFS file-system description object 987 * @c: UBIFS file-system description object
988 * @ucleb: unclean LEB information 988 * @ucleb: unclean LEB information
989 * @sbuf: LEB-sized buffer to use 989 * @sbuf: LEB-sized buffer to use
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 93d938ad3d2..6094c5a5d7a 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -247,7 +247,7 @@ static int create_default_filesystem(struct ubifs_info *c)
247 mst->total_dirty = cpu_to_le64(tmp64); 247 mst->total_dirty = cpu_to_le64(tmp64);
248 248
249 /* The indexing LEB does not contribute to dark space */ 249 /* The indexing LEB does not contribute to dark space */
250 tmp64 = (c->main_lebs - 1) * c->dark_wm; 250 tmp64 = ((long long)(c->main_lebs - 1) * c->dark_wm);
251 mst->total_dark = cpu_to_le64(tmp64); 251 mst->total_dark = cpu_to_le64(tmp64);
252 252
253 mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); 253 mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index b28121278d4..20403dc5d43 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -129,7 +129,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
129 goto out_ino; 129 goto out_ino;
130 130
131 inode->i_flags |= (S_NOCMTIME | S_NOATIME); 131 inode->i_flags |= (S_NOCMTIME | S_NOATIME);
132 inode->i_nlink = le32_to_cpu(ino->nlink); 132 set_nlink(inode, le32_to_cpu(ino->nlink));
133 inode->i_uid = le32_to_cpu(ino->uid); 133 inode->i_uid = le32_to_cpu(ino->uid);
134 inode->i_gid = le32_to_cpu(ino->gid); 134 inode->i_gid = le32_to_cpu(ino->gid);
135 inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec); 135 inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 16f19f55e63..bf18f7a0454 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -558,10 +558,10 @@ int ubifs_removexattr(struct dentry *dentry, const char *name)
558 } 558 }
559 559
560 ubifs_assert(inode->i_nlink == 1); 560 ubifs_assert(inode->i_nlink == 1);
561 inode->i_nlink = 0; 561 clear_nlink(inode);
562 err = remove_xattr(c, host, inode, &nm); 562 err = remove_xattr(c, host, inode, &nm);
563 if (err) 563 if (err)
564 inode->i_nlink = 1; 564 set_nlink(inode, 1);
565 565
566 /* If @i_nlink is 0, 'iput()' will delete the inode */ 566 /* If @i_nlink is 0, 'iput()' will delete the inode */
567 iput(inode); 567 iput(inode);
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 95518a9f589..987585bb0a1 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -59,8 +59,8 @@ static int __load_block_bitmap(struct super_block *sb,
59 int nr_groups = bitmap->s_nr_groups; 59 int nr_groups = bitmap->s_nr_groups;
60 60
61 if (block_group >= nr_groups) { 61 if (block_group >= nr_groups) {
62 udf_debug("block_group (%d) > nr_groups (%d)\n", block_group, 62 udf_debug("block_group (%d) > nr_groups (%d)\n",
63 nr_groups); 63 block_group, nr_groups);
64 } 64 }
65 65
66 if (bitmap->s_block_bitmap[block_group]) { 66 if (bitmap->s_block_bitmap[block_group]) {
@@ -126,8 +126,9 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
126 if (bloc->logicalBlockNum + count < count || 126 if (bloc->logicalBlockNum + count < count ||
127 (bloc->logicalBlockNum + count) > partmap->s_partition_len) { 127 (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
128 udf_debug("%d < %d || %d + %d > %d\n", 128 udf_debug("%d < %d || %d + %d > %d\n",
129 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, 129 bloc->logicalBlockNum, 0,
130 count, partmap->s_partition_len); 130 bloc->logicalBlockNum, count,
131 partmap->s_partition_len);
131 goto error_return; 132 goto error_return;
132 } 133 }
133 134
@@ -155,7 +156,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
155 if (udf_set_bit(bit + i, bh->b_data)) { 156 if (udf_set_bit(bit + i, bh->b_data)) {
156 udf_debug("bit %ld already set\n", bit + i); 157 udf_debug("bit %ld already set\n", bit + i);
157 udf_debug("byte=%2x\n", 158 udf_debug("byte=%2x\n",
158 ((char *)bh->b_data)[(bit + i) >> 3]); 159 ((char *)bh->b_data)[(bit + i) >> 3]);
159 } 160 }
160 } 161 }
161 udf_add_free_space(sb, sbi->s_partition, count); 162 udf_add_free_space(sb, sbi->s_partition, count);
@@ -369,7 +370,8 @@ static void udf_table_free_blocks(struct super_block *sb,
369 if (bloc->logicalBlockNum + count < count || 370 if (bloc->logicalBlockNum + count < count ||
370 (bloc->logicalBlockNum + count) > partmap->s_partition_len) { 371 (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
371 udf_debug("%d < %d || %d + %d > %d\n", 372 udf_debug("%d < %d || %d + %d > %d\n",
372 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count, 373 bloc->logicalBlockNum, 0,
374 bloc->logicalBlockNum, count,
373 partmap->s_partition_len); 375 partmap->s_partition_len);
374 goto error_return; 376 goto error_return;
375 } 377 }
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 2ffdb6733af..3e44f575fb9 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -162,8 +162,8 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
162 int padlen; 162 int padlen;
163 163
164 if ((!buffer) || (!offset)) { 164 if ((!buffer) || (!offset)) {
165 udf_debug("invalidparms\n, buffer=%p, offset=%p\n", buffer, 165 udf_debug("invalidparms, buffer=%p, offset=%p\n",
166 offset); 166 buffer, offset);
167 return NULL; 167 return NULL;
168 } 168 }
169 169
@@ -201,7 +201,7 @@ struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offs
201 struct short_ad *sa; 201 struct short_ad *sa;
202 202
203 if ((!ptr) || (!offset)) { 203 if ((!ptr) || (!offset)) {
204 printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n"); 204 pr_err("%s: invalidparms\n", __func__);
205 return NULL; 205 return NULL;
206 } 206 }
207 207
@@ -223,7 +223,7 @@ struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset
223 struct long_ad *la; 223 struct long_ad *la;
224 224
225 if ((!ptr) || (!offset)) { 225 if ((!ptr) || (!offset)) {
226 printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n"); 226 pr_err("%s: invalidparms\n", __func__);
227 return NULL; 227 return NULL;
228 } 228 }
229 229
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 1d1358ed80c..4fd1d809738 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -37,6 +37,7 @@
37#include <linux/writeback.h> 37#include <linux/writeback.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/crc-itu-t.h> 39#include <linux/crc-itu-t.h>
40#include <linux/mpage.h>
40 41
41#include "udf_i.h" 42#include "udf_i.h"
42#include "udf_sb.h" 43#include "udf_sb.h"
@@ -83,12 +84,10 @@ void udf_evict_inode(struct inode *inode)
83 end_writeback(inode); 84 end_writeback(inode);
84 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && 85 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
85 inode->i_size != iinfo->i_lenExtents) { 86 inode->i_size != iinfo->i_lenExtents) {
86 printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has " 87 udf_warn(inode->i_sb, "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n",
87 "inode size %llu different from extent length %llu. " 88 inode->i_ino, inode->i_mode,
88 "Filesystem need not be standards compliant.\n", 89 (unsigned long long)inode->i_size,
89 inode->i_sb->s_id, inode->i_ino, inode->i_mode, 90 (unsigned long long)iinfo->i_lenExtents);
90 (unsigned long long)inode->i_size,
91 (unsigned long long)iinfo->i_lenExtents);
92 } 91 }
93 kfree(iinfo->i_ext.i_data); 92 kfree(iinfo->i_ext.i_data);
94 iinfo->i_ext.i_data = NULL; 93 iinfo->i_ext.i_data = NULL;
@@ -104,7 +103,13 @@ static int udf_writepage(struct page *page, struct writeback_control *wbc)
104 103
105static int udf_readpage(struct file *file, struct page *page) 104static int udf_readpage(struct file *file, struct page *page)
106{ 105{
107 return block_read_full_page(page, udf_get_block); 106 return mpage_readpage(page, udf_get_block);
107}
108
109static int udf_readpages(struct file *file, struct address_space *mapping,
110 struct list_head *pages, unsigned nr_pages)
111{
112 return mpage_readpages(mapping, pages, nr_pages, udf_get_block);
108} 113}
109 114
110static int udf_write_begin(struct file *file, struct address_space *mapping, 115static int udf_write_begin(struct file *file, struct address_space *mapping,
@@ -139,6 +144,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
139 144
140const struct address_space_operations udf_aops = { 145const struct address_space_operations udf_aops = {
141 .readpage = udf_readpage, 146 .readpage = udf_readpage,
147 .readpages = udf_readpages,
142 .writepage = udf_writepage, 148 .writepage = udf_writepage,
143 .write_begin = udf_write_begin, 149 .write_begin = udf_write_begin,
144 .write_end = generic_write_end, 150 .write_end = generic_write_end,
@@ -1169,16 +1175,15 @@ static void __udf_read_inode(struct inode *inode)
1169 */ 1175 */
1170 bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident); 1176 bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident);
1171 if (!bh) { 1177 if (!bh) {
1172 printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n", 1178 udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino);
1173 inode->i_ino);
1174 make_bad_inode(inode); 1179 make_bad_inode(inode);
1175 return; 1180 return;
1176 } 1181 }
1177 1182
1178 if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE && 1183 if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE &&
1179 ident != TAG_IDENT_USE) { 1184 ident != TAG_IDENT_USE) {
1180 printk(KERN_ERR "udf: udf_read_inode(ino %ld) " 1185 udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n",
1181 "failed ident=%d\n", inode->i_ino, ident); 1186 inode->i_ino, ident);
1182 brelse(bh); 1187 brelse(bh);
1183 make_bad_inode(inode); 1188 make_bad_inode(inode);
1184 return; 1189 return;
@@ -1218,8 +1223,8 @@ static void __udf_read_inode(struct inode *inode)
1218 } 1223 }
1219 brelse(ibh); 1224 brelse(ibh);
1220 } else if (fe->icbTag.strategyType != cpu_to_le16(4)) { 1225 } else if (fe->icbTag.strategyType != cpu_to_le16(4)) {
1221 printk(KERN_ERR "udf: unsupported strategy type: %d\n", 1226 udf_err(inode->i_sb, "unsupported strategy type: %d\n",
1222 le16_to_cpu(fe->icbTag.strategyType)); 1227 le16_to_cpu(fe->icbTag.strategyType));
1223 brelse(bh); 1228 brelse(bh);
1224 make_bad_inode(inode); 1229 make_bad_inode(inode);
1225 return; 1230 return;
@@ -1236,6 +1241,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1236 int offset; 1241 int offset;
1237 struct udf_sb_info *sbi = UDF_SB(inode->i_sb); 1242 struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
1238 struct udf_inode_info *iinfo = UDF_I(inode); 1243 struct udf_inode_info *iinfo = UDF_I(inode);
1244 unsigned int link_count;
1239 1245
1240 fe = (struct fileEntry *)bh->b_data; 1246 fe = (struct fileEntry *)bh->b_data;
1241 efe = (struct extendedFileEntry *)bh->b_data; 1247 efe = (struct extendedFileEntry *)bh->b_data;
@@ -1318,9 +1324,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1318 inode->i_mode &= ~sbi->s_umask; 1324 inode->i_mode &= ~sbi->s_umask;
1319 read_unlock(&sbi->s_cred_lock); 1325 read_unlock(&sbi->s_cred_lock);
1320 1326
1321 inode->i_nlink = le16_to_cpu(fe->fileLinkCount); 1327 link_count = le16_to_cpu(fe->fileLinkCount);
1322 if (!inode->i_nlink) 1328 if (!link_count)
1323 inode->i_nlink = 1; 1329 link_count = 1;
1330 set_nlink(inode, link_count);
1324 1331
1325 inode->i_size = le64_to_cpu(fe->informationLength); 1332 inode->i_size = le64_to_cpu(fe->informationLength);
1326 iinfo->i_lenExtents = inode->i_size; 1333 iinfo->i_lenExtents = inode->i_size;
@@ -1413,9 +1420,8 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1413 udf_debug("METADATA BITMAP FILE-----\n"); 1420 udf_debug("METADATA BITMAP FILE-----\n");
1414 break; 1421 break;
1415 default: 1422 default:
1416 printk(KERN_ERR "udf: udf_fill_inode(ino %ld) failed unknown " 1423 udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n",
1417 "file type=%d\n", inode->i_ino, 1424 inode->i_ino, fe->icbTag.fileType);
1418 fe->icbTag.fileType);
1419 make_bad_inode(inode); 1425 make_bad_inode(inode);
1420 return; 1426 return;
1421 } 1427 }
@@ -1438,8 +1444,8 @@ static int udf_alloc_i_data(struct inode *inode, size_t size)
1438 iinfo->i_ext.i_data = kmalloc(size, GFP_KERNEL); 1444 iinfo->i_ext.i_data = kmalloc(size, GFP_KERNEL);
1439 1445
1440 if (!iinfo->i_ext.i_data) { 1446 if (!iinfo->i_ext.i_data) {
1441 printk(KERN_ERR "udf:udf_alloc_i_data (ino %ld) " 1447 udf_err(inode->i_sb, "(ino %ld) no free memory\n",
1442 "no free memory\n", inode->i_ino); 1448 inode->i_ino);
1443 return -ENOMEM; 1449 return -ENOMEM;
1444 } 1450 }
1445 1451
@@ -1689,9 +1695,8 @@ out:
1689 if (do_sync) { 1695 if (do_sync) {
1690 sync_dirty_buffer(bh); 1696 sync_dirty_buffer(bh);
1691 if (buffer_write_io_error(bh)) { 1697 if (buffer_write_io_error(bh)) {
1692 printk(KERN_WARNING "IO error syncing udf inode " 1698 udf_warn(inode->i_sb, "IO error syncing udf inode [%08lx]\n",
1693 "[%s:%08lx]\n", inode->i_sb->s_id, 1699 inode->i_ino);
1694 inode->i_ino);
1695 err = -EIO; 1700 err = -EIO;
1696 } 1701 }
1697 } 1702 }
@@ -1982,8 +1987,7 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
1982 *elen = le32_to_cpu(lad->extLength) & UDF_EXTENT_LENGTH_MASK; 1987 *elen = le32_to_cpu(lad->extLength) & UDF_EXTENT_LENGTH_MASK;
1983 break; 1988 break;
1984 default: 1989 default:
1985 udf_debug("alloc_type = %d unsupported\n", 1990 udf_debug("alloc_type = %d unsupported\n", iinfo->i_alloc_type);
1986 iinfo->i_alloc_type);
1987 return -1; 1991 return -1;
1988 } 1992 }
1989 1993
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 43e24a3b8e1..6583fe9b064 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -38,7 +38,7 @@ unsigned int udf_get_last_session(struct super_block *sb)
38 38
39 if (i == 0) { 39 if (i == 0) {
40 udf_debug("XA disk: %s, vol_desc_start=%d\n", 40 udf_debug("XA disk: %s, vol_desc_start=%d\n",
41 (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba); 41 ms_info.xa_flag ? "yes" : "no", ms_info.addr.lba);
42 if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ 42 if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */
43 vol_desc_start = ms_info.addr.lba; 43 vol_desc_start = ms_info.addr.lba;
44 } else { 44 } else {
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index 9215700c00a..c175b4dabc1 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -204,6 +204,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
204{ 204{
205 struct tag *tag_p; 205 struct tag *tag_p;
206 struct buffer_head *bh = NULL; 206 struct buffer_head *bh = NULL;
207 u8 checksum;
207 208
208 /* Read the block */ 209 /* Read the block */
209 if (block == 0xFFFFFFFF) 210 if (block == 0xFFFFFFFF)
@@ -211,8 +212,8 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
211 212
212 bh = udf_tread(sb, block); 213 bh = udf_tread(sb, block);
213 if (!bh) { 214 if (!bh) {
214 udf_debug("block=%d, location=%d: read failed\n", 215 udf_err(sb, "read failed, block=%u, location=%d\n",
215 block, location); 216 block, location);
216 return NULL; 217 return NULL;
217 } 218 }
218 219
@@ -227,16 +228,18 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
227 } 228 }
228 229
229 /* Verify the tag checksum */ 230 /* Verify the tag checksum */
230 if (udf_tag_checksum(tag_p) != tag_p->tagChecksum) { 231 checksum = udf_tag_checksum(tag_p);
231 printk(KERN_ERR "udf: tag checksum failed block %d\n", block); 232 if (checksum != tag_p->tagChecksum) {
233 udf_err(sb, "tag checksum failed, block %u: 0x%02x != 0x%02x\n",
234 block, checksum, tag_p->tagChecksum);
232 goto error_out; 235 goto error_out;
233 } 236 }
234 237
235 /* Verify the tag version */ 238 /* Verify the tag version */
236 if (tag_p->descVersion != cpu_to_le16(0x0002U) && 239 if (tag_p->descVersion != cpu_to_le16(0x0002U) &&
237 tag_p->descVersion != cpu_to_le16(0x0003U)) { 240 tag_p->descVersion != cpu_to_le16(0x0003U)) {
238 udf_debug("tag version 0x%04x != 0x0002 || 0x0003 block %d\n", 241 udf_err(sb, "tag version 0x%04x != 0x0002 || 0x0003, block %u\n",
239 le16_to_cpu(tag_p->descVersion), block); 242 le16_to_cpu(tag_p->descVersion), block);
240 goto error_out; 243 goto error_out;
241 } 244 }
242 245
@@ -248,8 +251,8 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
248 return bh; 251 return bh;
249 252
250 udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", block, 253 udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", block,
251 le16_to_cpu(tag_p->descCRC), le16_to_cpu(tag_p->descCRCLength)); 254 le16_to_cpu(tag_p->descCRC),
252 255 le16_to_cpu(tag_p->descCRCLength));
253error_out: 256error_out:
254 brelse(bh); 257 brelse(bh);
255 return NULL; 258 return NULL;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index f1dce848ef9..4639e137222 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -577,8 +577,7 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
577 577
578 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 578 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
579 if (!fi) { 579 if (!fi) {
580 inode->i_nlink--; 580 inode_dec_link_count(inode);
581 mark_inode_dirty(inode);
582 iput(inode); 581 iput(inode);
583 return err; 582 return err;
584 } 583 }
@@ -618,8 +617,7 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
618 init_special_inode(inode, mode, rdev); 617 init_special_inode(inode, mode, rdev);
619 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 618 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
620 if (!fi) { 619 if (!fi) {
621 inode->i_nlink--; 620 inode_dec_link_count(inode);
622 mark_inode_dirty(inode);
623 iput(inode); 621 iput(inode);
624 return err; 622 return err;
625 } 623 }
@@ -665,12 +663,11 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
665 inode->i_fop = &udf_dir_operations; 663 inode->i_fop = &udf_dir_operations;
666 fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err); 664 fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err);
667 if (!fi) { 665 if (!fi) {
668 inode->i_nlink--; 666 inode_dec_link_count(inode);
669 mark_inode_dirty(inode);
670 iput(inode); 667 iput(inode);
671 goto out; 668 goto out;
672 } 669 }
673 inode->i_nlink = 2; 670 set_nlink(inode, 2);
674 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 671 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
675 cfi.icb.extLocation = cpu_to_lelb(dinfo->i_location); 672 cfi.icb.extLocation = cpu_to_lelb(dinfo->i_location);
676 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = 673 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
@@ -683,7 +680,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
683 680
684 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 681 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
685 if (!fi) { 682 if (!fi) {
686 inode->i_nlink = 0; 683 clear_nlink(inode);
687 mark_inode_dirty(inode); 684 mark_inode_dirty(inode);
688 iput(inode); 685 iput(inode);
689 goto out; 686 goto out;
@@ -799,9 +796,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
799 if (retval) 796 if (retval)
800 goto end_rmdir; 797 goto end_rmdir;
801 if (inode->i_nlink != 2) 798 if (inode->i_nlink != 2)
802 udf_warning(inode->i_sb, "udf_rmdir", 799 udf_warn(inode->i_sb, "empty directory has nlink != 2 (%d)\n",
803 "empty directory has nlink != 2 (%d)", 800 inode->i_nlink);
804 inode->i_nlink);
805 clear_nlink(inode); 801 clear_nlink(inode);
806 inode->i_size = 0; 802 inode->i_size = 0;
807 inode_dec_link_count(dir); 803 inode_dec_link_count(dir);
@@ -840,7 +836,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
840 if (!inode->i_nlink) { 836 if (!inode->i_nlink) {
841 udf_debug("Deleting nonexistent file (%lu), %d\n", 837 udf_debug("Deleting nonexistent file (%lu), %d\n",
842 inode->i_ino, inode->i_nlink); 838 inode->i_ino, inode->i_nlink);
843 inode->i_nlink = 1; 839 set_nlink(inode, 1);
844 } 840 }
845 retval = udf_delete_entry(dir, fi, &fibh, &cfi); 841 retval = udf_delete_entry(dir, fi, &fibh, &cfi);
846 if (retval) 842 if (retval)
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index a71090ea0e0..d6caf01a209 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -33,8 +33,8 @@ uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
33 struct udf_sb_info *sbi = UDF_SB(sb); 33 struct udf_sb_info *sbi = UDF_SB(sb);
34 struct udf_part_map *map; 34 struct udf_part_map *map;
35 if (partition >= sbi->s_partitions) { 35 if (partition >= sbi->s_partitions) {
36 udf_debug("block=%d, partition=%d, offset=%d: " 36 udf_debug("block=%d, partition=%d, offset=%d: invalid partition\n",
37 "invalid partition\n", block, partition, offset); 37 block, partition, offset);
38 return 0xFFFFFFFF; 38 return 0xFFFFFFFF;
39 } 39 }
40 map = &sbi->s_partmaps[partition]; 40 map = &sbi->s_partmaps[partition];
@@ -60,8 +60,8 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block,
60 vdata = &map->s_type_specific.s_virtual; 60 vdata = &map->s_type_specific.s_virtual;
61 61
62 if (block > vdata->s_num_entries) { 62 if (block > vdata->s_num_entries) {
63 udf_debug("Trying to access block beyond end of VAT " 63 udf_debug("Trying to access block beyond end of VAT (%d max %d)\n",
64 "(%d max %d)\n", block, vdata->s_num_entries); 64 block, vdata->s_num_entries);
65 return 0xFFFFFFFF; 65 return 0xFFFFFFFF;
66 } 66 }
67 67
@@ -321,9 +321,14 @@ uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block,
321 /* We shouldn't mount such media... */ 321 /* We shouldn't mount such media... */
322 BUG_ON(!inode); 322 BUG_ON(!inode);
323 retblk = udf_try_read_meta(inode, block, partition, offset); 323 retblk = udf_try_read_meta(inode, block, partition, offset);
324 if (retblk == 0xFFFFFFFF) { 324 if (retblk == 0xFFFFFFFF && mdata->s_metadata_fe) {
325 udf_warning(sb, __func__, "error reading from METADATA, " 325 udf_warn(sb, "error reading from METADATA, trying to read from MIRROR\n");
326 "trying to read from MIRROR"); 326 if (!(mdata->s_flags & MF_MIRROR_FE_LOADED)) {
327 mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb,
328 mdata->s_mirror_file_loc, map->s_partition_num);
329 mdata->s_flags |= MF_MIRROR_FE_LOADED;
330 }
331
327 inode = mdata->s_mirror_fe; 332 inode = mdata->s_mirror_fe;
328 if (!inode) 333 if (!inode)
329 return 0xFFFFFFFF; 334 return 0xFFFFFFFF;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 7b27b063ff6..e185253470d 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -75,8 +75,6 @@
75 75
76#define UDF_DEFAULT_BLOCKSIZE 2048 76#define UDF_DEFAULT_BLOCKSIZE 2048
77 77
78static char error_buf[1024];
79
80/* These are the "meat" - everything else is stuffing */ 78/* These are the "meat" - everything else is stuffing */
81static int udf_fill_super(struct super_block *, void *, int); 79static int udf_fill_super(struct super_block *, void *, int);
82static void udf_put_super(struct super_block *); 80static void udf_put_super(struct super_block *);
@@ -92,8 +90,6 @@ static void udf_close_lvid(struct super_block *);
92static unsigned int udf_count_free(struct super_block *); 90static unsigned int udf_count_free(struct super_block *);
93static int udf_statfs(struct dentry *, struct kstatfs *); 91static int udf_statfs(struct dentry *, struct kstatfs *);
94static int udf_show_options(struct seq_file *, struct vfsmount *); 92static int udf_show_options(struct seq_file *, struct vfsmount *);
95static void udf_error(struct super_block *sb, const char *function,
96 const char *fmt, ...);
97 93
98struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi) 94struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi)
99{ 95{
@@ -244,9 +240,8 @@ static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count)
244 sbi->s_partmaps = kcalloc(count, sizeof(struct udf_part_map), 240 sbi->s_partmaps = kcalloc(count, sizeof(struct udf_part_map),
245 GFP_KERNEL); 241 GFP_KERNEL);
246 if (!sbi->s_partmaps) { 242 if (!sbi->s_partmaps) {
247 udf_error(sb, __func__, 243 udf_err(sb, "Unable to allocate space for %d partition maps\n",
248 "Unable to allocate space for %d partition maps", 244 count);
249 count);
250 sbi->s_partitions = 0; 245 sbi->s_partitions = 0;
251 return -ENOMEM; 246 return -ENOMEM;
252 } 247 }
@@ -550,8 +545,7 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
550 uopt->dmode = option & 0777; 545 uopt->dmode = option & 0777;
551 break; 546 break;
552 default: 547 default:
553 printk(KERN_ERR "udf: bad mount option \"%s\" " 548 pr_err("bad mount option \"%s\" or missing value\n", p);
554 "or missing value\n", p);
555 return 0; 549 return 0;
556 } 550 }
557 } 551 }
@@ -645,20 +639,16 @@ static loff_t udf_check_vsd(struct super_block *sb)
645 udf_debug("ISO9660 Boot Record found\n"); 639 udf_debug("ISO9660 Boot Record found\n");
646 break; 640 break;
647 case 1: 641 case 1:
648 udf_debug("ISO9660 Primary Volume Descriptor " 642 udf_debug("ISO9660 Primary Volume Descriptor found\n");
649 "found\n");
650 break; 643 break;
651 case 2: 644 case 2:
652 udf_debug("ISO9660 Supplementary Volume " 645 udf_debug("ISO9660 Supplementary Volume Descriptor found\n");
653 "Descriptor found\n");
654 break; 646 break;
655 case 3: 647 case 3:
656 udf_debug("ISO9660 Volume Partition Descriptor " 648 udf_debug("ISO9660 Volume Partition Descriptor found\n");
657 "found\n");
658 break; 649 break;
659 case 255: 650 case 255:
660 udf_debug("ISO9660 Volume Descriptor Set " 651 udf_debug("ISO9660 Volume Descriptor Set Terminator found\n");
661 "Terminator found\n");
662 break; 652 break;
663 default: 653 default:
664 udf_debug("ISO9660 VRS (%u) found\n", 654 udf_debug("ISO9660 VRS (%u) found\n",
@@ -809,8 +799,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
809 pvoldesc->recordingDateAndTime)) { 799 pvoldesc->recordingDateAndTime)) {
810#ifdef UDFFS_DEBUG 800#ifdef UDFFS_DEBUG
811 struct timestamp *ts = &pvoldesc->recordingDateAndTime; 801 struct timestamp *ts = &pvoldesc->recordingDateAndTime;
812 udf_debug("recording time %04u/%02u/%02u" 802 udf_debug("recording time %04u/%02u/%02u %02u:%02u (%x)\n",
813 " %02u:%02u (%x)\n",
814 le16_to_cpu(ts->year), ts->month, ts->day, ts->hour, 803 le16_to_cpu(ts->year), ts->month, ts->day, ts->hour,
815 ts->minute, le16_to_cpu(ts->typeAndTimezone)); 804 ts->minute, le16_to_cpu(ts->typeAndTimezone));
816#endif 805#endif
@@ -821,7 +810,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
821 strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name, 810 strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name,
822 outstr->u_len > 31 ? 31 : outstr->u_len); 811 outstr->u_len > 31 ? 31 : outstr->u_len);
823 udf_debug("volIdent[] = '%s'\n", 812 udf_debug("volIdent[] = '%s'\n",
824 UDF_SB(sb)->s_volume_ident); 813 UDF_SB(sb)->s_volume_ident);
825 } 814 }
826 815
827 if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128)) 816 if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128))
@@ -837,64 +826,57 @@ out1:
837 return ret; 826 return ret;
838} 827}
839 828
829struct inode *udf_find_metadata_inode_efe(struct super_block *sb,
830 u32 meta_file_loc, u32 partition_num)
831{
832 struct kernel_lb_addr addr;
833 struct inode *metadata_fe;
834
835 addr.logicalBlockNum = meta_file_loc;
836 addr.partitionReferenceNum = partition_num;
837
838 metadata_fe = udf_iget(sb, &addr);
839
840 if (metadata_fe == NULL)
841 udf_warn(sb, "metadata inode efe not found\n");
842 else if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) {
843 udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n");
844 iput(metadata_fe);
845 metadata_fe = NULL;
846 }
847
848 return metadata_fe;
849}
850
840static int udf_load_metadata_files(struct super_block *sb, int partition) 851static int udf_load_metadata_files(struct super_block *sb, int partition)
841{ 852{
842 struct udf_sb_info *sbi = UDF_SB(sb); 853 struct udf_sb_info *sbi = UDF_SB(sb);
843 struct udf_part_map *map; 854 struct udf_part_map *map;
844 struct udf_meta_data *mdata; 855 struct udf_meta_data *mdata;
845 struct kernel_lb_addr addr; 856 struct kernel_lb_addr addr;
846 int fe_error = 0;
847 857
848 map = &sbi->s_partmaps[partition]; 858 map = &sbi->s_partmaps[partition];
849 mdata = &map->s_type_specific.s_metadata; 859 mdata = &map->s_type_specific.s_metadata;
850 860
851 /* metadata address */ 861 /* metadata address */
852 addr.logicalBlockNum = mdata->s_meta_file_loc;
853 addr.partitionReferenceNum = map->s_partition_num;
854
855 udf_debug("Metadata file location: block = %d part = %d\n", 862 udf_debug("Metadata file location: block = %d part = %d\n",
856 addr.logicalBlockNum, addr.partitionReferenceNum); 863 mdata->s_meta_file_loc, map->s_partition_num);
857 864
858 mdata->s_metadata_fe = udf_iget(sb, &addr); 865 mdata->s_metadata_fe = udf_find_metadata_inode_efe(sb,
866 mdata->s_meta_file_loc, map->s_partition_num);
859 867
860 if (mdata->s_metadata_fe == NULL) { 868 if (mdata->s_metadata_fe == NULL) {
861 udf_warning(sb, __func__, "metadata inode efe not found, " 869 /* mirror file entry */
862 "will try mirror inode."); 870 udf_debug("Mirror metadata file location: block = %d part = %d\n",
863 fe_error = 1; 871 mdata->s_mirror_file_loc, map->s_partition_num);
864 } else if (UDF_I(mdata->s_metadata_fe)->i_alloc_type !=
865 ICBTAG_FLAG_AD_SHORT) {
866 udf_warning(sb, __func__, "metadata inode efe does not have "
867 "short allocation descriptors!");
868 fe_error = 1;
869 iput(mdata->s_metadata_fe);
870 mdata->s_metadata_fe = NULL;
871 }
872 872
873 /* mirror file entry */ 873 mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb,
874 addr.logicalBlockNum = mdata->s_mirror_file_loc; 874 mdata->s_mirror_file_loc, map->s_partition_num);
875 addr.partitionReferenceNum = map->s_partition_num;
876
877 udf_debug("Mirror metadata file location: block = %d part = %d\n",
878 addr.logicalBlockNum, addr.partitionReferenceNum);
879 875
880 mdata->s_mirror_fe = udf_iget(sb, &addr); 876 if (mdata->s_mirror_fe == NULL) {
881 877 udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n");
882 if (mdata->s_mirror_fe == NULL) {
883 if (fe_error) {
884 udf_error(sb, __func__, "mirror inode efe not found "
885 "and metadata inode is missing too, exiting...");
886 goto error_exit;
887 } else
888 udf_warning(sb, __func__, "mirror inode efe not found,"
889 " but metadata inode is OK");
890 } else if (UDF_I(mdata->s_mirror_fe)->i_alloc_type !=
891 ICBTAG_FLAG_AD_SHORT) {
892 udf_warning(sb, __func__, "mirror inode efe does not have "
893 "short allocation descriptors!");
894 iput(mdata->s_mirror_fe);
895 mdata->s_mirror_fe = NULL;
896 if (fe_error)
897 goto error_exit; 878 goto error_exit;
879 }
898 } 880 }
899 881
900 /* 882 /*
@@ -907,18 +889,15 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
907 addr.partitionReferenceNum = map->s_partition_num; 889 addr.partitionReferenceNum = map->s_partition_num;
908 890
909 udf_debug("Bitmap file location: block = %d part = %d\n", 891 udf_debug("Bitmap file location: block = %d part = %d\n",
910 addr.logicalBlockNum, addr.partitionReferenceNum); 892 addr.logicalBlockNum, addr.partitionReferenceNum);
911 893
912 mdata->s_bitmap_fe = udf_iget(sb, &addr); 894 mdata->s_bitmap_fe = udf_iget(sb, &addr);
913 895
914 if (mdata->s_bitmap_fe == NULL) { 896 if (mdata->s_bitmap_fe == NULL) {
915 if (sb->s_flags & MS_RDONLY) 897 if (sb->s_flags & MS_RDONLY)
916 udf_warning(sb, __func__, "bitmap inode efe " 898 udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n");
917 "not found but it's ok since the disc"
918 " is mounted read-only");
919 else { 899 else {
920 udf_error(sb, __func__, "bitmap inode efe not " 900 udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n");
921 "found and attempted read-write mount");
922 goto error_exit; 901 goto error_exit;
923 } 902 }
924 } 903 }
@@ -971,9 +950,8 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
971 bitmap = vzalloc(size); /* TODO: get rid of vzalloc */ 950 bitmap = vzalloc(size); /* TODO: get rid of vzalloc */
972 951
973 if (bitmap == NULL) { 952 if (bitmap == NULL) {
974 udf_error(sb, __func__, 953 udf_err(sb, "Unable to allocate space for bitmap and %d buffer_head pointers\n",
975 "Unable to allocate space for bitmap " 954 nr_groups);
976 "and %d buffer_head pointers", nr_groups);
977 return NULL; 955 return NULL;
978 } 956 }
979 957
@@ -1003,10 +981,9 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1003 if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE)) 981 if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE))
1004 map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE; 982 map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE;
1005 983
1006 udf_debug("Partition (%d type %x) starts at physical %d, " 984 udf_debug("Partition (%d type %x) starts at physical %d, block length %d\n",
1007 "block length %d\n", p_index, 985 p_index, map->s_partition_type,
1008 map->s_partition_type, map->s_partition_root, 986 map->s_partition_root, map->s_partition_len);
1009 map->s_partition_len);
1010 987
1011 if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) && 988 if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) &&
1012 strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03)) 989 strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03))
@@ -1023,12 +1000,12 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1023 map->s_uspace.s_table = udf_iget(sb, &loc); 1000 map->s_uspace.s_table = udf_iget(sb, &loc);
1024 if (!map->s_uspace.s_table) { 1001 if (!map->s_uspace.s_table) {
1025 udf_debug("cannot load unallocSpaceTable (part %d)\n", 1002 udf_debug("cannot load unallocSpaceTable (part %d)\n",
1026 p_index); 1003 p_index);
1027 return 1; 1004 return 1;
1028 } 1005 }
1029 map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; 1006 map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE;
1030 udf_debug("unallocSpaceTable (part %d) @ %ld\n", 1007 udf_debug("unallocSpaceTable (part %d) @ %ld\n",
1031 p_index, map->s_uspace.s_table->i_ino); 1008 p_index, map->s_uspace.s_table->i_ino);
1032 } 1009 }
1033 1010
1034 if (phd->unallocSpaceBitmap.extLength) { 1011 if (phd->unallocSpaceBitmap.extLength) {
@@ -1041,8 +1018,8 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1041 bitmap->s_extPosition = le32_to_cpu( 1018 bitmap->s_extPosition = le32_to_cpu(
1042 phd->unallocSpaceBitmap.extPosition); 1019 phd->unallocSpaceBitmap.extPosition);
1043 map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP; 1020 map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP;
1044 udf_debug("unallocSpaceBitmap (part %d) @ %d\n", p_index, 1021 udf_debug("unallocSpaceBitmap (part %d) @ %d\n",
1045 bitmap->s_extPosition); 1022 p_index, bitmap->s_extPosition);
1046 } 1023 }
1047 1024
1048 if (phd->partitionIntegrityTable.extLength) 1025 if (phd->partitionIntegrityTable.extLength)
@@ -1058,13 +1035,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1058 map->s_fspace.s_table = udf_iget(sb, &loc); 1035 map->s_fspace.s_table = udf_iget(sb, &loc);
1059 if (!map->s_fspace.s_table) { 1036 if (!map->s_fspace.s_table) {
1060 udf_debug("cannot load freedSpaceTable (part %d)\n", 1037 udf_debug("cannot load freedSpaceTable (part %d)\n",
1061 p_index); 1038 p_index);
1062 return 1; 1039 return 1;
1063 } 1040 }
1064 1041
1065 map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; 1042 map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
1066 udf_debug("freedSpaceTable (part %d) @ %ld\n", 1043 udf_debug("freedSpaceTable (part %d) @ %ld\n",
1067 p_index, map->s_fspace.s_table->i_ino); 1044 p_index, map->s_fspace.s_table->i_ino);
1068 } 1045 }
1069 1046
1070 if (phd->freedSpaceBitmap.extLength) { 1047 if (phd->freedSpaceBitmap.extLength) {
@@ -1077,8 +1054,8 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1077 bitmap->s_extPosition = le32_to_cpu( 1054 bitmap->s_extPosition = le32_to_cpu(
1078 phd->freedSpaceBitmap.extPosition); 1055 phd->freedSpaceBitmap.extPosition);
1079 map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP; 1056 map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP;
1080 udf_debug("freedSpaceBitmap (part %d) @ %d\n", p_index, 1057 udf_debug("freedSpaceBitmap (part %d) @ %d\n",
1081 bitmap->s_extPosition); 1058 p_index, bitmap->s_extPosition);
1082 } 1059 }
1083 return 0; 1060 return 0;
1084} 1061}
@@ -1118,11 +1095,9 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1118 udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block); 1095 udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block);
1119 if (!sbi->s_vat_inode && 1096 if (!sbi->s_vat_inode &&
1120 sbi->s_last_block != blocks - 1) { 1097 sbi->s_last_block != blocks - 1) {
1121 printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the" 1098 pr_notice("Failed to read VAT inode from the last recorded block (%lu), retrying with the last block of the device (%lu).\n",
1122 " last recorded block (%lu), retrying with the last " 1099 (unsigned long)sbi->s_last_block,
1123 "block of the device (%lu).\n", 1100 (unsigned long)blocks - 1);
1124 (unsigned long)sbi->s_last_block,
1125 (unsigned long)blocks - 1);
1126 udf_find_vat_block(sb, p_index, type1_index, blocks - 1); 1101 udf_find_vat_block(sb, p_index, type1_index, blocks - 1);
1127 } 1102 }
1128 if (!sbi->s_vat_inode) 1103 if (!sbi->s_vat_inode)
@@ -1220,8 +1195,8 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
1220 if (map->s_partition_type == UDF_METADATA_MAP25) { 1195 if (map->s_partition_type == UDF_METADATA_MAP25) {
1221 ret = udf_load_metadata_files(sb, i); 1196 ret = udf_load_metadata_files(sb, i);
1222 if (ret) { 1197 if (ret) {
1223 printk(KERN_ERR "UDF-fs: error loading MetaData " 1198 udf_err(sb, "error loading MetaData partition map %d\n",
1224 "partition map %d\n", i); 1199 i);
1225 goto out_bh; 1200 goto out_bh;
1226 } 1201 }
1227 } else { 1202 } else {
@@ -1234,9 +1209,7 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
1234 * overwrite blocks instead of relocating them). 1209 * overwrite blocks instead of relocating them).
1235 */ 1210 */
1236 sb->s_flags |= MS_RDONLY; 1211 sb->s_flags |= MS_RDONLY;
1237 printk(KERN_NOTICE "UDF-fs: Filesystem marked read-only " 1212 pr_notice("Filesystem marked read-only because writing to pseudooverwrite partition is not implemented\n");
1238 "because writing to pseudooverwrite partition is "
1239 "not implemented.\n");
1240 } 1213 }
1241out_bh: 1214out_bh:
1242 /* In case loading failed, we handle cleanup in udf_fill_super */ 1215 /* In case loading failed, we handle cleanup in udf_fill_super */
@@ -1344,9 +1317,8 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1344 struct metadataPartitionMap *mdm = 1317 struct metadataPartitionMap *mdm =
1345 (struct metadataPartitionMap *) 1318 (struct metadataPartitionMap *)
1346 &(lvd->partitionMaps[offset]); 1319 &(lvd->partitionMaps[offset]);
1347 udf_debug("Parsing Logical vol part %d " 1320 udf_debug("Parsing Logical vol part %d type %d id=%s\n",
1348 "type %d id=%s\n", i, type, 1321 i, type, UDF_ID_METADATA);
1349 UDF_ID_METADATA);
1350 1322
1351 map->s_partition_type = UDF_METADATA_MAP25; 1323 map->s_partition_type = UDF_METADATA_MAP25;
1352 map->s_partition_func = udf_get_pblock_meta25; 1324 map->s_partition_func = udf_get_pblock_meta25;
@@ -1361,25 +1333,24 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1361 le32_to_cpu(mdm->allocUnitSize); 1333 le32_to_cpu(mdm->allocUnitSize);
1362 mdata->s_align_unit_size = 1334 mdata->s_align_unit_size =
1363 le16_to_cpu(mdm->alignUnitSize); 1335 le16_to_cpu(mdm->alignUnitSize);
1364 mdata->s_dup_md_flag = 1336 if (mdm->flags & 0x01)
1365 mdm->flags & 0x01; 1337 mdata->s_flags |= MF_DUPLICATE_MD;
1366 1338
1367 udf_debug("Metadata Ident suffix=0x%x\n", 1339 udf_debug("Metadata Ident suffix=0x%x\n",
1368 (le16_to_cpu( 1340 le16_to_cpu(*(__le16 *)
1369 ((__le16 *) 1341 mdm->partIdent.identSuffix));
1370 mdm->partIdent.identSuffix)[0])));
1371 udf_debug("Metadata part num=%d\n", 1342 udf_debug("Metadata part num=%d\n",
1372 le16_to_cpu(mdm->partitionNum)); 1343 le16_to_cpu(mdm->partitionNum));
1373 udf_debug("Metadata part alloc unit size=%d\n", 1344 udf_debug("Metadata part alloc unit size=%d\n",
1374 le32_to_cpu(mdm->allocUnitSize)); 1345 le32_to_cpu(mdm->allocUnitSize));
1375 udf_debug("Metadata file loc=%d\n", 1346 udf_debug("Metadata file loc=%d\n",
1376 le32_to_cpu(mdm->metadataFileLoc)); 1347 le32_to_cpu(mdm->metadataFileLoc));
1377 udf_debug("Mirror file loc=%d\n", 1348 udf_debug("Mirror file loc=%d\n",
1378 le32_to_cpu(mdm->metadataMirrorFileLoc)); 1349 le32_to_cpu(mdm->metadataMirrorFileLoc));
1379 udf_debug("Bitmap file loc=%d\n", 1350 udf_debug("Bitmap file loc=%d\n",
1380 le32_to_cpu(mdm->metadataBitmapFileLoc)); 1351 le32_to_cpu(mdm->metadataBitmapFileLoc));
1381 udf_debug("Duplicate Flag: %d %d\n", 1352 udf_debug("Flags: %d %d\n",
1382 mdata->s_dup_md_flag, mdm->flags); 1353 mdata->s_flags, mdm->flags);
1383 } else { 1354 } else {
1384 udf_debug("Unknown ident: %s\n", 1355 udf_debug("Unknown ident: %s\n",
1385 upm2->partIdent.ident); 1356 upm2->partIdent.ident);
@@ -1389,16 +1360,15 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1389 map->s_partition_num = le16_to_cpu(upm2->partitionNum); 1360 map->s_partition_num = le16_to_cpu(upm2->partitionNum);
1390 } 1361 }
1391 udf_debug("Partition (%d:%d) type %d on volume %d\n", 1362 udf_debug("Partition (%d:%d) type %d on volume %d\n",
1392 i, map->s_partition_num, type, 1363 i, map->s_partition_num, type, map->s_volumeseqnum);
1393 map->s_volumeseqnum);
1394 } 1364 }
1395 1365
1396 if (fileset) { 1366 if (fileset) {
1397 struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]); 1367 struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]);
1398 1368
1399 *fileset = lelb_to_cpu(la->extLocation); 1369 *fileset = lelb_to_cpu(la->extLocation);
1400 udf_debug("FileSet found in LogicalVolDesc at block=%d, " 1370 udf_debug("FileSet found in LogicalVolDesc at block=%d, partition=%d\n",
1401 "partition=%d\n", fileset->logicalBlockNum, 1371 fileset->logicalBlockNum,
1402 fileset->partitionReferenceNum); 1372 fileset->partitionReferenceNum);
1403 } 1373 }
1404 if (lvd->integritySeqExt.extLength) 1374 if (lvd->integritySeqExt.extLength)
@@ -1478,9 +1448,9 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
1478 1448
1479 bh = udf_read_tagged(sb, block, block, &ident); 1449 bh = udf_read_tagged(sb, block, block, &ident);
1480 if (!bh) { 1450 if (!bh) {
1481 printk(KERN_ERR "udf: Block %Lu of volume descriptor " 1451 udf_err(sb,
1482 "sequence is corrupted or we could not read " 1452 "Block %llu of volume descriptor sequence is corrupted or we could not read it\n",
1483 "it.\n", (unsigned long long)block); 1453 (unsigned long long)block);
1484 return 1; 1454 return 1;
1485 } 1455 }
1486 1456
@@ -1553,7 +1523,7 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
1553 * in a suitable order 1523 * in a suitable order
1554 */ 1524 */
1555 if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) { 1525 if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) {
1556 printk(KERN_ERR "udf: Primary Volume Descriptor not found!\n"); 1526 udf_err(sb, "Primary Volume Descriptor not found!\n");
1557 return 1; 1527 return 1;
1558 } 1528 }
1559 if (udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block)) 1529 if (udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block))
@@ -1740,7 +1710,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
1740 1710
1741 if (!sb_set_blocksize(sb, uopt->blocksize)) { 1711 if (!sb_set_blocksize(sb, uopt->blocksize)) {
1742 if (!silent) 1712 if (!silent)
1743 printk(KERN_WARNING "UDF-fs: Bad block size\n"); 1713 udf_warn(sb, "Bad block size\n");
1744 return 0; 1714 return 0;
1745 } 1715 }
1746 sbi->s_last_block = uopt->lastblock; 1716 sbi->s_last_block = uopt->lastblock;
@@ -1749,12 +1719,11 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
1749 nsr_off = udf_check_vsd(sb); 1719 nsr_off = udf_check_vsd(sb);
1750 if (!nsr_off) { 1720 if (!nsr_off) {
1751 if (!silent) 1721 if (!silent)
1752 printk(KERN_WARNING "UDF-fs: No VRS found\n"); 1722 udf_warn(sb, "No VRS found\n");
1753 return 0; 1723 return 0;
1754 } 1724 }
1755 if (nsr_off == -1) 1725 if (nsr_off == -1)
1756 udf_debug("Failed to read byte 32768. Assuming open " 1726 udf_debug("Failed to read byte 32768. Assuming open disc. Skipping validity check\n");
1757 "disc. Skipping validity check\n");
1758 if (!sbi->s_last_block) 1727 if (!sbi->s_last_block)
1759 sbi->s_last_block = udf_get_last_block(sb); 1728 sbi->s_last_block = udf_get_last_block(sb);
1760 } else { 1729 } else {
@@ -1765,7 +1734,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
1765 sbi->s_anchor = uopt->anchor; 1734 sbi->s_anchor = uopt->anchor;
1766 if (!udf_find_anchor(sb, fileset)) { 1735 if (!udf_find_anchor(sb, fileset)) {
1767 if (!silent) 1736 if (!silent)
1768 printk(KERN_WARNING "UDF-fs: No anchor found\n"); 1737 udf_warn(sb, "No anchor found\n");
1769 return 0; 1738 return 0;
1770 } 1739 }
1771 return 1; 1740 return 1;
@@ -1937,8 +1906,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1937 1906
1938 if (uopt.flags & (1 << UDF_FLAG_UTF8) && 1907 if (uopt.flags & (1 << UDF_FLAG_UTF8) &&
1939 uopt.flags & (1 << UDF_FLAG_NLS_MAP)) { 1908 uopt.flags & (1 << UDF_FLAG_NLS_MAP)) {
1940 udf_error(sb, "udf_read_super", 1909 udf_err(sb, "utf8 cannot be combined with iocharset\n");
1941 "utf8 cannot be combined with iocharset\n");
1942 goto error_out; 1910 goto error_out;
1943 } 1911 }
1944#ifdef CONFIG_UDF_NLS 1912#ifdef CONFIG_UDF_NLS
@@ -1987,15 +1955,14 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1987 ret = udf_load_vrs(sb, &uopt, silent, &fileset); 1955 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1988 if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { 1956 if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
1989 if (!silent) 1957 if (!silent)
1990 printk(KERN_NOTICE 1958 pr_notice("Rescanning with blocksize %d\n",
1991 "UDF-fs: Rescanning with blocksize " 1959 UDF_DEFAULT_BLOCKSIZE);
1992 "%d\n", UDF_DEFAULT_BLOCKSIZE);
1993 uopt.blocksize = UDF_DEFAULT_BLOCKSIZE; 1960 uopt.blocksize = UDF_DEFAULT_BLOCKSIZE;
1994 ret = udf_load_vrs(sb, &uopt, silent, &fileset); 1961 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1995 } 1962 }
1996 } 1963 }
1997 if (!ret) { 1964 if (!ret) {
1998 printk(KERN_WARNING "UDF-fs: No partition found (1)\n"); 1965 udf_warn(sb, "No partition found (1)\n");
1999 goto error_out; 1966 goto error_out;
2000 } 1967 }
2001 1968
@@ -2010,10 +1977,9 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2010 le16_to_cpu(lvidiu->maxUDFWriteRev); */ 1977 le16_to_cpu(lvidiu->maxUDFWriteRev); */
2011 1978
2012 if (minUDFReadRev > UDF_MAX_READ_VERSION) { 1979 if (minUDFReadRev > UDF_MAX_READ_VERSION) {
2013 printk(KERN_ERR "UDF-fs: minUDFReadRev=%x " 1980 udf_err(sb, "minUDFReadRev=%x (max is %x)\n",
2014 "(max is %x)\n", 1981 le16_to_cpu(lvidiu->minUDFReadRev),
2015 le16_to_cpu(lvidiu->minUDFReadRev), 1982 UDF_MAX_READ_VERSION);
2016 UDF_MAX_READ_VERSION);
2017 goto error_out; 1983 goto error_out;
2018 } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION) 1984 } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION)
2019 sb->s_flags |= MS_RDONLY; 1985 sb->s_flags |= MS_RDONLY;
@@ -2027,28 +1993,27 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2027 } 1993 }
2028 1994
2029 if (!sbi->s_partitions) { 1995 if (!sbi->s_partitions) {
2030 printk(KERN_WARNING "UDF-fs: No partition found (2)\n"); 1996 udf_warn(sb, "No partition found (2)\n");
2031 goto error_out; 1997 goto error_out;
2032 } 1998 }
2033 1999
2034 if (sbi->s_partmaps[sbi->s_partition].s_partition_flags & 2000 if (sbi->s_partmaps[sbi->s_partition].s_partition_flags &
2035 UDF_PART_FLAG_READ_ONLY) { 2001 UDF_PART_FLAG_READ_ONLY) {
2036 printk(KERN_NOTICE "UDF-fs: Partition marked readonly; " 2002 pr_notice("Partition marked readonly; forcing readonly mount\n");
2037 "forcing readonly mount\n");
2038 sb->s_flags |= MS_RDONLY; 2003 sb->s_flags |= MS_RDONLY;
2039 } 2004 }
2040 2005
2041 if (udf_find_fileset(sb, &fileset, &rootdir)) { 2006 if (udf_find_fileset(sb, &fileset, &rootdir)) {
2042 printk(KERN_WARNING "UDF-fs: No fileset found\n"); 2007 udf_warn(sb, "No fileset found\n");
2043 goto error_out; 2008 goto error_out;
2044 } 2009 }
2045 2010
2046 if (!silent) { 2011 if (!silent) {
2047 struct timestamp ts; 2012 struct timestamp ts;
2048 udf_time_to_disk_stamp(&ts, sbi->s_record_time); 2013 udf_time_to_disk_stamp(&ts, sbi->s_record_time);
2049 udf_info("UDF: Mounting volume '%s', " 2014 udf_info("Mounting volume '%s', timestamp %04u/%02u/%02u %02u:%02u (%x)\n",
2050 "timestamp %04u/%02u/%02u %02u:%02u (%x)\n", 2015 sbi->s_volume_ident,
2051 sbi->s_volume_ident, le16_to_cpu(ts.year), ts.month, ts.day, 2016 le16_to_cpu(ts.year), ts.month, ts.day,
2052 ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone)); 2017 ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone));
2053 } 2018 }
2054 if (!(sb->s_flags & MS_RDONLY)) 2019 if (!(sb->s_flags & MS_RDONLY))
@@ -2059,8 +2024,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2059 /* perhaps it's not extensible enough, but for now ... */ 2024 /* perhaps it's not extensible enough, but for now ... */
2060 inode = udf_iget(sb, &rootdir); 2025 inode = udf_iget(sb, &rootdir);
2061 if (!inode) { 2026 if (!inode) {
2062 printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, " 2027 udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n",
2063 "partition=%d\n",
2064 rootdir.logicalBlockNum, rootdir.partitionReferenceNum); 2028 rootdir.logicalBlockNum, rootdir.partitionReferenceNum);
2065 goto error_out; 2029 goto error_out;
2066 } 2030 }
@@ -2068,7 +2032,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2068 /* Allocate a dentry for the root inode */ 2032 /* Allocate a dentry for the root inode */
2069 sb->s_root = d_alloc_root(inode); 2033 sb->s_root = d_alloc_root(inode);
2070 if (!sb->s_root) { 2034 if (!sb->s_root) {
2071 printk(KERN_ERR "UDF-fs: Couldn't allocate root dentry\n"); 2035 udf_err(sb, "Couldn't allocate root dentry\n");
2072 iput(inode); 2036 iput(inode);
2073 goto error_out; 2037 goto error_out;
2074 } 2038 }
@@ -2096,32 +2060,40 @@ error_out:
2096 return -EINVAL; 2060 return -EINVAL;
2097} 2061}
2098 2062
2099static void udf_error(struct super_block *sb, const char *function, 2063void _udf_err(struct super_block *sb, const char *function,
2100 const char *fmt, ...) 2064 const char *fmt, ...)
2101{ 2065{
2066 struct va_format vaf;
2102 va_list args; 2067 va_list args;
2103 2068
2104 if (!(sb->s_flags & MS_RDONLY)) { 2069 /* mark sb error */
2105 /* mark sb error */ 2070 if (!(sb->s_flags & MS_RDONLY))
2106 sb->s_dirt = 1; 2071 sb->s_dirt = 1;
2107 } 2072
2108 va_start(args, fmt); 2073 va_start(args, fmt);
2109 vsnprintf(error_buf, sizeof(error_buf), fmt, args); 2074
2075 vaf.fmt = fmt;
2076 vaf.va = &args;
2077
2078 pr_err("error (device %s): %s: %pV", sb->s_id, function, &vaf);
2079
2110 va_end(args); 2080 va_end(args);
2111 printk(KERN_CRIT "UDF-fs error (device %s): %s: %s\n",
2112 sb->s_id, function, error_buf);
2113} 2081}
2114 2082
2115void udf_warning(struct super_block *sb, const char *function, 2083void _udf_warn(struct super_block *sb, const char *function,
2116 const char *fmt, ...) 2084 const char *fmt, ...)
2117{ 2085{
2086 struct va_format vaf;
2118 va_list args; 2087 va_list args;
2119 2088
2120 va_start(args, fmt); 2089 va_start(args, fmt);
2121 vsnprintf(error_buf, sizeof(error_buf), fmt, args); 2090
2091 vaf.fmt = fmt;
2092 vaf.va = &args;
2093
2094 pr_warn("warning (device %s): %s: %pV", sb->s_id, function, &vaf);
2095
2122 va_end(args); 2096 va_end(args);
2123 printk(KERN_WARNING "UDF-fs warning (device %s): %s: %s\n",
2124 sb->s_id, function, error_buf);
2125} 2097}
2126 2098
2127static void udf_put_super(struct super_block *sb) 2099static void udf_put_super(struct super_block *sb)
@@ -2213,11 +2185,11 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2213 bh = udf_read_ptagged(sb, &loc, 0, &ident); 2185 bh = udf_read_ptagged(sb, &loc, 0, &ident);
2214 2186
2215 if (!bh) { 2187 if (!bh) {
2216 printk(KERN_ERR "udf: udf_count_free failed\n"); 2188 udf_err(sb, "udf_count_free failed\n");
2217 goto out; 2189 goto out;
2218 } else if (ident != TAG_IDENT_SBD) { 2190 } else if (ident != TAG_IDENT_SBD) {
2219 brelse(bh); 2191 brelse(bh);
2220 printk(KERN_ERR "udf: udf_count_free failed\n"); 2192 udf_err(sb, "udf_count_free failed\n");
2221 goto out; 2193 goto out;
2222 } 2194 }
2223 2195
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 8424308db4b..4b98fee8e16 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -95,23 +95,21 @@ void udf_truncate_tail_extent(struct inode *inode)
95 lbcount += elen; 95 lbcount += elen;
96 if (lbcount > inode->i_size) { 96 if (lbcount > inode->i_size) {
97 if (lbcount - inode->i_size >= inode->i_sb->s_blocksize) 97 if (lbcount - inode->i_size >= inode->i_sb->s_blocksize)
98 printk(KERN_WARNING 98 udf_warn(inode->i_sb,
99 "udf_truncate_tail_extent(): Too long " 99 "Too long extent after EOF in inode %u: i_size: %lld lbcount: %lld extent %u+%u\n",
100 "extent after EOF in inode %u: i_size: " 100 (unsigned)inode->i_ino,
101 "%Ld lbcount: %Ld extent %u+%u\n", 101 (long long)inode->i_size,
102 (unsigned)inode->i_ino, 102 (long long)lbcount,
103 (long long)inode->i_size, 103 (unsigned)eloc.logicalBlockNum,
104 (long long)lbcount, 104 (unsigned)elen);
105 (unsigned)eloc.logicalBlockNum,
106 (unsigned)elen);
107 nelen = elen - (lbcount - inode->i_size); 105 nelen = elen - (lbcount - inode->i_size);
108 epos.offset -= adsize; 106 epos.offset -= adsize;
109 extent_trunc(inode, &epos, &eloc, etype, elen, nelen); 107 extent_trunc(inode, &epos, &eloc, etype, elen, nelen);
110 epos.offset += adsize; 108 epos.offset += adsize;
111 if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1) 109 if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1)
112 printk(KERN_ERR "udf_truncate_tail_extent(): " 110 udf_err(inode->i_sb,
113 "Extent after EOF in inode %u.\n", 111 "Extent after EOF in inode %u\n",
114 (unsigned)inode->i_ino); 112 (unsigned)inode->i_ino);
115 break; 113 break;
116 } 114 }
117 } 115 }
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 4858c191242..5142a82e327 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -54,13 +54,16 @@
54 54
55#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */ 55#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */
56 56
57#define MF_DUPLICATE_MD 0x01
58#define MF_MIRROR_FE_LOADED 0x02
59
57struct udf_meta_data { 60struct udf_meta_data {
58 __u32 s_meta_file_loc; 61 __u32 s_meta_file_loc;
59 __u32 s_mirror_file_loc; 62 __u32 s_mirror_file_loc;
60 __u32 s_bitmap_file_loc; 63 __u32 s_bitmap_file_loc;
61 __u32 s_alloc_unit_size; 64 __u32 s_alloc_unit_size;
62 __u16 s_align_unit_size; 65 __u16 s_align_unit_size;
63 __u8 s_dup_md_flag; 66 int s_flags;
64 struct inode *s_metadata_fe; 67 struct inode *s_metadata_fe;
65 struct inode *s_mirror_fe; 68 struct inode *s_mirror_fe;
66 struct inode *s_bitmap_fe; 69 struct inode *s_bitmap_fe;
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index dbd52d4b5ee..f34e6fc0cda 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -1,6 +1,8 @@
1#ifndef __UDF_DECL_H 1#ifndef __UDF_DECL_H
2#define __UDF_DECL_H 2#define __UDF_DECL_H
3 3
4#define pr_fmt(fmt) "UDF-fs: " fmt
5
4#include "ecma_167.h" 6#include "ecma_167.h"
5#include "osta_udf.h" 7#include "osta_udf.h"
6 8
@@ -16,23 +18,30 @@
16#define UDF_PREALLOCATE 18#define UDF_PREALLOCATE
17#define UDF_DEFAULT_PREALLOC_BLOCKS 8 19#define UDF_DEFAULT_PREALLOC_BLOCKS 8
18 20
21extern __printf(3, 4) void _udf_err(struct super_block *sb,
22 const char *function, const char *fmt, ...);
23#define udf_err(sb, fmt, ...) \
24 _udf_err(sb, __func__, fmt, ##__VA_ARGS__)
25
26extern __printf(3, 4) void _udf_warn(struct super_block *sb,
27 const char *function, const char *fmt, ...);
28#define udf_warn(sb, fmt, ...) \
29 _udf_warn(sb, __func__, fmt, ##__VA_ARGS__)
30
31#define udf_info(fmt, ...) \
32 pr_info("INFO " fmt, ##__VA_ARGS__)
33
19#undef UDFFS_DEBUG 34#undef UDFFS_DEBUG
20 35
21#ifdef UDFFS_DEBUG 36#ifdef UDFFS_DEBUG
22#define udf_debug(f, a...) \ 37#define udf_debug(fmt, ...) \
23do { \ 38 printk(KERN_DEBUG pr_fmt("%s:%d:%s: " fmt), \
24 printk(KERN_DEBUG "UDF-fs DEBUG %s:%d:%s: ", \ 39 __FILE__, __LINE__, __func__, ##__VA_ARGS__)
25 __FILE__, __LINE__, __func__); \
26 printk(f, ##a); \
27} while (0)
28#else 40#else
29#define udf_debug(f, a...) /**/ 41#define udf_debug(fmt, ...) \
42 no_printk(fmt, ##__VA_ARGS__)
30#endif 43#endif
31 44
32#define udf_info(f, a...) \
33 printk(KERN_INFO "UDF-fs INFO " f, ##a);
34
35
36#define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) ) 45#define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) )
37#define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) ) 46#define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) )
38 47
@@ -112,8 +121,6 @@ struct extent_position {
112 121
113/* super.c */ 122/* super.c */
114 123
115__attribute__((format(printf, 3, 4)))
116extern void udf_warning(struct super_block *, const char *, const char *, ...);
117static inline void udf_updated_lvid(struct super_block *sb) 124static inline void udf_updated_lvid(struct super_block *sb)
118{ 125{
119 struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh; 126 struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh;
@@ -126,6 +133,8 @@ static inline void udf_updated_lvid(struct super_block *sb)
126 UDF_SB(sb)->s_lvid_dirty = 1; 133 UDF_SB(sb)->s_lvid_dirty = 1;
127} 134}
128extern u64 lvid_get_unique_id(struct super_block *sb); 135extern u64 lvid_get_unique_id(struct super_block *sb);
136struct inode *udf_find_metadata_inode_efe(struct super_block *sb,
137 u32 meta_file_loc, u32 partition_num);
129 138
130/* namei.c */ 139/* namei.c */
131extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, 140extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index b8c828c4d20..1f11483eba6 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -34,9 +34,10 @@
34 * http://www.boulder.nist.gov/timefreq/pubs/bulletin/leapsecond.htm 34 * http://www.boulder.nist.gov/timefreq/pubs/bulletin/leapsecond.htm
35 */ 35 */
36 36
37#include "udfdecl.h"
38
37#include <linux/types.h> 39#include <linux/types.h>
38#include <linux/kernel.h> 40#include <linux/kernel.h>
39#include "udfdecl.h"
40 41
41#define EPOCH_YEAR 1970 42#define EPOCH_YEAR 1970
42 43
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index d03a90b6ad6..44b815e57f9 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -114,7 +114,7 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
114 cmp_id = ocu_i->u_cmpID; 114 cmp_id = ocu_i->u_cmpID;
115 if (cmp_id != 8 && cmp_id != 16) { 115 if (cmp_id != 8 && cmp_id != 16) {
116 memset(utf_o, 0, sizeof(struct ustr)); 116 memset(utf_o, 0, sizeof(struct ustr));
117 printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", 117 pr_err("unknown compression code (%d) stri=%s\n",
118 cmp_id, ocu_i->u_name); 118 cmp_id, ocu_i->u_name);
119 return 0; 119 return 0;
120 } 120 }
@@ -242,7 +242,7 @@ try_again:
242 if (utf_cnt) { 242 if (utf_cnt) {
243error_out: 243error_out:
244 ocu[++u_len] = '?'; 244 ocu[++u_len] = '?';
245 printk(KERN_DEBUG "udf: bad UTF-8 character\n"); 245 printk(KERN_DEBUG pr_fmt("bad UTF-8 character\n"));
246 } 246 }
247 247
248 ocu[length - 1] = (uint8_t)u_len + 1; 248 ocu[length - 1] = (uint8_t)u_len + 1;
@@ -267,7 +267,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
267 cmp_id = ocu_i->u_cmpID; 267 cmp_id = ocu_i->u_cmpID;
268 if (cmp_id != 8 && cmp_id != 16) { 268 if (cmp_id != 8 && cmp_id != 16) {
269 memset(utf_o, 0, sizeof(struct ustr)); 269 memset(utf_o, 0, sizeof(struct ustr));
270 printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", 270 pr_err("unknown compression code (%d) stri=%s\n",
271 cmp_id, ocu_i->u_name); 271 cmp_id, ocu_i->u_name);
272 return 0; 272 return 0;
273 } 273 }
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 2eabf04af3d..78a4c70d46b 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -341,7 +341,7 @@ cg_found:
341 341
342fail_remove_inode: 342fail_remove_inode:
343 unlock_super(sb); 343 unlock_super(sb);
344 inode->i_nlink = 0; 344 clear_nlink(inode);
345 iput(inode); 345 iput(inode);
346 UFSD("EXIT (FAILED): err %d\n", err); 346 UFSD("EXIT (FAILED): err %d\n", err);
347 return ERR_PTR(err); 347 return ERR_PTR(err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index b4d791a8320..879b13436fa 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -589,7 +589,7 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
589 * Copy data to the in-core inode. 589 * Copy data to the in-core inode.
590 */ 590 */
591 inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode); 591 inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode);
592 inode->i_nlink = fs16_to_cpu(sb, ufs_inode->ui_nlink); 592 set_nlink(inode, fs16_to_cpu(sb, ufs_inode->ui_nlink));
593 if (inode->i_nlink == 0) { 593 if (inode->i_nlink == 0) {
594 ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); 594 ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
595 return -1; 595 return -1;
@@ -637,7 +637,7 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
637 * Copy data to the in-core inode. 637 * Copy data to the in-core inode.
638 */ 638 */
639 inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode); 639 inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode);
640 inode->i_nlink = fs16_to_cpu(sb, ufs2_inode->ui_nlink); 640 set_nlink(inode, fs16_to_cpu(sb, ufs2_inode->ui_nlink));
641 if (inode->i_nlink == 0) { 641 if (inode->i_nlink == 0) {
642 ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); 642 ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
643 return -1; 643 return -1;
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 5be2755dd71..c26f2bcec26 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -117,9 +117,12 @@ extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buf
117extern const struct file_operations ufs_dir_operations; 117extern const struct file_operations ufs_dir_operations;
118 118
119/* super.c */ 119/* super.c */
120extern void ufs_warning (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); 120extern __printf(3, 4)
121extern void ufs_error (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); 121void ufs_warning(struct super_block *, const char *, const char *, ...);
122extern void ufs_panic (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); 122extern __printf(3, 4)
123void ufs_error(struct super_block *, const char *, const char *, ...);
124extern __printf(3, 4)
125void ufs_panic(struct super_block *, const char *, const char *, ...);
123 126
124/* symlink.c */ 127/* symlink.c */
125extern const struct inode_operations ufs_fast_symlink_inode_operations; 128extern const struct inode_operations ufs_fast_symlink_inode_operations;
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index bdd9cb54d63..ce84ffd0264 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -452,7 +452,7 @@ xfs_alloc_read_agfl(
452 if (error) 452 if (error)
453 return error; 453 return error;
454 ASSERT(!xfs_buf_geterror(bp)); 454 ASSERT(!xfs_buf_geterror(bp));
455 XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF); 455 xfs_buf_set_ref(bp, XFS_AGFL_REF);
456 *bpp = bp; 456 *bpp = bp;
457 return 0; 457 return 0;
458} 458}
@@ -2139,7 +2139,7 @@ xfs_read_agf(
2139 xfs_trans_brelse(tp, *bpp); 2139 xfs_trans_brelse(tp, *bpp);
2140 return XFS_ERROR(EFSCORRUPTED); 2140 return XFS_ERROR(EFSCORRUPTED);
2141 } 2141 }
2142 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF); 2142 xfs_buf_set_ref(*bpp, XFS_AGF_REF);
2143 return 0; 2143 return 0;
2144} 2144}
2145 2145
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 8c37dde4c52..574d4ee9b62 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -38,40 +38,6 @@
38#include <linux/pagevec.h> 38#include <linux/pagevec.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40 40
41
42/*
43 * Prime number of hash buckets since address is used as the key.
44 */
45#define NVSYNC 37
46#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC])
47static wait_queue_head_t xfs_ioend_wq[NVSYNC];
48
49void __init
50xfs_ioend_init(void)
51{
52 int i;
53
54 for (i = 0; i < NVSYNC; i++)
55 init_waitqueue_head(&xfs_ioend_wq[i]);
56}
57
58void
59xfs_ioend_wait(
60 xfs_inode_t *ip)
61{
62 wait_queue_head_t *wq = to_ioend_wq(ip);
63
64 wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
65}
66
67STATIC void
68xfs_ioend_wake(
69 xfs_inode_t *ip)
70{
71 if (atomic_dec_and_test(&ip->i_iocount))
72 wake_up(to_ioend_wq(ip));
73}
74
75void 41void
76xfs_count_page_state( 42xfs_count_page_state(
77 struct page *page, 43 struct page *page,
@@ -115,25 +81,20 @@ xfs_destroy_ioend(
115 xfs_ioend_t *ioend) 81 xfs_ioend_t *ioend)
116{ 82{
117 struct buffer_head *bh, *next; 83 struct buffer_head *bh, *next;
118 struct xfs_inode *ip = XFS_I(ioend->io_inode);
119 84
120 for (bh = ioend->io_buffer_head; bh; bh = next) { 85 for (bh = ioend->io_buffer_head; bh; bh = next) {
121 next = bh->b_private; 86 next = bh->b_private;
122 bh->b_end_io(bh, !ioend->io_error); 87 bh->b_end_io(bh, !ioend->io_error);
123 } 88 }
124 89
125 /* 90 if (ioend->io_iocb) {
126 * Volume managers supporting multiple paths can send back ENODEV 91 if (ioend->io_isasync) {
127 * when the final path disappears. In this case continuing to fill 92 aio_complete(ioend->io_iocb, ioend->io_error ?
128 * the page cache with dirty data which cannot be written out is 93 ioend->io_error : ioend->io_result, 0);
129 * evil, so prevent that. 94 }
130 */ 95 inode_dio_done(ioend->io_inode);
131 if (unlikely(ioend->io_error == -ENODEV)) {
132 xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ,
133 __FILE__, __LINE__);
134 } 96 }
135 97
136 xfs_ioend_wake(ip);
137 mempool_free(ioend, xfs_ioend_pool); 98 mempool_free(ioend, xfs_ioend_pool);
138} 99}
139 100
@@ -156,6 +117,15 @@ xfs_ioend_new_eof(
156} 117}
157 118
158/* 119/*
120 * Fast and loose check if this write could update the on-disk inode size.
121 */
122static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
123{
124 return ioend->io_offset + ioend->io_size >
125 XFS_I(ioend->io_inode)->i_d.di_size;
126}
127
128/*
159 * Update on-disk file size now that data has been written to disk. The 129 * Update on-disk file size now that data has been written to disk. The
160 * current in-memory file size is i_size. If a write is beyond eof i_new_size 130 * current in-memory file size is i_size. If a write is beyond eof i_new_size
161 * will be the intended file size until i_size is updated. If this write does 131 * will be the intended file size until i_size is updated. If this write does
@@ -173,9 +143,6 @@ xfs_setfilesize(
173 xfs_inode_t *ip = XFS_I(ioend->io_inode); 143 xfs_inode_t *ip = XFS_I(ioend->io_inode);
174 xfs_fsize_t isize; 144 xfs_fsize_t isize;
175 145
176 if (unlikely(ioend->io_error))
177 return 0;
178
179 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) 146 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
180 return EAGAIN; 147 return EAGAIN;
181 148
@@ -192,6 +159,9 @@ xfs_setfilesize(
192 159
193/* 160/*
194 * Schedule IO completion handling on the final put of an ioend. 161 * Schedule IO completion handling on the final put of an ioend.
162 *
163 * If there is no work to do we might as well call it a day and free the
164 * ioend right now.
195 */ 165 */
196STATIC void 166STATIC void
197xfs_finish_ioend( 167xfs_finish_ioend(
@@ -200,8 +170,10 @@ xfs_finish_ioend(
200 if (atomic_dec_and_test(&ioend->io_remaining)) { 170 if (atomic_dec_and_test(&ioend->io_remaining)) {
201 if (ioend->io_type == IO_UNWRITTEN) 171 if (ioend->io_type == IO_UNWRITTEN)
202 queue_work(xfsconvertd_workqueue, &ioend->io_work); 172 queue_work(xfsconvertd_workqueue, &ioend->io_work);
203 else 173 else if (xfs_ioend_is_append(ioend))
204 queue_work(xfsdatad_workqueue, &ioend->io_work); 174 queue_work(xfsdatad_workqueue, &ioend->io_work);
175 else
176 xfs_destroy_ioend(ioend);
205 } 177 }
206} 178}
207 179
@@ -216,17 +188,24 @@ xfs_end_io(
216 struct xfs_inode *ip = XFS_I(ioend->io_inode); 188 struct xfs_inode *ip = XFS_I(ioend->io_inode);
217 int error = 0; 189 int error = 0;
218 190
191 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
192 ioend->io_error = -EIO;
193 goto done;
194 }
195 if (ioend->io_error)
196 goto done;
197
219 /* 198 /*
220 * For unwritten extents we need to issue transactions to convert a 199 * For unwritten extents we need to issue transactions to convert a
221 * range to normal written extens after the data I/O has finished. 200 * range to normal written extens after the data I/O has finished.
222 */ 201 */
223 if (ioend->io_type == IO_UNWRITTEN && 202 if (ioend->io_type == IO_UNWRITTEN) {
224 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
225
226 error = xfs_iomap_write_unwritten(ip, ioend->io_offset, 203 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
227 ioend->io_size); 204 ioend->io_size);
228 if (error) 205 if (error) {
229 ioend->io_error = error; 206 ioend->io_error = -error;
207 goto done;
208 }
230 } 209 }
231 210
232 /* 211 /*
@@ -236,6 +215,7 @@ xfs_end_io(
236 error = xfs_setfilesize(ioend); 215 error = xfs_setfilesize(ioend);
237 ASSERT(!error || error == EAGAIN); 216 ASSERT(!error || error == EAGAIN);
238 217
218done:
239 /* 219 /*
240 * If we didn't complete processing of the ioend, requeue it to the 220 * If we didn't complete processing of the ioend, requeue it to the
241 * tail of the workqueue for another attempt later. Otherwise destroy 221 * tail of the workqueue for another attempt later. Otherwise destroy
@@ -247,8 +227,6 @@ xfs_end_io(
247 /* ensure we don't spin on blocked ioends */ 227 /* ensure we don't spin on blocked ioends */
248 delay(1); 228 delay(1);
249 } else { 229 } else {
250 if (ioend->io_iocb)
251 aio_complete(ioend->io_iocb, ioend->io_result, 0);
252 xfs_destroy_ioend(ioend); 230 xfs_destroy_ioend(ioend);
253 } 231 }
254} 232}
@@ -285,13 +263,13 @@ xfs_alloc_ioend(
285 * all the I/O from calling the completion routine too early. 263 * all the I/O from calling the completion routine too early.
286 */ 264 */
287 atomic_set(&ioend->io_remaining, 1); 265 atomic_set(&ioend->io_remaining, 1);
266 ioend->io_isasync = 0;
288 ioend->io_error = 0; 267 ioend->io_error = 0;
289 ioend->io_list = NULL; 268 ioend->io_list = NULL;
290 ioend->io_type = type; 269 ioend->io_type = type;
291 ioend->io_inode = inode; 270 ioend->io_inode = inode;
292 ioend->io_buffer_head = NULL; 271 ioend->io_buffer_head = NULL;
293 ioend->io_buffer_tail = NULL; 272 ioend->io_buffer_tail = NULL;
294 atomic_inc(&XFS_I(ioend->io_inode)->i_iocount);
295 ioend->io_offset = 0; 273 ioend->io_offset = 0;
296 ioend->io_size = 0; 274 ioend->io_size = 0;
297 ioend->io_iocb = NULL; 275 ioend->io_iocb = NULL;
@@ -337,8 +315,8 @@ xfs_map_blocks(
337 count = mp->m_maxioffset - offset; 315 count = mp->m_maxioffset - offset;
338 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); 316 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
339 offset_fsb = XFS_B_TO_FSBT(mp, offset); 317 offset_fsb = XFS_B_TO_FSBT(mp, offset);
340 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, 318 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
341 bmapi_flags, NULL, 0, imap, &nimaps, NULL); 319 imap, &nimaps, bmapi_flags);
342 xfs_iunlock(ip, XFS_ILOCK_SHARED); 320 xfs_iunlock(ip, XFS_ILOCK_SHARED);
343 321
344 if (error) 322 if (error)
@@ -551,7 +529,6 @@ xfs_cancel_ioend(
551 unlock_buffer(bh); 529 unlock_buffer(bh);
552 } while ((bh = next_bh) != NULL); 530 } while ((bh = next_bh) != NULL);
553 531
554 xfs_ioend_wake(XFS_I(ioend->io_inode));
555 mempool_free(ioend, xfs_ioend_pool); 532 mempool_free(ioend, xfs_ioend_pool);
556 } while ((ioend = next) != NULL); 533 } while ((ioend = next) != NULL);
557} 534}
@@ -925,11 +902,11 @@ xfs_vm_writepage(
925 * random callers for direct reclaim or memcg reclaim. We explicitly 902 * random callers for direct reclaim or memcg reclaim. We explicitly
926 * allow reclaim from kswapd as the stack usage there is relatively low. 903 * allow reclaim from kswapd as the stack usage there is relatively low.
927 * 904 *
928 * This should really be done by the core VM, but until that happens 905 * This should never happen except in the case of a VM regression so
929 * filesystems like XFS, btrfs and ext4 have to take care of this 906 * warn about it.
930 * by themselves.
931 */ 907 */
932 if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) 908 if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
909 PF_MEMALLOC))
933 goto redirty; 910 goto redirty;
934 911
935 /* 912 /*
@@ -1161,8 +1138,8 @@ __xfs_get_blocks(
1161 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); 1138 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1162 offset_fsb = XFS_B_TO_FSBT(mp, offset); 1139 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1163 1140
1164 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, 1141 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
1165 XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL); 1142 &imap, &nimaps, XFS_BMAPI_ENTIRE);
1166 if (error) 1143 if (error)
1167 goto out_unlock; 1144 goto out_unlock;
1168 1145
@@ -1300,7 +1277,6 @@ xfs_end_io_direct_write(
1300 bool is_async) 1277 bool is_async)
1301{ 1278{
1302 struct xfs_ioend *ioend = iocb->private; 1279 struct xfs_ioend *ioend = iocb->private;
1303 struct inode *inode = ioend->io_inode;
1304 1280
1305 /* 1281 /*
1306 * blockdev_direct_IO can return an error even after the I/O 1282 * blockdev_direct_IO can return an error even after the I/O
@@ -1311,28 +1287,17 @@ xfs_end_io_direct_write(
1311 1287
1312 ioend->io_offset = offset; 1288 ioend->io_offset = offset;
1313 ioend->io_size = size; 1289 ioend->io_size = size;
1290 ioend->io_iocb = iocb;
1291 ioend->io_result = ret;
1314 if (private && size > 0) 1292 if (private && size > 0)
1315 ioend->io_type = IO_UNWRITTEN; 1293 ioend->io_type = IO_UNWRITTEN;
1316 1294
1317 if (is_async) { 1295 if (is_async) {
1318 /* 1296 ioend->io_isasync = 1;
1319 * If we are converting an unwritten extent we need to delay
1320 * the AIO completion until after the unwrittent extent
1321 * conversion has completed, otherwise do it ASAP.
1322 */
1323 if (ioend->io_type == IO_UNWRITTEN) {
1324 ioend->io_iocb = iocb;
1325 ioend->io_result = ret;
1326 } else {
1327 aio_complete(iocb, ret, 0);
1328 }
1329 xfs_finish_ioend(ioend); 1297 xfs_finish_ioend(ioend);
1330 } else { 1298 } else {
1331 xfs_finish_ioend_sync(ioend); 1299 xfs_finish_ioend_sync(ioend);
1332 } 1300 }
1333
1334 /* XXX: probably should move into the real I/O completion handler */
1335 inode_dio_done(inode);
1336} 1301}
1337 1302
1338STATIC ssize_t 1303STATIC ssize_t
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 71f721e1a71..116dd5c3703 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -47,6 +47,7 @@ typedef struct xfs_ioend {
47 unsigned int io_type; /* delalloc / unwritten */ 47 unsigned int io_type; /* delalloc / unwritten */
48 int io_error; /* I/O error code */ 48 int io_error; /* I/O error code */
49 atomic_t io_remaining; /* hold count */ 49 atomic_t io_remaining; /* hold count */
50 unsigned int io_isasync : 1; /* needs aio_complete */
50 struct inode *io_inode; /* file being written to */ 51 struct inode *io_inode; /* file being written to */
51 struct buffer_head *io_buffer_head;/* buffer linked list head */ 52 struct buffer_head *io_buffer_head;/* buffer linked list head */
52 struct buffer_head *io_buffer_tail;/* buffer linked list tail */ 53 struct buffer_head *io_buffer_tail;/* buffer linked list tail */
@@ -60,9 +61,6 @@ typedef struct xfs_ioend {
60extern const struct address_space_operations xfs_address_space_operations; 61extern const struct address_space_operations xfs_address_space_operations;
61extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); 62extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
62 63
63extern void xfs_ioend_init(void);
64extern void xfs_ioend_wait(struct xfs_inode *);
65
66extern void xfs_count_page_state(struct page *, int *, int *); 64extern void xfs_count_page_state(struct page *, int *, int *);
67 65
68#endif /* __XFS_AOPS_H__ */ 66#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 160bcdc34a6..1e5d97f86ea 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -319,7 +319,7 @@ xfs_attr_set_int(
319 return (error); 319 return (error);
320 } 320 }
321 321
322 xfs_trans_ijoin(args.trans, dp); 322 xfs_trans_ijoin(args.trans, dp, 0);
323 323
324 /* 324 /*
325 * If the attribute list is non-existent or a shortform list, 325 * If the attribute list is non-existent or a shortform list,
@@ -389,7 +389,7 @@ xfs_attr_set_int(
389 * a new one. We need the inode to be in all transactions. 389 * a new one. We need the inode to be in all transactions.
390 */ 390 */
391 if (committed) 391 if (committed)
392 xfs_trans_ijoin(args.trans, dp); 392 xfs_trans_ijoin(args.trans, dp, 0);
393 393
394 /* 394 /*
395 * Commit the leaf transformation. We'll need another (linked) 395 * Commit the leaf transformation. We'll need another (linked)
@@ -537,7 +537,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
537 * No need to make quota reservations here. We expect to release some 537 * No need to make quota reservations here. We expect to release some
538 * blocks not allocate in the common case. 538 * blocks not allocate in the common case.
539 */ 539 */
540 xfs_trans_ijoin(args.trans, dp); 540 xfs_trans_ijoin(args.trans, dp, 0);
541 541
542 /* 542 /*
543 * Decide on what work routines to call based on the inode size. 543 * Decide on what work routines to call based on the inode size.
@@ -809,7 +809,7 @@ xfs_attr_inactive(xfs_inode_t *dp)
809 * No need to make quota reservations here. We expect to release some 809 * No need to make quota reservations here. We expect to release some
810 * blocks, not allocate, in the common case. 810 * blocks, not allocate, in the common case.
811 */ 811 */
812 xfs_trans_ijoin(trans, dp); 812 xfs_trans_ijoin(trans, dp, 0);
813 813
814 /* 814 /*
815 * Decide on what work routines to call based on the inode size. 815 * Decide on what work routines to call based on the inode size.
@@ -823,18 +823,6 @@ xfs_attr_inactive(xfs_inode_t *dp)
823 if (error) 823 if (error)
824 goto out; 824 goto out;
825 825
826 /*
827 * Signal synchronous inactive transactions unless this is a
828 * synchronous mount filesystem in which case we know that we're here
829 * because we've been called out of xfs_inactive which means that the
830 * last reference is gone and the unlink transaction has already hit
831 * the disk so async inactive transactions are safe.
832 */
833 if (!(mp->m_flags & XFS_MOUNT_WSYNC)) {
834 if (dp->i_d.di_anextents > 0)
835 xfs_trans_set_sync(trans);
836 }
837
838 error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); 826 error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0);
839 if (error) 827 if (error)
840 goto out; 828 goto out;
@@ -973,7 +961,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
973 * a new one. We need the inode to be in all transactions. 961 * a new one. We need the inode to be in all transactions.
974 */ 962 */
975 if (committed) 963 if (committed)
976 xfs_trans_ijoin(args->trans, dp); 964 xfs_trans_ijoin(args->trans, dp, 0);
977 965
978 /* 966 /*
979 * Commit the current trans (including the inode) and start 967 * Commit the current trans (including the inode) and start
@@ -1075,7 +1063,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
1075 * in all transactions. 1063 * in all transactions.
1076 */ 1064 */
1077 if (committed) 1065 if (committed)
1078 xfs_trans_ijoin(args->trans, dp); 1066 xfs_trans_ijoin(args->trans, dp, 0);
1079 } else 1067 } else
1080 xfs_da_buf_done(bp); 1068 xfs_da_buf_done(bp);
1081 1069
@@ -1149,7 +1137,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
1149 * a new one. We need the inode to be in all transactions. 1137 * a new one. We need the inode to be in all transactions.
1150 */ 1138 */
1151 if (committed) 1139 if (committed)
1152 xfs_trans_ijoin(args->trans, dp); 1140 xfs_trans_ijoin(args->trans, dp, 0);
1153 } else 1141 } else
1154 xfs_da_buf_done(bp); 1142 xfs_da_buf_done(bp);
1155 return(0); 1143 return(0);
@@ -1303,7 +1291,7 @@ restart:
1303 * in all transactions. 1291 * in all transactions.
1304 */ 1292 */
1305 if (committed) 1293 if (committed)
1306 xfs_trans_ijoin(args->trans, dp); 1294 xfs_trans_ijoin(args->trans, dp, 0);
1307 1295
1308 /* 1296 /*
1309 * Commit the node conversion and start the next 1297 * Commit the node conversion and start the next
@@ -1340,7 +1328,7 @@ restart:
1340 * a new one. We need the inode to be in all transactions. 1328 * a new one. We need the inode to be in all transactions.
1341 */ 1329 */
1342 if (committed) 1330 if (committed)
1343 xfs_trans_ijoin(args->trans, dp); 1331 xfs_trans_ijoin(args->trans, dp, 0);
1344 } else { 1332 } else {
1345 /* 1333 /*
1346 * Addition succeeded, update Btree hashvals. 1334 * Addition succeeded, update Btree hashvals.
@@ -1452,7 +1440,7 @@ restart:
1452 * in all transactions. 1440 * in all transactions.
1453 */ 1441 */
1454 if (committed) 1442 if (committed)
1455 xfs_trans_ijoin(args->trans, dp); 1443 xfs_trans_ijoin(args->trans, dp, 0);
1456 } 1444 }
1457 1445
1458 /* 1446 /*
@@ -1584,7 +1572,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1584 * a new one. We need the inode to be in all transactions. 1572 * a new one. We need the inode to be in all transactions.
1585 */ 1573 */
1586 if (committed) 1574 if (committed)
1587 xfs_trans_ijoin(args->trans, dp); 1575 xfs_trans_ijoin(args->trans, dp, 0);
1588 1576
1589 /* 1577 /*
1590 * Commit the Btree join operation and start a new trans. 1578 * Commit the Btree join operation and start a new trans.
@@ -1635,7 +1623,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1635 * in all transactions. 1623 * in all transactions.
1636 */ 1624 */
1637 if (committed) 1625 if (committed)
1638 xfs_trans_ijoin(args->trans, dp); 1626 xfs_trans_ijoin(args->trans, dp, 0);
1639 } else 1627 } else
1640 xfs_da_brelse(args->trans, bp); 1628 xfs_da_brelse(args->trans, bp);
1641 } 1629 }
@@ -1975,10 +1963,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
1975 lblkno = args->rmtblkno; 1963 lblkno = args->rmtblkno;
1976 while (valuelen > 0) { 1964 while (valuelen > 0) {
1977 nmap = ATTR_RMTVALUE_MAPSIZE; 1965 nmap = ATTR_RMTVALUE_MAPSIZE;
1978 error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno, 1966 error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
1979 args->rmtblkcnt, 1967 args->rmtblkcnt, map, &nmap,
1980 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, 1968 XFS_BMAPI_ATTRFORK);
1981 NULL, 0, map, &nmap, NULL);
1982 if (error) 1969 if (error)
1983 return(error); 1970 return(error);
1984 ASSERT(nmap >= 1); 1971 ASSERT(nmap >= 1);
@@ -2052,10 +2039,9 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2052 */ 2039 */
2053 xfs_bmap_init(args->flist, args->firstblock); 2040 xfs_bmap_init(args->flist, args->firstblock);
2054 nmap = 1; 2041 nmap = 1;
2055 error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno, 2042 error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
2056 blkcnt, 2043 blkcnt,
2057 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA | 2044 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
2058 XFS_BMAPI_WRITE,
2059 args->firstblock, args->total, &map, &nmap, 2045 args->firstblock, args->total, &map, &nmap,
2060 args->flist); 2046 args->flist);
2061 if (!error) { 2047 if (!error) {
@@ -2074,7 +2060,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2074 * a new one. We need the inode to be in all transactions. 2060 * a new one. We need the inode to be in all transactions.
2075 */ 2061 */
2076 if (committed) 2062 if (committed)
2077 xfs_trans_ijoin(args->trans, dp); 2063 xfs_trans_ijoin(args->trans, dp, 0);
2078 2064
2079 ASSERT(nmap == 1); 2065 ASSERT(nmap == 1);
2080 ASSERT((map.br_startblock != DELAYSTARTBLOCK) && 2066 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
@@ -2104,14 +2090,11 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2104 */ 2090 */
2105 xfs_bmap_init(args->flist, args->firstblock); 2091 xfs_bmap_init(args->flist, args->firstblock);
2106 nmap = 1; 2092 nmap = 1;
2107 error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno, 2093 error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
2108 args->rmtblkcnt, 2094 args->rmtblkcnt, &map, &nmap,
2109 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, 2095 XFS_BMAPI_ATTRFORK);
2110 args->firstblock, 0, &map, &nmap, 2096 if (error)
2111 NULL);
2112 if (error) {
2113 return(error); 2097 return(error);
2114 }
2115 ASSERT(nmap == 1); 2098 ASSERT(nmap == 1);
2116 ASSERT((map.br_startblock != DELAYSTARTBLOCK) && 2099 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
2117 (map.br_startblock != HOLESTARTBLOCK)); 2100 (map.br_startblock != HOLESTARTBLOCK));
@@ -2121,16 +2104,17 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2121 2104
2122 bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 2105 bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt,
2123 XBF_LOCK | XBF_DONT_BLOCK); 2106 XBF_LOCK | XBF_DONT_BLOCK);
2124 ASSERT(!xfs_buf_geterror(bp)); 2107 if (!bp)
2125 2108 return ENOMEM;
2126 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : 2109 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
2127 XFS_BUF_SIZE(bp); 2110 XFS_BUF_SIZE(bp);
2128 xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); 2111 xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
2129 if (tmp < XFS_BUF_SIZE(bp)) 2112 if (tmp < XFS_BUF_SIZE(bp))
2130 xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); 2113 xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
2131 if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ 2114 error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
2132 return (error); 2115 xfs_buf_relse(bp);
2133 } 2116 if (error)
2117 return error;
2134 src += tmp; 2118 src += tmp;
2135 valuelen -= tmp; 2119 valuelen -= tmp;
2136 2120
@@ -2166,16 +2150,12 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
2166 /* 2150 /*
2167 * Try to remember where we decided to put the value. 2151 * Try to remember where we decided to put the value.
2168 */ 2152 */
2169 xfs_bmap_init(args->flist, args->firstblock);
2170 nmap = 1; 2153 nmap = 1;
2171 error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno, 2154 error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
2172 args->rmtblkcnt, 2155 args->rmtblkcnt, &map, &nmap,
2173 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, 2156 XFS_BMAPI_ATTRFORK);
2174 args->firstblock, 0, &map, &nmap, 2157 if (error)
2175 args->flist);
2176 if (error) {
2177 return(error); 2158 return(error);
2178 }
2179 ASSERT(nmap == 1); 2159 ASSERT(nmap == 1);
2180 ASSERT((map.br_startblock != DELAYSTARTBLOCK) && 2160 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
2181 (map.br_startblock != HOLESTARTBLOCK)); 2161 (map.br_startblock != HOLESTARTBLOCK));
@@ -2188,8 +2168,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
2188 */ 2168 */
2189 bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); 2169 bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK);
2190 if (bp) { 2170 if (bp) {
2191 XFS_BUF_STALE(bp); 2171 xfs_buf_stale(bp);
2192 XFS_BUF_UNDELAYWRITE(bp);
2193 xfs_buf_relse(bp); 2172 xfs_buf_relse(bp);
2194 bp = NULL; 2173 bp = NULL;
2195 } 2174 }
@@ -2227,7 +2206,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
2227 * a new one. We need the inode to be in all transactions. 2206 * a new one. We need the inode to be in all transactions.
2228 */ 2207 */
2229 if (committed) 2208 if (committed)
2230 xfs_trans_ijoin(args->trans, args->dp); 2209 xfs_trans_ijoin(args->trans, args->dp, 0);
2231 2210
2232 /* 2211 /*
2233 * Close out trans and start the next one in the chain. 2212 * Close out trans and start the next one in the chain.
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 8fad9602542..d4906e7c978 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -2926,9 +2926,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
2926 * Try to remember where we decided to put the value. 2926 * Try to remember where we decided to put the value.
2927 */ 2927 */
2928 nmap = 1; 2928 nmap = 1;
2929 error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt, 2929 error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
2930 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, 2930 &map, &nmap, XFS_BMAPI_ATTRFORK);
2931 NULL, 0, &map, &nmap, NULL);
2932 if (error) { 2931 if (error) {
2933 return(error); 2932 return(error);
2934 } 2933 }
@@ -2948,6 +2947,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
2948 bp = xfs_trans_get_buf(*trans, 2947 bp = xfs_trans_get_buf(*trans,
2949 dp->i_mount->m_ddev_targp, 2948 dp->i_mount->m_ddev_targp,
2950 dblkno, dblkcnt, XBF_LOCK); 2949 dblkno, dblkcnt, XBF_LOCK);
2950 if (!bp)
2951 return ENOMEM;
2951 xfs_trans_binval(*trans, bp); 2952 xfs_trans_binval(*trans, bp);
2952 /* 2953 /*
2953 * Roll to next transaction. 2954 * Roll to next transaction.
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 452a291383a..c68baeb0974 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -50,17 +50,22 @@
50#include "xfs_trace.h" 50#include "xfs_trace.h"
51 51
52 52
53#ifdef DEBUG
54STATIC void
55xfs_bmap_check_leaf_extents(xfs_btree_cur_t *cur, xfs_inode_t *ip, int whichfork);
56#endif
57
58kmem_zone_t *xfs_bmap_free_item_zone; 53kmem_zone_t *xfs_bmap_free_item_zone;
59 54
60/* 55/*
61 * Prototypes for internal bmap routines. 56 * Prototypes for internal bmap routines.
62 */ 57 */
63 58
59#ifdef DEBUG
60STATIC void
61xfs_bmap_check_leaf_extents(
62 struct xfs_btree_cur *cur,
63 struct xfs_inode *ip,
64 int whichfork);
65#else
66#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0)
67#endif
68
64 69
65/* 70/*
66 * Called from xfs_bmap_add_attrfork to handle extents format files. 71 * Called from xfs_bmap_add_attrfork to handle extents format files.
@@ -85,58 +90,6 @@ xfs_bmap_add_attrfork_local(
85 int *flags); /* inode logging flags */ 90 int *flags); /* inode logging flags */
86 91
87/* 92/*
88 * Called by xfs_bmap_add_extent to handle cases converting a delayed
89 * allocation to a real allocation.
90 */
91STATIC int /* error */
92xfs_bmap_add_extent_delay_real(
93 struct xfs_trans *tp, /* transaction pointer */
94 xfs_inode_t *ip, /* incore inode pointer */
95 xfs_extnum_t *idx, /* extent number to update/insert */
96 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
97 xfs_bmbt_irec_t *new, /* new data to add to file extents */
98 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
99 xfs_fsblock_t *first, /* pointer to firstblock variable */
100 xfs_bmap_free_t *flist, /* list of extents to be freed */
101 int *logflagsp); /* inode logging flags */
102
103/*
104 * Called by xfs_bmap_add_extent to handle cases converting a hole
105 * to a delayed allocation.
106 */
107STATIC int /* error */
108xfs_bmap_add_extent_hole_delay(
109 xfs_inode_t *ip, /* incore inode pointer */
110 xfs_extnum_t *idx, /* extent number to update/insert */
111 xfs_bmbt_irec_t *new, /* new data to add to file extents */
112 int *logflagsp); /* inode logging flags */
113
114/*
115 * Called by xfs_bmap_add_extent to handle cases converting a hole
116 * to a real allocation.
117 */
118STATIC int /* error */
119xfs_bmap_add_extent_hole_real(
120 xfs_inode_t *ip, /* incore inode pointer */
121 xfs_extnum_t *idx, /* extent number to update/insert */
122 xfs_btree_cur_t *cur, /* if null, not a btree */
123 xfs_bmbt_irec_t *new, /* new data to add to file extents */
124 int *logflagsp, /* inode logging flags */
125 int whichfork); /* data or attr fork */
126
127/*
128 * Called by xfs_bmap_add_extent to handle cases converting an unwritten
129 * allocation to a real allocation or vice versa.
130 */
131STATIC int /* error */
132xfs_bmap_add_extent_unwritten_real(
133 xfs_inode_t *ip, /* incore inode pointer */
134 xfs_extnum_t *idx, /* extent number to update/insert */
135 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
136 xfs_bmbt_irec_t *new, /* new data to add to file extents */
137 int *logflagsp); /* inode logging flags */
138
139/*
140 * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. 93 * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
141 * It figures out where to ask the underlying allocator to put the new extent. 94 * It figures out where to ask the underlying allocator to put the new extent.
142 */ 95 */
@@ -215,19 +168,6 @@ xfs_bmap_search_extents(
215 xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */ 168 xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */
216 169
217/* 170/*
218 * Check the last inode extent to determine whether this allocation will result
219 * in blocks being allocated at the end of the file. When we allocate new data
220 * blocks at the end of the file which do not start at the previous data block,
221 * we will try to align the new blocks at stripe unit boundaries.
222 */
223STATIC int /* error */
224xfs_bmap_isaeof(
225 xfs_inode_t *ip, /* incore inode pointer */
226 xfs_fileoff_t off, /* file offset in fsblocks */
227 int whichfork, /* data or attribute fork */
228 char *aeof); /* return value */
229
230/*
231 * Compute the worst-case number of indirect blocks that will be used 171 * Compute the worst-case number of indirect blocks that will be used
232 * for ip's delayed extent of length "len". 172 * for ip's delayed extent of length "len".
233 */ 173 */
@@ -431,188 +371,13 @@ xfs_bmap_add_attrfork_local(
431} 371}
432 372
433/* 373/*
434 * Called by xfs_bmapi to update file extent records and the btree 374 * Convert a delayed allocation to a real allocation.
435 * after allocating space (or doing a delayed allocation).
436 */
437STATIC int /* error */
438xfs_bmap_add_extent(
439 struct xfs_trans *tp, /* transaction pointer */
440 xfs_inode_t *ip, /* incore inode pointer */
441 xfs_extnum_t *idx, /* extent number to update/insert */
442 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
443 xfs_bmbt_irec_t *new, /* new data to add to file extents */
444 xfs_fsblock_t *first, /* pointer to firstblock variable */
445 xfs_bmap_free_t *flist, /* list of extents to be freed */
446 int *logflagsp, /* inode logging flags */
447 int whichfork) /* data or attr fork */
448{
449 xfs_btree_cur_t *cur; /* btree cursor or null */
450 xfs_filblks_t da_new; /* new count del alloc blocks used */
451 xfs_filblks_t da_old; /* old count del alloc blocks used */
452 int error; /* error return value */
453 xfs_ifork_t *ifp; /* inode fork ptr */
454 int logflags; /* returned value */
455 xfs_extnum_t nextents; /* number of extents in file now */
456
457 XFS_STATS_INC(xs_add_exlist);
458
459 cur = *curp;
460 ifp = XFS_IFORK_PTR(ip, whichfork);
461 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
462 da_old = da_new = 0;
463 error = 0;
464
465 ASSERT(*idx >= 0);
466 ASSERT(*idx <= nextents);
467
468 /*
469 * This is the first extent added to a new/empty file.
470 * Special case this one, so other routines get to assume there are
471 * already extents in the list.
472 */
473 if (nextents == 0) {
474 xfs_iext_insert(ip, *idx, 1, new,
475 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
476
477 ASSERT(cur == NULL);
478
479 if (!isnullstartblock(new->br_startblock)) {
480 XFS_IFORK_NEXT_SET(ip, whichfork, 1);
481 logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
482 } else
483 logflags = 0;
484 }
485 /*
486 * Any kind of new delayed allocation goes here.
487 */
488 else if (isnullstartblock(new->br_startblock)) {
489 if (cur)
490 ASSERT((cur->bc_private.b.flags &
491 XFS_BTCUR_BPRV_WASDEL) == 0);
492 error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
493 &logflags);
494 }
495 /*
496 * Real allocation off the end of the file.
497 */
498 else if (*idx == nextents) {
499 if (cur)
500 ASSERT((cur->bc_private.b.flags &
501 XFS_BTCUR_BPRV_WASDEL) == 0);
502 error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
503 &logflags, whichfork);
504 } else {
505 xfs_bmbt_irec_t prev; /* old extent at offset idx */
506
507 /*
508 * Get the record referred to by idx.
509 */
510 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev);
511 /*
512 * If it's a real allocation record, and the new allocation ends
513 * after the start of the referred to record, then we're filling
514 * in a delayed or unwritten allocation with a real one, or
515 * converting real back to unwritten.
516 */
517 if (!isnullstartblock(new->br_startblock) &&
518 new->br_startoff + new->br_blockcount > prev.br_startoff) {
519 if (prev.br_state != XFS_EXT_UNWRITTEN &&
520 isnullstartblock(prev.br_startblock)) {
521 da_old = startblockval(prev.br_startblock);
522 if (cur)
523 ASSERT(cur->bc_private.b.flags &
524 XFS_BTCUR_BPRV_WASDEL);
525 error = xfs_bmap_add_extent_delay_real(tp, ip,
526 idx, &cur, new, &da_new,
527 first, flist, &logflags);
528 } else {
529 ASSERT(new->br_state == XFS_EXT_NORM ||
530 new->br_state == XFS_EXT_UNWRITTEN);
531
532 error = xfs_bmap_add_extent_unwritten_real(ip,
533 idx, &cur, new, &logflags);
534 if (error)
535 goto done;
536 }
537 }
538 /*
539 * Otherwise we're filling in a hole with an allocation.
540 */
541 else {
542 if (cur)
543 ASSERT((cur->bc_private.b.flags &
544 XFS_BTCUR_BPRV_WASDEL) == 0);
545 error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
546 new, &logflags, whichfork);
547 }
548 }
549
550 if (error)
551 goto done;
552 ASSERT(*curp == cur || *curp == NULL);
553
554 /*
555 * Convert to a btree if necessary.
556 */
557 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
558 XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
559 int tmp_logflags; /* partial log flag return val */
560
561 ASSERT(cur == NULL);
562 error = xfs_bmap_extents_to_btree(tp, ip, first,
563 flist, &cur, da_old > 0, &tmp_logflags, whichfork);
564 logflags |= tmp_logflags;
565 if (error)
566 goto done;
567 }
568 /*
569 * Adjust for changes in reserved delayed indirect blocks.
570 * Nothing to do for disk quotas here.
571 */
572 if (da_old || da_new) {
573 xfs_filblks_t nblks;
574
575 nblks = da_new;
576 if (cur)
577 nblks += cur->bc_private.b.allocated;
578 ASSERT(nblks <= da_old);
579 if (nblks < da_old)
580 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
581 (int64_t)(da_old - nblks), 0);
582 }
583 /*
584 * Clear out the allocated field, done with it now in any case.
585 */
586 if (cur) {
587 cur->bc_private.b.allocated = 0;
588 *curp = cur;
589 }
590done:
591#ifdef DEBUG
592 if (!error)
593 xfs_bmap_check_leaf_extents(*curp, ip, whichfork);
594#endif
595 *logflagsp = logflags;
596 return error;
597}
598
599/*
600 * Called by xfs_bmap_add_extent to handle cases converting a delayed
601 * allocation to a real allocation.
602 */ 375 */
603STATIC int /* error */ 376STATIC int /* error */
604xfs_bmap_add_extent_delay_real( 377xfs_bmap_add_extent_delay_real(
605 struct xfs_trans *tp, /* transaction pointer */ 378 struct xfs_bmalloca *bma)
606 xfs_inode_t *ip, /* incore inode pointer */
607 xfs_extnum_t *idx, /* extent number to update/insert */
608 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
609 xfs_bmbt_irec_t *new, /* new data to add to file extents */
610 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
611 xfs_fsblock_t *first, /* pointer to firstblock variable */
612 xfs_bmap_free_t *flist, /* list of extents to be freed */
613 int *logflagsp) /* inode logging flags */
614{ 379{
615 xfs_btree_cur_t *cur; /* btree cursor */ 380 struct xfs_bmbt_irec *new = &bma->got;
616 int diff; /* temp value */ 381 int diff; /* temp value */
617 xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ 382 xfs_bmbt_rec_host_t *ep; /* extent entry for idx */
618 int error; /* error return value */ 383 int error; /* error return value */
@@ -623,10 +388,22 @@ xfs_bmap_add_extent_delay_real(
623 /* left is 0, right is 1, prev is 2 */ 388 /* left is 0, right is 1, prev is 2 */
624 int rval=0; /* return value (logging flags) */ 389 int rval=0; /* return value (logging flags) */
625 int state = 0;/* state bits, accessed thru macros */ 390 int state = 0;/* state bits, accessed thru macros */
626 xfs_filblks_t temp=0; /* value for dnew calculations */ 391 xfs_filblks_t da_new; /* new count del alloc blocks used */
627 xfs_filblks_t temp2=0;/* value for dnew calculations */ 392 xfs_filblks_t da_old; /* old count del alloc blocks used */
393 xfs_filblks_t temp=0; /* value for da_new calculations */
394 xfs_filblks_t temp2=0;/* value for da_new calculations */
628 int tmp_rval; /* partial logging flags */ 395 int tmp_rval; /* partial logging flags */
629 396
397 ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK);
398
399 ASSERT(bma->idx >= 0);
400 ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
401 ASSERT(!isnullstartblock(new->br_startblock));
402 ASSERT(!bma->cur ||
403 (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
404
405 XFS_STATS_INC(xs_add_exlist);
406
630#define LEFT r[0] 407#define LEFT r[0]
631#define RIGHT r[1] 408#define RIGHT r[1]
632#define PREV r[2] 409#define PREV r[2]
@@ -634,14 +411,15 @@ xfs_bmap_add_extent_delay_real(
634 /* 411 /*
635 * Set up a bunch of variables to make the tests simpler. 412 * Set up a bunch of variables to make the tests simpler.
636 */ 413 */
637 cur = *curp; 414 ep = xfs_iext_get_ext(ifp, bma->idx);
638 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
639 ep = xfs_iext_get_ext(ifp, *idx);
640 xfs_bmbt_get_all(ep, &PREV); 415 xfs_bmbt_get_all(ep, &PREV);
641 new_endoff = new->br_startoff + new->br_blockcount; 416 new_endoff = new->br_startoff + new->br_blockcount;
642 ASSERT(PREV.br_startoff <= new->br_startoff); 417 ASSERT(PREV.br_startoff <= new->br_startoff);
643 ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); 418 ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
644 419
420 da_old = startblockval(PREV.br_startblock);
421 da_new = 0;
422
645 /* 423 /*
646 * Set flags determining what part of the previous delayed allocation 424 * Set flags determining what part of the previous delayed allocation
647 * extent is being replaced by a real allocation. 425 * extent is being replaced by a real allocation.
@@ -655,9 +433,9 @@ xfs_bmap_add_extent_delay_real(
655 * Check and set flags if this segment has a left neighbor. 433 * Check and set flags if this segment has a left neighbor.
656 * Don't set contiguous if the combined extent would be too large. 434 * Don't set contiguous if the combined extent would be too large.
657 */ 435 */
658 if (*idx > 0) { 436 if (bma->idx > 0) {
659 state |= BMAP_LEFT_VALID; 437 state |= BMAP_LEFT_VALID;
660 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); 438 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT);
661 439
662 if (isnullstartblock(LEFT.br_startblock)) 440 if (isnullstartblock(LEFT.br_startblock))
663 state |= BMAP_LEFT_DELAY; 441 state |= BMAP_LEFT_DELAY;
@@ -675,9 +453,9 @@ xfs_bmap_add_extent_delay_real(
675 * Don't set contiguous if the combined extent would be too large. 453 * Don't set contiguous if the combined extent would be too large.
676 * Also check for all-three-contiguous being too large. 454 * Also check for all-three-contiguous being too large.
677 */ 455 */
678 if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { 456 if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
679 state |= BMAP_RIGHT_VALID; 457 state |= BMAP_RIGHT_VALID;
680 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); 458 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT);
681 459
682 if (isnullstartblock(RIGHT.br_startblock)) 460 if (isnullstartblock(RIGHT.br_startblock))
683 state |= BMAP_RIGHT_DELAY; 461 state |= BMAP_RIGHT_DELAY;
@@ -708,38 +486,41 @@ xfs_bmap_add_extent_delay_real(
708 * Filling in all of a previously delayed allocation extent. 486 * Filling in all of a previously delayed allocation extent.
709 * The left and right neighbors are both contiguous with new. 487 * The left and right neighbors are both contiguous with new.
710 */ 488 */
711 --*idx; 489 bma->idx--;
712 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); 490 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
713 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), 491 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
714 LEFT.br_blockcount + PREV.br_blockcount + 492 LEFT.br_blockcount + PREV.br_blockcount +
715 RIGHT.br_blockcount); 493 RIGHT.br_blockcount);
716 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); 494 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
717 495
718 xfs_iext_remove(ip, *idx + 1, 2, state); 496 xfs_iext_remove(bma->ip, bma->idx + 1, 2, state);
719 ip->i_d.di_nextents--; 497 bma->ip->i_d.di_nextents--;
720 if (cur == NULL) 498 if (bma->cur == NULL)
721 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 499 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
722 else { 500 else {
723 rval = XFS_ILOG_CORE; 501 rval = XFS_ILOG_CORE;
724 if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, 502 error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
725 RIGHT.br_startblock, 503 RIGHT.br_startblock,
726 RIGHT.br_blockcount, &i))) 504 RIGHT.br_blockcount, &i);
505 if (error)
727 goto done; 506 goto done;
728 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 507 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
729 if ((error = xfs_btree_delete(cur, &i))) 508 error = xfs_btree_delete(bma->cur, &i);
509 if (error)
730 goto done; 510 goto done;
731 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 511 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
732 if ((error = xfs_btree_decrement(cur, 0, &i))) 512 error = xfs_btree_decrement(bma->cur, 0, &i);
513 if (error)
733 goto done; 514 goto done;
734 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 515 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
735 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, 516 error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
736 LEFT.br_startblock, 517 LEFT.br_startblock,
737 LEFT.br_blockcount + 518 LEFT.br_blockcount +
738 PREV.br_blockcount + 519 PREV.br_blockcount +
739 RIGHT.br_blockcount, LEFT.br_state))) 520 RIGHT.br_blockcount, LEFT.br_state);
521 if (error)
740 goto done; 522 goto done;
741 } 523 }
742 *dnew = 0;
743 break; 524 break;
744 525
745 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: 526 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
@@ -747,30 +528,31 @@ xfs_bmap_add_extent_delay_real(
747 * Filling in all of a previously delayed allocation extent. 528 * Filling in all of a previously delayed allocation extent.
748 * The left neighbor is contiguous, the right is not. 529 * The left neighbor is contiguous, the right is not.
749 */ 530 */
750 --*idx; 531 bma->idx--;
751 532
752 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); 533 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
753 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), 534 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
754 LEFT.br_blockcount + PREV.br_blockcount); 535 LEFT.br_blockcount + PREV.br_blockcount);
755 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); 536 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
756 537
757 xfs_iext_remove(ip, *idx + 1, 1, state); 538 xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
758 if (cur == NULL) 539 if (bma->cur == NULL)
759 rval = XFS_ILOG_DEXT; 540 rval = XFS_ILOG_DEXT;
760 else { 541 else {
761 rval = 0; 542 rval = 0;
762 if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, 543 error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
763 LEFT.br_startblock, LEFT.br_blockcount, 544 LEFT.br_startblock, LEFT.br_blockcount,
764 &i))) 545 &i);
546 if (error)
765 goto done; 547 goto done;
766 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 548 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
767 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, 549 error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
768 LEFT.br_startblock, 550 LEFT.br_startblock,
769 LEFT.br_blockcount + 551 LEFT.br_blockcount +
770 PREV.br_blockcount, LEFT.br_state))) 552 PREV.br_blockcount, LEFT.br_state);
553 if (error)
771 goto done; 554 goto done;
772 } 555 }
773 *dnew = 0;
774 break; 556 break;
775 557
776 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: 558 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -778,30 +560,30 @@ xfs_bmap_add_extent_delay_real(
778 * Filling in all of a previously delayed allocation extent. 560 * Filling in all of a previously delayed allocation extent.
779 * The right neighbor is contiguous, the left is not. 561 * The right neighbor is contiguous, the left is not.
780 */ 562 */
781 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); 563 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
782 xfs_bmbt_set_startblock(ep, new->br_startblock); 564 xfs_bmbt_set_startblock(ep, new->br_startblock);
783 xfs_bmbt_set_blockcount(ep, 565 xfs_bmbt_set_blockcount(ep,
784 PREV.br_blockcount + RIGHT.br_blockcount); 566 PREV.br_blockcount + RIGHT.br_blockcount);
785 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); 567 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
786 568
787 xfs_iext_remove(ip, *idx + 1, 1, state); 569 xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
788 if (cur == NULL) 570 if (bma->cur == NULL)
789 rval = XFS_ILOG_DEXT; 571 rval = XFS_ILOG_DEXT;
790 else { 572 else {
791 rval = 0; 573 rval = 0;
792 if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, 574 error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
793 RIGHT.br_startblock, 575 RIGHT.br_startblock,
794 RIGHT.br_blockcount, &i))) 576 RIGHT.br_blockcount, &i);
577 if (error)
795 goto done; 578 goto done;
796 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 579 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
797 if ((error = xfs_bmbt_update(cur, PREV.br_startoff, 580 error = xfs_bmbt_update(bma->cur, PREV.br_startoff,
798 new->br_startblock, 581 new->br_startblock,
799 PREV.br_blockcount + 582 PREV.br_blockcount +
800 RIGHT.br_blockcount, PREV.br_state))) 583 RIGHT.br_blockcount, PREV.br_state);
584 if (error)
801 goto done; 585 goto done;
802 } 586 }
803
804 *dnew = 0;
805 break; 587 break;
806 588
807 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: 589 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
@@ -810,27 +592,27 @@ xfs_bmap_add_extent_delay_real(
810 * Neither the left nor right neighbors are contiguous with 592 * Neither the left nor right neighbors are contiguous with
811 * the new one. 593 * the new one.
812 */ 594 */
813 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); 595 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
814 xfs_bmbt_set_startblock(ep, new->br_startblock); 596 xfs_bmbt_set_startblock(ep, new->br_startblock);
815 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); 597 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
816 598
817 ip->i_d.di_nextents++; 599 bma->ip->i_d.di_nextents++;
818 if (cur == NULL) 600 if (bma->cur == NULL)
819 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 601 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
820 else { 602 else {
821 rval = XFS_ILOG_CORE; 603 rval = XFS_ILOG_CORE;
822 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, 604 error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
823 new->br_startblock, new->br_blockcount, 605 new->br_startblock, new->br_blockcount,
824 &i))) 606 &i);
607 if (error)
825 goto done; 608 goto done;
826 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 609 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
827 cur->bc_rec.b.br_state = XFS_EXT_NORM; 610 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
828 if ((error = xfs_btree_insert(cur, &i))) 611 error = xfs_btree_insert(bma->cur, &i);
612 if (error)
829 goto done; 613 goto done;
830 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 614 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
831 } 615 }
832
833 *dnew = 0;
834 break; 616 break;
835 617
836 case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: 618 case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
@@ -838,39 +620,40 @@ xfs_bmap_add_extent_delay_real(
838 * Filling in the first part of a previous delayed allocation. 620 * Filling in the first part of a previous delayed allocation.
839 * The left neighbor is contiguous. 621 * The left neighbor is contiguous.
840 */ 622 */
841 trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); 623 trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
842 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), 624 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1),
843 LEFT.br_blockcount + new->br_blockcount); 625 LEFT.br_blockcount + new->br_blockcount);
844 xfs_bmbt_set_startoff(ep, 626 xfs_bmbt_set_startoff(ep,
845 PREV.br_startoff + new->br_blockcount); 627 PREV.br_startoff + new->br_blockcount);
846 trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); 628 trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
847 629
848 temp = PREV.br_blockcount - new->br_blockcount; 630 temp = PREV.br_blockcount - new->br_blockcount;
849 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); 631 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
850 xfs_bmbt_set_blockcount(ep, temp); 632 xfs_bmbt_set_blockcount(ep, temp);
851 if (cur == NULL) 633 if (bma->cur == NULL)
852 rval = XFS_ILOG_DEXT; 634 rval = XFS_ILOG_DEXT;
853 else { 635 else {
854 rval = 0; 636 rval = 0;
855 if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, 637 error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
856 LEFT.br_startblock, LEFT.br_blockcount, 638 LEFT.br_startblock, LEFT.br_blockcount,
857 &i))) 639 &i);
640 if (error)
858 goto done; 641 goto done;
859 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 642 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
860 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, 643 error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
861 LEFT.br_startblock, 644 LEFT.br_startblock,
862 LEFT.br_blockcount + 645 LEFT.br_blockcount +
863 new->br_blockcount, 646 new->br_blockcount,
864 LEFT.br_state))) 647 LEFT.br_state);
648 if (error)
865 goto done; 649 goto done;
866 } 650 }
867 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 651 da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
868 startblockval(PREV.br_startblock)); 652 startblockval(PREV.br_startblock));
869 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 653 xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
870 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); 654 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
871 655
872 --*idx; 656 bma->idx--;
873 *dnew = temp;
874 break; 657 break;
875 658
876 case BMAP_LEFT_FILLING: 659 case BMAP_LEFT_FILLING:
@@ -878,43 +661,43 @@ xfs_bmap_add_extent_delay_real(
878 * Filling in the first part of a previous delayed allocation. 661 * Filling in the first part of a previous delayed allocation.
879 * The left neighbor is not contiguous. 662 * The left neighbor is not contiguous.
880 */ 663 */
881 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); 664 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
882 xfs_bmbt_set_startoff(ep, new_endoff); 665 xfs_bmbt_set_startoff(ep, new_endoff);
883 temp = PREV.br_blockcount - new->br_blockcount; 666 temp = PREV.br_blockcount - new->br_blockcount;
884 xfs_bmbt_set_blockcount(ep, temp); 667 xfs_bmbt_set_blockcount(ep, temp);
885 xfs_iext_insert(ip, *idx, 1, new, state); 668 xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
886 ip->i_d.di_nextents++; 669 bma->ip->i_d.di_nextents++;
887 if (cur == NULL) 670 if (bma->cur == NULL)
888 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 671 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
889 else { 672 else {
890 rval = XFS_ILOG_CORE; 673 rval = XFS_ILOG_CORE;
891 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, 674 error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
892 new->br_startblock, new->br_blockcount, 675 new->br_startblock, new->br_blockcount,
893 &i))) 676 &i);
677 if (error)
894 goto done; 678 goto done;
895 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 679 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
896 cur->bc_rec.b.br_state = XFS_EXT_NORM; 680 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
897 if ((error = xfs_btree_insert(cur, &i))) 681 error = xfs_btree_insert(bma->cur, &i);
682 if (error)
898 goto done; 683 goto done;
899 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 684 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
900 } 685 }
901 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 686 if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
902 ip->i_d.di_nextents > ip->i_df.if_ext_max) { 687 bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
903 error = xfs_bmap_extents_to_btree(tp, ip, 688 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
904 first, flist, &cur, 1, &tmp_rval, 689 bma->firstblock, bma->flist,
905 XFS_DATA_FORK); 690 &bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
906 rval |= tmp_rval; 691 rval |= tmp_rval;
907 if (error) 692 if (error)
908 goto done; 693 goto done;
909 } 694 }
910 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 695 da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
911 startblockval(PREV.br_startblock) - 696 startblockval(PREV.br_startblock) -
912 (cur ? cur->bc_private.b.allocated : 0)); 697 (bma->cur ? bma->cur->bc_private.b.allocated : 0));
913 ep = xfs_iext_get_ext(ifp, *idx + 1); 698 ep = xfs_iext_get_ext(ifp, bma->idx + 1);
914 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 699 xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
915 trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); 700 trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
916
917 *dnew = temp;
918 break; 701 break;
919 702
920 case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: 703 case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -923,38 +706,39 @@ xfs_bmap_add_extent_delay_real(
923 * The right neighbor is contiguous with the new allocation. 706 * The right neighbor is contiguous with the new allocation.
924 */ 707 */
925 temp = PREV.br_blockcount - new->br_blockcount; 708 temp = PREV.br_blockcount - new->br_blockcount;
926 trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_); 709 trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
927 xfs_bmbt_set_blockcount(ep, temp); 710 xfs_bmbt_set_blockcount(ep, temp);
928 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1), 711 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1),
929 new->br_startoff, new->br_startblock, 712 new->br_startoff, new->br_startblock,
930 new->br_blockcount + RIGHT.br_blockcount, 713 new->br_blockcount + RIGHT.br_blockcount,
931 RIGHT.br_state); 714 RIGHT.br_state);
932 trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); 715 trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
933 if (cur == NULL) 716 if (bma->cur == NULL)
934 rval = XFS_ILOG_DEXT; 717 rval = XFS_ILOG_DEXT;
935 else { 718 else {
936 rval = 0; 719 rval = 0;
937 if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, 720 error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
938 RIGHT.br_startblock, 721 RIGHT.br_startblock,
939 RIGHT.br_blockcount, &i))) 722 RIGHT.br_blockcount, &i);
723 if (error)
940 goto done; 724 goto done;
941 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 725 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
942 if ((error = xfs_bmbt_update(cur, new->br_startoff, 726 error = xfs_bmbt_update(bma->cur, new->br_startoff,
943 new->br_startblock, 727 new->br_startblock,
944 new->br_blockcount + 728 new->br_blockcount +
945 RIGHT.br_blockcount, 729 RIGHT.br_blockcount,
946 RIGHT.br_state))) 730 RIGHT.br_state);
731 if (error)
947 goto done; 732 goto done;
948 } 733 }
949 734
950 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 735 da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
951 startblockval(PREV.br_startblock)); 736 startblockval(PREV.br_startblock));
952 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); 737 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
953 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 738 xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
954 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); 739 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
955 740
956 ++*idx; 741 bma->idx++;
957 *dnew = temp;
958 break; 742 break;
959 743
960 case BMAP_RIGHT_FILLING: 744 case BMAP_RIGHT_FILLING:
@@ -963,42 +747,43 @@ xfs_bmap_add_extent_delay_real(
963 * The right neighbor is not contiguous. 747 * The right neighbor is not contiguous.
964 */ 748 */
965 temp = PREV.br_blockcount - new->br_blockcount; 749 temp = PREV.br_blockcount - new->br_blockcount;
966 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); 750 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
967 xfs_bmbt_set_blockcount(ep, temp); 751 xfs_bmbt_set_blockcount(ep, temp);
968 xfs_iext_insert(ip, *idx + 1, 1, new, state); 752 xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state);
969 ip->i_d.di_nextents++; 753 bma->ip->i_d.di_nextents++;
970 if (cur == NULL) 754 if (bma->cur == NULL)
971 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 755 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
972 else { 756 else {
973 rval = XFS_ILOG_CORE; 757 rval = XFS_ILOG_CORE;
974 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, 758 error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
975 new->br_startblock, new->br_blockcount, 759 new->br_startblock, new->br_blockcount,
976 &i))) 760 &i);
761 if (error)
977 goto done; 762 goto done;
978 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 763 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
979 cur->bc_rec.b.br_state = XFS_EXT_NORM; 764 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
980 if ((error = xfs_btree_insert(cur, &i))) 765 error = xfs_btree_insert(bma->cur, &i);
766 if (error)
981 goto done; 767 goto done;
982 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 768 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
983 } 769 }
984 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 770 if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
985 ip->i_d.di_nextents > ip->i_df.if_ext_max) { 771 bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
986 error = xfs_bmap_extents_to_btree(tp, ip, 772 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
987 first, flist, &cur, 1, &tmp_rval, 773 bma->firstblock, bma->flist, &bma->cur, 1,
988 XFS_DATA_FORK); 774 &tmp_rval, XFS_DATA_FORK);
989 rval |= tmp_rval; 775 rval |= tmp_rval;
990 if (error) 776 if (error)
991 goto done; 777 goto done;
992 } 778 }
993 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 779 da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
994 startblockval(PREV.br_startblock) - 780 startblockval(PREV.br_startblock) -
995 (cur ? cur->bc_private.b.allocated : 0)); 781 (bma->cur ? bma->cur->bc_private.b.allocated : 0));
996 ep = xfs_iext_get_ext(ifp, *idx); 782 ep = xfs_iext_get_ext(ifp, bma->idx);
997 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 783 xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
998 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); 784 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
999 785
1000 ++*idx; 786 bma->idx++;
1001 *dnew = temp;
1002 break; 787 break;
1003 788
1004 case 0: 789 case 0:
@@ -1024,82 +809,65 @@ xfs_bmap_add_extent_delay_real(
1024 */ 809 */
1025 temp = new->br_startoff - PREV.br_startoff; 810 temp = new->br_startoff - PREV.br_startoff;
1026 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; 811 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
1027 trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_); 812 trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_);
1028 xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ 813 xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
1029 LEFT = *new; 814 LEFT = *new;
1030 RIGHT.br_state = PREV.br_state; 815 RIGHT.br_state = PREV.br_state;
1031 RIGHT.br_startblock = nullstartblock( 816 RIGHT.br_startblock = nullstartblock(
1032 (int)xfs_bmap_worst_indlen(ip, temp2)); 817 (int)xfs_bmap_worst_indlen(bma->ip, temp2));
1033 RIGHT.br_startoff = new_endoff; 818 RIGHT.br_startoff = new_endoff;
1034 RIGHT.br_blockcount = temp2; 819 RIGHT.br_blockcount = temp2;
1035 /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ 820 /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
1036 xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state); 821 xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state);
1037 ip->i_d.di_nextents++; 822 bma->ip->i_d.di_nextents++;
1038 if (cur == NULL) 823 if (bma->cur == NULL)
1039 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 824 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
1040 else { 825 else {
1041 rval = XFS_ILOG_CORE; 826 rval = XFS_ILOG_CORE;
1042 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, 827 error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
1043 new->br_startblock, new->br_blockcount, 828 new->br_startblock, new->br_blockcount,
1044 &i))) 829 &i);
830 if (error)
1045 goto done; 831 goto done;
1046 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 832 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
1047 cur->bc_rec.b.br_state = XFS_EXT_NORM; 833 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
1048 if ((error = xfs_btree_insert(cur, &i))) 834 error = xfs_btree_insert(bma->cur, &i);
835 if (error)
1049 goto done; 836 goto done;
1050 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 837 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1051 } 838 }
1052 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 839 if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1053 ip->i_d.di_nextents > ip->i_df.if_ext_max) { 840 bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
1054 error = xfs_bmap_extents_to_btree(tp, ip, 841 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
1055 first, flist, &cur, 1, &tmp_rval, 842 bma->firstblock, bma->flist, &bma->cur,
1056 XFS_DATA_FORK); 843 1, &tmp_rval, XFS_DATA_FORK);
1057 rval |= tmp_rval; 844 rval |= tmp_rval;
1058 if (error) 845 if (error)
1059 goto done; 846 goto done;
1060 } 847 }
1061 temp = xfs_bmap_worst_indlen(ip, temp); 848 temp = xfs_bmap_worst_indlen(bma->ip, temp);
1062 temp2 = xfs_bmap_worst_indlen(ip, temp2); 849 temp2 = xfs_bmap_worst_indlen(bma->ip, temp2);
1063 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - 850 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
1064 (cur ? cur->bc_private.b.allocated : 0)); 851 (bma->cur ? bma->cur->bc_private.b.allocated : 0));
1065 if (diff > 0 && 852 if (diff > 0) {
1066 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, 853 error = xfs_icsb_modify_counters(bma->ip->i_mount,
1067 -((int64_t)diff), 0)) { 854 XFS_SBS_FDBLOCKS,
1068 /* 855 -((int64_t)diff), 0);
1069 * Ick gross gag me with a spoon. 856 ASSERT(!error);
1070 */ 857 if (error)
1071 ASSERT(0); /* want to see if this ever happens! */ 858 goto done;
1072 while (diff > 0) {
1073 if (temp) {
1074 temp--;
1075 diff--;
1076 if (!diff ||
1077 !xfs_icsb_modify_counters(ip->i_mount,
1078 XFS_SBS_FDBLOCKS,
1079 -((int64_t)diff), 0))
1080 break;
1081 }
1082 if (temp2) {
1083 temp2--;
1084 diff--;
1085 if (!diff ||
1086 !xfs_icsb_modify_counters(ip->i_mount,
1087 XFS_SBS_FDBLOCKS,
1088 -((int64_t)diff), 0))
1089 break;
1090 }
1091 }
1092 } 859 }
1093 ep = xfs_iext_get_ext(ifp, *idx); 860
861 ep = xfs_iext_get_ext(ifp, bma->idx);
1094 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 862 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
1095 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); 863 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
1096 trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_); 864 trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
1097 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2), 865 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2),
1098 nullstartblock((int)temp2)); 866 nullstartblock((int)temp2));
1099 trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_); 867 trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
1100 868
1101 ++*idx; 869 bma->idx++;
1102 *dnew = temp + temp2; 870 da_new = temp + temp2;
1103 break; 871 break;
1104 872
1105 case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: 873 case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
@@ -1114,9 +882,40 @@ xfs_bmap_add_extent_delay_real(
1114 */ 882 */
1115 ASSERT(0); 883 ASSERT(0);
1116 } 884 }
1117 *curp = cur; 885
886 /* convert to a btree if necessary */
887 if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
888 XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) {
889 int tmp_logflags; /* partial log flag return val */
890
891 ASSERT(bma->cur == NULL);
892 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
893 bma->firstblock, bma->flist, &bma->cur,
894 da_old > 0, &tmp_logflags, XFS_DATA_FORK);
895 bma->logflags |= tmp_logflags;
896 if (error)
897 goto done;
898 }
899
900 /* adjust for changes in reserved delayed indirect blocks */
901 if (da_old || da_new) {
902 temp = da_new;
903 if (bma->cur)
904 temp += bma->cur->bc_private.b.allocated;
905 ASSERT(temp <= da_old);
906 if (temp < da_old)
907 xfs_icsb_modify_counters(bma->ip->i_mount,
908 XFS_SBS_FDBLOCKS,
909 (int64_t)(da_old - temp), 0);
910 }
911
912 /* clear out the allocated field, done with it now in any case. */
913 if (bma->cur)
914 bma->cur->bc_private.b.allocated = 0;
915
916 xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK);
1118done: 917done:
1119 *logflagsp = rval; 918 bma->logflags |= rval;
1120 return error; 919 return error;
1121#undef LEFT 920#undef LEFT
1122#undef RIGHT 921#undef RIGHT
@@ -1124,15 +923,17 @@ done:
1124} 923}
1125 924
1126/* 925/*
1127 * Called by xfs_bmap_add_extent to handle cases converting an unwritten 926 * Convert an unwritten allocation to a real allocation or vice versa.
1128 * allocation to a real allocation or vice versa.
1129 */ 927 */
1130STATIC int /* error */ 928STATIC int /* error */
1131xfs_bmap_add_extent_unwritten_real( 929xfs_bmap_add_extent_unwritten_real(
930 struct xfs_trans *tp,
1132 xfs_inode_t *ip, /* incore inode pointer */ 931 xfs_inode_t *ip, /* incore inode pointer */
1133 xfs_extnum_t *idx, /* extent number to update/insert */ 932 xfs_extnum_t *idx, /* extent number to update/insert */
1134 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 933 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
1135 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 934 xfs_bmbt_irec_t *new, /* new data to add to file extents */
935 xfs_fsblock_t *first, /* pointer to firstblock variable */
936 xfs_bmap_free_t *flist, /* list of extents to be freed */
1136 int *logflagsp) /* inode logging flags */ 937 int *logflagsp) /* inode logging flags */
1137{ 938{
1138 xfs_btree_cur_t *cur; /* btree cursor */ 939 xfs_btree_cur_t *cur; /* btree cursor */
@@ -1148,15 +949,25 @@ xfs_bmap_add_extent_unwritten_real(
1148 int rval=0; /* return value (logging flags) */ 949 int rval=0; /* return value (logging flags) */
1149 int state = 0;/* state bits, accessed thru macros */ 950 int state = 0;/* state bits, accessed thru macros */
1150 951
952 *logflagsp = 0;
953
954 cur = *curp;
955 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
956
957 ASSERT(*idx >= 0);
958 ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
959 ASSERT(!isnullstartblock(new->br_startblock));
960
961 XFS_STATS_INC(xs_add_exlist);
962
1151#define LEFT r[0] 963#define LEFT r[0]
1152#define RIGHT r[1] 964#define RIGHT r[1]
1153#define PREV r[2] 965#define PREV r[2]
966
1154 /* 967 /*
1155 * Set up a bunch of variables to make the tests simpler. 968 * Set up a bunch of variables to make the tests simpler.
1156 */ 969 */
1157 error = 0; 970 error = 0;
1158 cur = *curp;
1159 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
1160 ep = xfs_iext_get_ext(ifp, *idx); 971 ep = xfs_iext_get_ext(ifp, *idx);
1161 xfs_bmbt_get_all(ep, &PREV); 972 xfs_bmbt_get_all(ep, &PREV);
1162 newext = new->br_state; 973 newext = new->br_state;
@@ -1406,10 +1217,11 @@ xfs_bmap_add_extent_unwritten_real(
1406 goto done; 1217 goto done;
1407 if ((error = xfs_btree_decrement(cur, 0, &i))) 1218 if ((error = xfs_btree_decrement(cur, 0, &i)))
1408 goto done; 1219 goto done;
1409 if (xfs_bmbt_update(cur, LEFT.br_startoff, 1220 error = xfs_bmbt_update(cur, LEFT.br_startoff,
1410 LEFT.br_startblock, 1221 LEFT.br_startblock,
1411 LEFT.br_blockcount + new->br_blockcount, 1222 LEFT.br_blockcount + new->br_blockcount,
1412 LEFT.br_state)) 1223 LEFT.br_state);
1224 if (error)
1413 goto done; 1225 goto done;
1414 } 1226 }
1415 break; 1227 break;
@@ -1607,9 +1419,29 @@ xfs_bmap_add_extent_unwritten_real(
1607 */ 1419 */
1608 ASSERT(0); 1420 ASSERT(0);
1609 } 1421 }
1610 *curp = cur; 1422
1423 /* convert to a btree if necessary */
1424 if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
1425 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) {
1426 int tmp_logflags; /* partial log flag return val */
1427
1428 ASSERT(cur == NULL);
1429 error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur,
1430 0, &tmp_logflags, XFS_DATA_FORK);
1431 *logflagsp |= tmp_logflags;
1432 if (error)
1433 goto done;
1434 }
1435
1436 /* clear out the allocated field, done with it now in any case. */
1437 if (cur) {
1438 cur->bc_private.b.allocated = 0;
1439 *curp = cur;
1440 }
1441
1442 xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK);
1611done: 1443done:
1612 *logflagsp = rval; 1444 *logflagsp |= rval;
1613 return error; 1445 return error;
1614#undef LEFT 1446#undef LEFT
1615#undef RIGHT 1447#undef RIGHT
@@ -1617,16 +1449,13 @@ done:
1617} 1449}
1618 1450
1619/* 1451/*
1620 * Called by xfs_bmap_add_extent to handle cases converting a hole 1452 * Convert a hole to a delayed allocation.
1621 * to a delayed allocation.
1622 */ 1453 */
1623/*ARGSUSED*/ 1454STATIC void
1624STATIC int /* error */
1625xfs_bmap_add_extent_hole_delay( 1455xfs_bmap_add_extent_hole_delay(
1626 xfs_inode_t *ip, /* incore inode pointer */ 1456 xfs_inode_t *ip, /* incore inode pointer */
1627 xfs_extnum_t *idx, /* extent number to update/insert */ 1457 xfs_extnum_t *idx, /* extent number to update/insert */
1628 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 1458 xfs_bmbt_irec_t *new) /* new data to add to file extents */
1629 int *logflagsp) /* inode logging flags */
1630{ 1459{
1631 xfs_ifork_t *ifp; /* inode fork pointer */ 1460 xfs_ifork_t *ifp; /* inode fork pointer */
1632 xfs_bmbt_irec_t left; /* left neighbor extent entry */ 1461 xfs_bmbt_irec_t left; /* left neighbor extent entry */
@@ -1761,23 +1590,17 @@ xfs_bmap_add_extent_hole_delay(
1761 * Nothing to do for disk quota accounting here. 1590 * Nothing to do for disk quota accounting here.
1762 */ 1591 */
1763 } 1592 }
1764 *logflagsp = 0;
1765 return 0;
1766} 1593}
1767 1594
1768/* 1595/*
1769 * Called by xfs_bmap_add_extent to handle cases converting a hole 1596 * Convert a hole to a real allocation.
1770 * to a real allocation.
1771 */ 1597 */
1772STATIC int /* error */ 1598STATIC int /* error */
1773xfs_bmap_add_extent_hole_real( 1599xfs_bmap_add_extent_hole_real(
1774 xfs_inode_t *ip, /* incore inode pointer */ 1600 struct xfs_bmalloca *bma,
1775 xfs_extnum_t *idx, /* extent number to update/insert */ 1601 int whichfork)
1776 xfs_btree_cur_t *cur, /* if null, not a btree */
1777 xfs_bmbt_irec_t *new, /* new data to add to file extents */
1778 int *logflagsp, /* inode logging flags */
1779 int whichfork) /* data or attr fork */
1780{ 1602{
1603 struct xfs_bmbt_irec *new = &bma->got;
1781 int error; /* error return value */ 1604 int error; /* error return value */
1782 int i; /* temp state */ 1605 int i; /* temp state */
1783 xfs_ifork_t *ifp; /* inode fork pointer */ 1606 xfs_ifork_t *ifp; /* inode fork pointer */
@@ -1786,19 +1609,26 @@ xfs_bmap_add_extent_hole_real(
1786 int rval=0; /* return value (logging flags) */ 1609 int rval=0; /* return value (logging flags) */
1787 int state; /* state bits, accessed thru macros */ 1610 int state; /* state bits, accessed thru macros */
1788 1611
1789 ifp = XFS_IFORK_PTR(ip, whichfork); 1612 ifp = XFS_IFORK_PTR(bma->ip, whichfork);
1790 ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); 1613
1791 state = 0; 1614 ASSERT(bma->idx >= 0);
1615 ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
1616 ASSERT(!isnullstartblock(new->br_startblock));
1617 ASSERT(!bma->cur ||
1618 !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
1619
1620 XFS_STATS_INC(xs_add_exlist);
1792 1621
1622 state = 0;
1793 if (whichfork == XFS_ATTR_FORK) 1623 if (whichfork == XFS_ATTR_FORK)
1794 state |= BMAP_ATTRFORK; 1624 state |= BMAP_ATTRFORK;
1795 1625
1796 /* 1626 /*
1797 * Check and set flags if this segment has a left neighbor. 1627 * Check and set flags if this segment has a left neighbor.
1798 */ 1628 */
1799 if (*idx > 0) { 1629 if (bma->idx > 0) {
1800 state |= BMAP_LEFT_VALID; 1630 state |= BMAP_LEFT_VALID;
1801 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); 1631 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left);
1802 if (isnullstartblock(left.br_startblock)) 1632 if (isnullstartblock(left.br_startblock))
1803 state |= BMAP_LEFT_DELAY; 1633 state |= BMAP_LEFT_DELAY;
1804 } 1634 }
@@ -1807,9 +1637,9 @@ xfs_bmap_add_extent_hole_real(
1807 * Check and set flags if this segment has a current value. 1637 * Check and set flags if this segment has a current value.
1808 * Not true if we're inserting into the "hole" at eof. 1638 * Not true if we're inserting into the "hole" at eof.
1809 */ 1639 */
1810 if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { 1640 if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
1811 state |= BMAP_RIGHT_VALID; 1641 state |= BMAP_RIGHT_VALID;
1812 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); 1642 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right);
1813 if (isnullstartblock(right.br_startblock)) 1643 if (isnullstartblock(right.br_startblock))
1814 state |= BMAP_RIGHT_DELAY; 1644 state |= BMAP_RIGHT_DELAY;
1815 } 1645 }
@@ -1846,39 +1676,42 @@ xfs_bmap_add_extent_hole_real(
1846 * left and on the right. 1676 * left and on the right.
1847 * Merge all three into a single extent record. 1677 * Merge all three into a single extent record.
1848 */ 1678 */
1849 --*idx; 1679 --bma->idx;
1850 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); 1680 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
1851 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), 1681 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
1852 left.br_blockcount + new->br_blockcount + 1682 left.br_blockcount + new->br_blockcount +
1853 right.br_blockcount); 1683 right.br_blockcount);
1854 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); 1684 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
1855 1685
1856 xfs_iext_remove(ip, *idx + 1, 1, state); 1686 xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
1857 1687
1858 XFS_IFORK_NEXT_SET(ip, whichfork, 1688 XFS_IFORK_NEXT_SET(bma->ip, whichfork,
1859 XFS_IFORK_NEXTENTS(ip, whichfork) - 1); 1689 XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1);
1860 if (cur == NULL) { 1690 if (bma->cur == NULL) {
1861 rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); 1691 rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
1862 } else { 1692 } else {
1863 rval = XFS_ILOG_CORE; 1693 rval = XFS_ILOG_CORE;
1864 if ((error = xfs_bmbt_lookup_eq(cur, 1694 error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff,
1865 right.br_startoff, 1695 right.br_startblock, right.br_blockcount,
1866 right.br_startblock, 1696 &i);
1867 right.br_blockcount, &i))) 1697 if (error)
1868 goto done; 1698 goto done;
1869 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1699 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1870 if ((error = xfs_btree_delete(cur, &i))) 1700 error = xfs_btree_delete(bma->cur, &i);
1701 if (error)
1871 goto done; 1702 goto done;
1872 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1703 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1873 if ((error = xfs_btree_decrement(cur, 0, &i))) 1704 error = xfs_btree_decrement(bma->cur, 0, &i);
1705 if (error)
1874 goto done; 1706 goto done;
1875 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1707 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1876 if ((error = xfs_bmbt_update(cur, left.br_startoff, 1708 error = xfs_bmbt_update(bma->cur, left.br_startoff,
1877 left.br_startblock, 1709 left.br_startblock,
1878 left.br_blockcount + 1710 left.br_blockcount +
1879 new->br_blockcount + 1711 new->br_blockcount +
1880 right.br_blockcount, 1712 right.br_blockcount,
1881 left.br_state))) 1713 left.br_state);
1714 if (error)
1882 goto done; 1715 goto done;
1883 } 1716 }
1884 break; 1717 break;
@@ -1889,27 +1722,28 @@ xfs_bmap_add_extent_hole_real(
1889 * on the left. 1722 * on the left.
1890 * Merge the new allocation with the left neighbor. 1723 * Merge the new allocation with the left neighbor.
1891 */ 1724 */
1892 --*idx; 1725 --bma->idx;
1893 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); 1726 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
1894 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), 1727 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
1895 left.br_blockcount + new->br_blockcount); 1728 left.br_blockcount + new->br_blockcount);
1896 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); 1729 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
1897 1730
1898 if (cur == NULL) { 1731 if (bma->cur == NULL) {
1899 rval = xfs_ilog_fext(whichfork); 1732 rval = xfs_ilog_fext(whichfork);
1900 } else { 1733 } else {
1901 rval = 0; 1734 rval = 0;
1902 if ((error = xfs_bmbt_lookup_eq(cur, 1735 error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff,
1903 left.br_startoff, 1736 left.br_startblock, left.br_blockcount,
1904 left.br_startblock, 1737 &i);
1905 left.br_blockcount, &i))) 1738 if (error)
1906 goto done; 1739 goto done;
1907 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1740 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1908 if ((error = xfs_bmbt_update(cur, left.br_startoff, 1741 error = xfs_bmbt_update(bma->cur, left.br_startoff,
1909 left.br_startblock, 1742 left.br_startblock,
1910 left.br_blockcount + 1743 left.br_blockcount +
1911 new->br_blockcount, 1744 new->br_blockcount,
1912 left.br_state))) 1745 left.br_state);
1746 if (error)
1913 goto done; 1747 goto done;
1914 } 1748 }
1915 break; 1749 break;
@@ -1920,28 +1754,30 @@ xfs_bmap_add_extent_hole_real(
1920 * on the right. 1754 * on the right.
1921 * Merge the new allocation with the right neighbor. 1755 * Merge the new allocation with the right neighbor.
1922 */ 1756 */
1923 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); 1757 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
1924 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), 1758 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx),
1925 new->br_startoff, new->br_startblock, 1759 new->br_startoff, new->br_startblock,
1926 new->br_blockcount + right.br_blockcount, 1760 new->br_blockcount + right.br_blockcount,
1927 right.br_state); 1761 right.br_state);
1928 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); 1762 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
1929 1763
1930 if (cur == NULL) { 1764 if (bma->cur == NULL) {
1931 rval = xfs_ilog_fext(whichfork); 1765 rval = xfs_ilog_fext(whichfork);
1932 } else { 1766 } else {
1933 rval = 0; 1767 rval = 0;
1934 if ((error = xfs_bmbt_lookup_eq(cur, 1768 error = xfs_bmbt_lookup_eq(bma->cur,
1935 right.br_startoff, 1769 right.br_startoff,
1936 right.br_startblock, 1770 right.br_startblock,
1937 right.br_blockcount, &i))) 1771 right.br_blockcount, &i);
1772 if (error)
1938 goto done; 1773 goto done;
1939 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1774 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1940 if ((error = xfs_bmbt_update(cur, new->br_startoff, 1775 error = xfs_bmbt_update(bma->cur, new->br_startoff,
1941 new->br_startblock, 1776 new->br_startblock,
1942 new->br_blockcount + 1777 new->br_blockcount +
1943 right.br_blockcount, 1778 right.br_blockcount,
1944 right.br_state))) 1779 right.br_state);
1780 if (error)
1945 goto done; 1781 goto done;
1946 } 1782 }
1947 break; 1783 break;
@@ -1952,28 +1788,50 @@ xfs_bmap_add_extent_hole_real(
1952 * real allocation. 1788 * real allocation.
1953 * Insert a new entry. 1789 * Insert a new entry.
1954 */ 1790 */
1955 xfs_iext_insert(ip, *idx, 1, new, state); 1791 xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
1956 XFS_IFORK_NEXT_SET(ip, whichfork, 1792 XFS_IFORK_NEXT_SET(bma->ip, whichfork,
1957 XFS_IFORK_NEXTENTS(ip, whichfork) + 1); 1793 XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1);
1958 if (cur == NULL) { 1794 if (bma->cur == NULL) {
1959 rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); 1795 rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
1960 } else { 1796 } else {
1961 rval = XFS_ILOG_CORE; 1797 rval = XFS_ILOG_CORE;
1962 if ((error = xfs_bmbt_lookup_eq(cur, 1798 error = xfs_bmbt_lookup_eq(bma->cur,
1963 new->br_startoff, 1799 new->br_startoff,
1964 new->br_startblock, 1800 new->br_startblock,
1965 new->br_blockcount, &i))) 1801 new->br_blockcount, &i);
1802 if (error)
1966 goto done; 1803 goto done;
1967 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 1804 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
1968 cur->bc_rec.b.br_state = new->br_state; 1805 bma->cur->bc_rec.b.br_state = new->br_state;
1969 if ((error = xfs_btree_insert(cur, &i))) 1806 error = xfs_btree_insert(bma->cur, &i);
1807 if (error)
1970 goto done; 1808 goto done;
1971 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1809 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1972 } 1810 }
1973 break; 1811 break;
1974 } 1812 }
1813
1814 /* convert to a btree if necessary */
1815 if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
1816 XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) {
1817 int tmp_logflags; /* partial log flag return val */
1818
1819 ASSERT(bma->cur == NULL);
1820 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
1821 bma->firstblock, bma->flist, &bma->cur,
1822 0, &tmp_logflags, whichfork);
1823 bma->logflags |= tmp_logflags;
1824 if (error)
1825 goto done;
1826 }
1827
1828 /* clear out the allocated field, done with it now in any case. */
1829 if (bma->cur)
1830 bma->cur->bc_private.b.allocated = 0;
1831
1832 xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
1975done: 1833done:
1976 *logflagsp = rval; 1834 bma->logflags |= rval;
1977 return error; 1835 return error;
1978} 1836}
1979 1837
@@ -2160,26 +2018,26 @@ xfs_bmap_adjacent(
2160 XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks) 2018 XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks)
2161 2019
2162 mp = ap->ip->i_mount; 2020 mp = ap->ip->i_mount;
2163 nullfb = ap->firstblock == NULLFSBLOCK; 2021 nullfb = *ap->firstblock == NULLFSBLOCK;
2164 rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; 2022 rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
2165 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); 2023 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
2166 /* 2024 /*
2167 * If allocating at eof, and there's a previous real block, 2025 * If allocating at eof, and there's a previous real block,
2168 * try to use its last block as our starting point. 2026 * try to use its last block as our starting point.
2169 */ 2027 */
2170 if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF && 2028 if (ap->eof && ap->prev.br_startoff != NULLFILEOFF &&
2171 !isnullstartblock(ap->prevp->br_startblock) && 2029 !isnullstartblock(ap->prev.br_startblock) &&
2172 ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount, 2030 ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount,
2173 ap->prevp->br_startblock)) { 2031 ap->prev.br_startblock)) {
2174 ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount; 2032 ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount;
2175 /* 2033 /*
2176 * Adjust for the gap between prevp and us. 2034 * Adjust for the gap between prevp and us.
2177 */ 2035 */
2178 adjust = ap->off - 2036 adjust = ap->offset -
2179 (ap->prevp->br_startoff + ap->prevp->br_blockcount); 2037 (ap->prev.br_startoff + ap->prev.br_blockcount);
2180 if (adjust && 2038 if (adjust &&
2181 ISVALID(ap->rval + adjust, ap->prevp->br_startblock)) 2039 ISVALID(ap->blkno + adjust, ap->prev.br_startblock))
2182 ap->rval += adjust; 2040 ap->blkno += adjust;
2183 } 2041 }
2184 /* 2042 /*
2185 * If not at eof, then compare the two neighbor blocks. 2043 * If not at eof, then compare the two neighbor blocks.
@@ -2196,17 +2054,17 @@ xfs_bmap_adjacent(
2196 * If there's a previous (left) block, select a requested 2054 * If there's a previous (left) block, select a requested
2197 * start block based on it. 2055 * start block based on it.
2198 */ 2056 */
2199 if (ap->prevp->br_startoff != NULLFILEOFF && 2057 if (ap->prev.br_startoff != NULLFILEOFF &&
2200 !isnullstartblock(ap->prevp->br_startblock) && 2058 !isnullstartblock(ap->prev.br_startblock) &&
2201 (prevbno = ap->prevp->br_startblock + 2059 (prevbno = ap->prev.br_startblock +
2202 ap->prevp->br_blockcount) && 2060 ap->prev.br_blockcount) &&
2203 ISVALID(prevbno, ap->prevp->br_startblock)) { 2061 ISVALID(prevbno, ap->prev.br_startblock)) {
2204 /* 2062 /*
2205 * Calculate gap to end of previous block. 2063 * Calculate gap to end of previous block.
2206 */ 2064 */
2207 adjust = prevdiff = ap->off - 2065 adjust = prevdiff = ap->offset -
2208 (ap->prevp->br_startoff + 2066 (ap->prev.br_startoff +
2209 ap->prevp->br_blockcount); 2067 ap->prev.br_blockcount);
2210 /* 2068 /*
2211 * Figure the startblock based on the previous block's 2069 * Figure the startblock based on the previous block's
2212 * end and the gap size. 2070 * end and the gap size.
@@ -2215,9 +2073,9 @@ xfs_bmap_adjacent(
2215 * allocating, or using it gives us an invalid block 2073 * allocating, or using it gives us an invalid block
2216 * number, then just use the end of the previous block. 2074 * number, then just use the end of the previous block.
2217 */ 2075 */
2218 if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && 2076 if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
2219 ISVALID(prevbno + prevdiff, 2077 ISVALID(prevbno + prevdiff,
2220 ap->prevp->br_startblock)) 2078 ap->prev.br_startblock))
2221 prevbno += adjust; 2079 prevbno += adjust;
2222 else 2080 else
2223 prevdiff += adjust; 2081 prevdiff += adjust;
@@ -2238,16 +2096,16 @@ xfs_bmap_adjacent(
2238 * If there's a following (right) block, select a requested 2096 * If there's a following (right) block, select a requested
2239 * start block based on it. 2097 * start block based on it.
2240 */ 2098 */
2241 if (!isnullstartblock(ap->gotp->br_startblock)) { 2099 if (!isnullstartblock(ap->got.br_startblock)) {
2242 /* 2100 /*
2243 * Calculate gap to start of next block. 2101 * Calculate gap to start of next block.
2244 */ 2102 */
2245 adjust = gotdiff = ap->gotp->br_startoff - ap->off; 2103 adjust = gotdiff = ap->got.br_startoff - ap->offset;
2246 /* 2104 /*
2247 * Figure the startblock based on the next block's 2105 * Figure the startblock based on the next block's
2248 * start and the gap size. 2106 * start and the gap size.
2249 */ 2107 */
2250 gotbno = ap->gotp->br_startblock; 2108 gotbno = ap->got.br_startblock;
2251 /* 2109 /*
2252 * Heuristic! 2110 * Heuristic!
2253 * If the gap is large relative to the piece we're 2111 * If the gap is large relative to the piece we're
@@ -2255,12 +2113,12 @@ xfs_bmap_adjacent(
2255 * number, then just use the start of the next block 2113 * number, then just use the start of the next block
2256 * offset by our length. 2114 * offset by our length.
2257 */ 2115 */
2258 if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && 2116 if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
2259 ISVALID(gotbno - gotdiff, gotbno)) 2117 ISVALID(gotbno - gotdiff, gotbno))
2260 gotbno -= adjust; 2118 gotbno -= adjust;
2261 else if (ISVALID(gotbno - ap->alen, gotbno)) { 2119 else if (ISVALID(gotbno - ap->length, gotbno)) {
2262 gotbno -= ap->alen; 2120 gotbno -= ap->length;
2263 gotdiff += adjust - ap->alen; 2121 gotdiff += adjust - ap->length;
2264 } else 2122 } else
2265 gotdiff += adjust; 2123 gotdiff += adjust;
2266 /* 2124 /*
@@ -2278,14 +2136,14 @@ xfs_bmap_adjacent(
2278 gotbno = NULLFSBLOCK; 2136 gotbno = NULLFSBLOCK;
2279 /* 2137 /*
2280 * If both valid, pick the better one, else the only good 2138 * If both valid, pick the better one, else the only good
2281 * one, else ap->rval is already set (to 0 or the inode block). 2139 * one, else ap->blkno is already set (to 0 or the inode block).
2282 */ 2140 */
2283 if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) 2141 if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
2284 ap->rval = prevdiff <= gotdiff ? prevbno : gotbno; 2142 ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno;
2285 else if (prevbno != NULLFSBLOCK) 2143 else if (prevbno != NULLFSBLOCK)
2286 ap->rval = prevbno; 2144 ap->blkno = prevbno;
2287 else if (gotbno != NULLFSBLOCK) 2145 else if (gotbno != NULLFSBLOCK)
2288 ap->rval = gotbno; 2146 ap->blkno = gotbno;
2289 } 2147 }
2290#undef ISVALID 2148#undef ISVALID
2291} 2149}
@@ -2305,24 +2163,24 @@ xfs_bmap_rtalloc(
2305 mp = ap->ip->i_mount; 2163 mp = ap->ip->i_mount;
2306 align = xfs_get_extsz_hint(ap->ip); 2164 align = xfs_get_extsz_hint(ap->ip);
2307 prod = align / mp->m_sb.sb_rextsize; 2165 prod = align / mp->m_sb.sb_rextsize;
2308 error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, 2166 error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
2309 align, 1, ap->eof, 0, 2167 align, 1, ap->eof, 0,
2310 ap->conv, &ap->off, &ap->alen); 2168 ap->conv, &ap->offset, &ap->length);
2311 if (error) 2169 if (error)
2312 return error; 2170 return error;
2313 ASSERT(ap->alen); 2171 ASSERT(ap->length);
2314 ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0); 2172 ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
2315 2173
2316 /* 2174 /*
2317 * If the offset & length are not perfectly aligned 2175 * If the offset & length are not perfectly aligned
2318 * then kill prod, it will just get us in trouble. 2176 * then kill prod, it will just get us in trouble.
2319 */ 2177 */
2320 if (do_mod(ap->off, align) || ap->alen % align) 2178 if (do_mod(ap->offset, align) || ap->length % align)
2321 prod = 1; 2179 prod = 1;
2322 /* 2180 /*
2323 * Set ralen to be the actual requested length in rtextents. 2181 * Set ralen to be the actual requested length in rtextents.
2324 */ 2182 */
2325 ralen = ap->alen / mp->m_sb.sb_rextsize; 2183 ralen = ap->length / mp->m_sb.sb_rextsize;
2326 /* 2184 /*
2327 * If the old value was close enough to MAXEXTLEN that 2185 * If the old value was close enough to MAXEXTLEN that
2328 * we rounded up to it, cut it back so it's valid again. 2186 * we rounded up to it, cut it back so it's valid again.
@@ -2337,21 +2195,21 @@ xfs_bmap_rtalloc(
2337 * Lock out other modifications to the RT bitmap inode. 2195 * Lock out other modifications to the RT bitmap inode.
2338 */ 2196 */
2339 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); 2197 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
2340 xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); 2198 xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
2341 2199
2342 /* 2200 /*
2343 * If it's an allocation to an empty file at offset 0, 2201 * If it's an allocation to an empty file at offset 0,
2344 * pick an extent that will space things out in the rt area. 2202 * pick an extent that will space things out in the rt area.
2345 */ 2203 */
2346 if (ap->eof && ap->off == 0) { 2204 if (ap->eof && ap->offset == 0) {
2347 xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ 2205 xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
2348 2206
2349 error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); 2207 error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
2350 if (error) 2208 if (error)
2351 return error; 2209 return error;
2352 ap->rval = rtx * mp->m_sb.sb_rextsize; 2210 ap->blkno = rtx * mp->m_sb.sb_rextsize;
2353 } else { 2211 } else {
2354 ap->rval = 0; 2212 ap->blkno = 0;
2355 } 2213 }
2356 2214
2357 xfs_bmap_adjacent(ap); 2215 xfs_bmap_adjacent(ap);
@@ -2359,23 +2217,23 @@ xfs_bmap_rtalloc(
2359 /* 2217 /*
2360 * Realtime allocation, done through xfs_rtallocate_extent. 2218 * Realtime allocation, done through xfs_rtallocate_extent.
2361 */ 2219 */
2362 atype = ap->rval == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; 2220 atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
2363 do_div(ap->rval, mp->m_sb.sb_rextsize); 2221 do_div(ap->blkno, mp->m_sb.sb_rextsize);
2364 rtb = ap->rval; 2222 rtb = ap->blkno;
2365 ap->alen = ralen; 2223 ap->length = ralen;
2366 if ((error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, ap->alen, 2224 if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
2367 &ralen, atype, ap->wasdel, prod, &rtb))) 2225 &ralen, atype, ap->wasdel, prod, &rtb)))
2368 return error; 2226 return error;
2369 if (rtb == NULLFSBLOCK && prod > 1 && 2227 if (rtb == NULLFSBLOCK && prod > 1 &&
2370 (error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, 2228 (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
2371 ap->alen, &ralen, atype, 2229 ap->length, &ralen, atype,
2372 ap->wasdel, 1, &rtb))) 2230 ap->wasdel, 1, &rtb)))
2373 return error; 2231 return error;
2374 ap->rval = rtb; 2232 ap->blkno = rtb;
2375 if (ap->rval != NULLFSBLOCK) { 2233 if (ap->blkno != NULLFSBLOCK) {
2376 ap->rval *= mp->m_sb.sb_rextsize; 2234 ap->blkno *= mp->m_sb.sb_rextsize;
2377 ralen *= mp->m_sb.sb_rextsize; 2235 ralen *= mp->m_sb.sb_rextsize;
2378 ap->alen = ralen; 2236 ap->length = ralen;
2379 ap->ip->i_d.di_nblocks += ralen; 2237 ap->ip->i_d.di_nblocks += ralen;
2380 xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); 2238 xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
2381 if (ap->wasdel) 2239 if (ap->wasdel)
@@ -2388,7 +2246,7 @@ xfs_bmap_rtalloc(
2388 ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : 2246 ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
2389 XFS_TRANS_DQ_RTBCOUNT, (long) ralen); 2247 XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
2390 } else { 2248 } else {
2391 ap->alen = 0; 2249 ap->length = 0;
2392 } 2250 }
2393 return 0; 2251 return 0;
2394} 2252}
@@ -2503,7 +2361,7 @@ xfs_bmap_btalloc_nullfb(
2503 * AG as the stream may have moved. 2361 * AG as the stream may have moved.
2504 */ 2362 */
2505 if (xfs_inode_is_filestream(ap->ip)) 2363 if (xfs_inode_is_filestream(ap->ip))
2506 ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); 2364 ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
2507 2365
2508 return 0; 2366 return 0;
2509} 2367}
@@ -2528,52 +2386,52 @@ xfs_bmap_btalloc(
2528 mp = ap->ip->i_mount; 2386 mp = ap->ip->i_mount;
2529 align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; 2387 align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
2530 if (unlikely(align)) { 2388 if (unlikely(align)) {
2531 error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, 2389 error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
2532 align, 0, ap->eof, 0, ap->conv, 2390 align, 0, ap->eof, 0, ap->conv,
2533 &ap->off, &ap->alen); 2391 &ap->offset, &ap->length);
2534 ASSERT(!error); 2392 ASSERT(!error);
2535 ASSERT(ap->alen); 2393 ASSERT(ap->length);
2536 } 2394 }
2537 nullfb = ap->firstblock == NULLFSBLOCK; 2395 nullfb = *ap->firstblock == NULLFSBLOCK;
2538 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); 2396 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
2539 if (nullfb) { 2397 if (nullfb) {
2540 if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { 2398 if (ap->userdata && xfs_inode_is_filestream(ap->ip)) {
2541 ag = xfs_filestream_lookup_ag(ap->ip); 2399 ag = xfs_filestream_lookup_ag(ap->ip);
2542 ag = (ag != NULLAGNUMBER) ? ag : 0; 2400 ag = (ag != NULLAGNUMBER) ? ag : 0;
2543 ap->rval = XFS_AGB_TO_FSB(mp, ag, 0); 2401 ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
2544 } else { 2402 } else {
2545 ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); 2403 ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
2546 } 2404 }
2547 } else 2405 } else
2548 ap->rval = ap->firstblock; 2406 ap->blkno = *ap->firstblock;
2549 2407
2550 xfs_bmap_adjacent(ap); 2408 xfs_bmap_adjacent(ap);
2551 2409
2552 /* 2410 /*
2553 * If allowed, use ap->rval; otherwise must use firstblock since 2411 * If allowed, use ap->blkno; otherwise must use firstblock since
2554 * it's in the right allocation group. 2412 * it's in the right allocation group.
2555 */ 2413 */
2556 if (nullfb || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno) 2414 if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno)
2557 ; 2415 ;
2558 else 2416 else
2559 ap->rval = ap->firstblock; 2417 ap->blkno = *ap->firstblock;
2560 /* 2418 /*
2561 * Normal allocation, done through xfs_alloc_vextent. 2419 * Normal allocation, done through xfs_alloc_vextent.
2562 */ 2420 */
2563 tryagain = isaligned = 0; 2421 tryagain = isaligned = 0;
2564 args.tp = ap->tp; 2422 args.tp = ap->tp;
2565 args.mp = mp; 2423 args.mp = mp;
2566 args.fsbno = ap->rval; 2424 args.fsbno = ap->blkno;
2567 2425
2568 /* Trim the allocation back to the maximum an AG can fit. */ 2426 /* Trim the allocation back to the maximum an AG can fit. */
2569 args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp)); 2427 args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp));
2570 args.firstblock = ap->firstblock; 2428 args.firstblock = *ap->firstblock;
2571 blen = 0; 2429 blen = 0;
2572 if (nullfb) { 2430 if (nullfb) {
2573 error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); 2431 error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
2574 if (error) 2432 if (error)
2575 return error; 2433 return error;
2576 } else if (ap->low) { 2434 } else if (ap->flist->xbf_low) {
2577 if (xfs_inode_is_filestream(ap->ip)) 2435 if (xfs_inode_is_filestream(ap->ip))
2578 args.type = XFS_ALLOCTYPE_FIRST_AG; 2436 args.type = XFS_ALLOCTYPE_FIRST_AG;
2579 else 2437 else
@@ -2587,14 +2445,14 @@ xfs_bmap_btalloc(
2587 /* apply extent size hints if obtained earlier */ 2445 /* apply extent size hints if obtained earlier */
2588 if (unlikely(align)) { 2446 if (unlikely(align)) {
2589 args.prod = align; 2447 args.prod = align;
2590 if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod))) 2448 if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod)))
2591 args.mod = (xfs_extlen_t)(args.prod - args.mod); 2449 args.mod = (xfs_extlen_t)(args.prod - args.mod);
2592 } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) { 2450 } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) {
2593 args.prod = 1; 2451 args.prod = 1;
2594 args.mod = 0; 2452 args.mod = 0;
2595 } else { 2453 } else {
2596 args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog; 2454 args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog;
2597 if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod)))) 2455 if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod))))
2598 args.mod = (xfs_extlen_t)(args.prod - args.mod); 2456 args.mod = (xfs_extlen_t)(args.prod - args.mod);
2599 } 2457 }
2600 /* 2458 /*
@@ -2606,8 +2464,8 @@ xfs_bmap_btalloc(
2606 * is >= the stripe unit and the allocation offset is 2464 * is >= the stripe unit and the allocation offset is
2607 * at the end of file. 2465 * at the end of file.
2608 */ 2466 */
2609 if (!ap->low && ap->aeof) { 2467 if (!ap->flist->xbf_low && ap->aeof) {
2610 if (!ap->off) { 2468 if (!ap->offset) {
2611 args.alignment = mp->m_dalign; 2469 args.alignment = mp->m_dalign;
2612 atype = args.type; 2470 atype = args.type;
2613 isaligned = 1; 2471 isaligned = 1;
@@ -2660,7 +2518,7 @@ xfs_bmap_btalloc(
2660 * turned on. 2518 * turned on.
2661 */ 2519 */
2662 args.type = atype; 2520 args.type = atype;
2663 args.fsbno = ap->rval; 2521 args.fsbno = ap->blkno;
2664 args.alignment = mp->m_dalign; 2522 args.alignment = mp->m_dalign;
2665 args.minlen = nextminlen; 2523 args.minlen = nextminlen;
2666 args.minalignslop = 0; 2524 args.minalignslop = 0;
@@ -2674,7 +2532,7 @@ xfs_bmap_btalloc(
2674 * try again. 2532 * try again.
2675 */ 2533 */
2676 args.type = atype; 2534 args.type = atype;
2677 args.fsbno = ap->rval; 2535 args.fsbno = ap->blkno;
2678 args.alignment = 0; 2536 args.alignment = 0;
2679 if ((error = xfs_alloc_vextent(&args))) 2537 if ((error = xfs_alloc_vextent(&args)))
2680 return error; 2538 return error;
@@ -2683,7 +2541,7 @@ xfs_bmap_btalloc(
2683 args.minlen > ap->minlen) { 2541 args.minlen > ap->minlen) {
2684 args.minlen = ap->minlen; 2542 args.minlen = ap->minlen;
2685 args.type = XFS_ALLOCTYPE_START_BNO; 2543 args.type = XFS_ALLOCTYPE_START_BNO;
2686 args.fsbno = ap->rval; 2544 args.fsbno = ap->blkno;
2687 if ((error = xfs_alloc_vextent(&args))) 2545 if ((error = xfs_alloc_vextent(&args)))
2688 return error; 2546 return error;
2689 } 2547 }
@@ -2694,13 +2552,26 @@ xfs_bmap_btalloc(
2694 args.minleft = 0; 2552 args.minleft = 0;
2695 if ((error = xfs_alloc_vextent(&args))) 2553 if ((error = xfs_alloc_vextent(&args)))
2696 return error; 2554 return error;
2697 ap->low = 1; 2555 ap->flist->xbf_low = 1;
2698 } 2556 }
2699 if (args.fsbno != NULLFSBLOCK) { 2557 if (args.fsbno != NULLFSBLOCK) {
2700 ap->firstblock = ap->rval = args.fsbno; 2558 /*
2559 * check the allocation happened at the same or higher AG than
2560 * the first block that was allocated.
2561 */
2562 ASSERT(*ap->firstblock == NULLFSBLOCK ||
2563 XFS_FSB_TO_AGNO(mp, *ap->firstblock) ==
2564 XFS_FSB_TO_AGNO(mp, args.fsbno) ||
2565 (ap->flist->xbf_low &&
2566 XFS_FSB_TO_AGNO(mp, *ap->firstblock) <
2567 XFS_FSB_TO_AGNO(mp, args.fsbno)));
2568
2569 ap->blkno = args.fsbno;
2570 if (*ap->firstblock == NULLFSBLOCK)
2571 *ap->firstblock = args.fsbno;
2701 ASSERT(nullfb || fb_agno == args.agno || 2572 ASSERT(nullfb || fb_agno == args.agno ||
2702 (ap->low && fb_agno < args.agno)); 2573 (ap->flist->xbf_low && fb_agno < args.agno));
2703 ap->alen = args.len; 2574 ap->length = args.len;
2704 ap->ip->i_d.di_nblocks += args.len; 2575 ap->ip->i_d.di_nblocks += args.len;
2705 xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); 2576 xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
2706 if (ap->wasdel) 2577 if (ap->wasdel)
@@ -2714,8 +2585,8 @@ xfs_bmap_btalloc(
2714 XFS_TRANS_DQ_BCOUNT, 2585 XFS_TRANS_DQ_BCOUNT,
2715 (long) args.len); 2586 (long) args.len);
2716 } else { 2587 } else {
2717 ap->rval = NULLFSBLOCK; 2588 ap->blkno = NULLFSBLOCK;
2718 ap->alen = 0; 2589 ap->length = 0;
2719 } 2590 }
2720 return 0; 2591 return 0;
2721} 2592}
@@ -3589,7 +3460,7 @@ xfs_bmap_add_attrfork(
3589 } 3460 }
3590 ASSERT(ip->i_d.di_anextents == 0); 3461 ASSERT(ip->i_d.di_anextents == 0);
3591 3462
3592 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); 3463 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3593 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 3464 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3594 3465
3595 switch (ip->i_d.di_format) { 3466 switch (ip->i_d.di_format) {
@@ -3782,19 +3653,11 @@ xfs_bmap_compute_maxlevels(
3782 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi 3653 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
3783 * caller. Frees all the extents that need freeing, which must be done 3654 * caller. Frees all the extents that need freeing, which must be done
3784 * last due to locking considerations. We never free any extents in 3655 * last due to locking considerations. We never free any extents in
3785 * the first transaction. This is to allow the caller to make the first 3656 * the first transaction.
3786 * transaction a synchronous one so that the pointers to the data being
3787 * broken in this transaction will be permanent before the data is actually
3788 * freed. This is necessary to prevent blocks from being reallocated
3789 * and written to before the free and reallocation are actually permanent.
3790 * We do not just make the first transaction synchronous here, because
3791 * there are more efficient ways to gain the same protection in some cases
3792 * (see the file truncation code).
3793 * 3657 *
3794 * Return 1 if the given transaction was committed and a new one 3658 * Return 1 if the given transaction was committed and a new one
3795 * started, and 0 otherwise in the committed parameter. 3659 * started, and 0 otherwise in the committed parameter.
3796 */ 3660 */
3797/*ARGSUSED*/
3798int /* error */ 3661int /* error */
3799xfs_bmap_finish( 3662xfs_bmap_finish(
3800 xfs_trans_t **tp, /* transaction pointer addr */ 3663 xfs_trans_t **tp, /* transaction pointer addr */
@@ -3994,42 +3857,122 @@ xfs_bmap_last_before(
3994 return 0; 3857 return 0;
3995} 3858}
3996 3859
3860STATIC int
3861xfs_bmap_last_extent(
3862 struct xfs_trans *tp,
3863 struct xfs_inode *ip,
3864 int whichfork,
3865 struct xfs_bmbt_irec *rec,
3866 int *is_empty)
3867{
3868 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
3869 int error;
3870 int nextents;
3871
3872 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
3873 error = xfs_iread_extents(tp, ip, whichfork);
3874 if (error)
3875 return error;
3876 }
3877
3878 nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
3879 if (nextents == 0) {
3880 *is_empty = 1;
3881 return 0;
3882 }
3883
3884 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
3885 *is_empty = 0;
3886 return 0;
3887}
3888
3889/*
3890 * Check the last inode extent to determine whether this allocation will result
3891 * in blocks being allocated at the end of the file. When we allocate new data
3892 * blocks at the end of the file which do not start at the previous data block,
3893 * we will try to align the new blocks at stripe unit boundaries.
3894 *
3895 * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be
3896 * at, or past the EOF.
3897 */
3898STATIC int
3899xfs_bmap_isaeof(
3900 struct xfs_bmalloca *bma,
3901 int whichfork)
3902{
3903 struct xfs_bmbt_irec rec;
3904 int is_empty;
3905 int error;
3906
3907 bma->aeof = 0;
3908 error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
3909 &is_empty);
3910 if (error || is_empty)
3911 return error;
3912
3913 /*
3914 * Check if we are allocation or past the last extent, or at least into
3915 * the last delayed allocated extent.
3916 */
3917 bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
3918 (bma->offset >= rec.br_startoff &&
3919 isnullstartblock(rec.br_startblock));
3920 return 0;
3921}
3922
3923/*
3924 * Check if the endoff is outside the last extent. If so the caller will grow
3925 * the allocation to a stripe unit boundary. All offsets are considered outside
3926 * the end of file for an empty fork, so 1 is returned in *eof in that case.
3927 */
3928int
3929xfs_bmap_eof(
3930 struct xfs_inode *ip,
3931 xfs_fileoff_t endoff,
3932 int whichfork,
3933 int *eof)
3934{
3935 struct xfs_bmbt_irec rec;
3936 int error;
3937
3938 error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
3939 if (error || *eof)
3940 return error;
3941
3942 *eof = endoff >= rec.br_startoff + rec.br_blockcount;
3943 return 0;
3944}
3945
3997/* 3946/*
3998 * Returns the file-relative block number of the first block past eof in 3947 * Returns the file-relative block number of the first block past eof in
3999 * the file. This is not based on i_size, it is based on the extent records. 3948 * the file. This is not based on i_size, it is based on the extent records.
4000 * Returns 0 for local files, as they do not have extent records. 3949 * Returns 0 for local files, as they do not have extent records.
4001 */ 3950 */
4002int /* error */ 3951int
4003xfs_bmap_last_offset( 3952xfs_bmap_last_offset(
4004 xfs_trans_t *tp, /* transaction pointer */ 3953 struct xfs_trans *tp,
4005 xfs_inode_t *ip, /* incore inode */ 3954 struct xfs_inode *ip,
4006 xfs_fileoff_t *last_block, /* last block */ 3955 xfs_fileoff_t *last_block,
4007 int whichfork) /* data or attr fork */ 3956 int whichfork)
4008{ 3957{
4009 xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ 3958 struct xfs_bmbt_irec rec;
4010 int error; /* error return value */ 3959 int is_empty;
4011 xfs_ifork_t *ifp; /* inode fork pointer */ 3960 int error;
4012 xfs_extnum_t nextents; /* number of extent entries */ 3961
3962 *last_block = 0;
3963
3964 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
3965 return 0;
4013 3966
4014 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && 3967 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
4015 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && 3968 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
4016 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
4017 return XFS_ERROR(EIO); 3969 return XFS_ERROR(EIO);
4018 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { 3970
4019 *last_block = 0; 3971 error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
4020 return 0; 3972 if (error || is_empty)
4021 }
4022 ifp = XFS_IFORK_PTR(ip, whichfork);
4023 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
4024 (error = xfs_iread_extents(tp, ip, whichfork)))
4025 return error; 3973 return error;
4026 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3974
4027 if (!nextents) { 3975 *last_block = rec.br_startoff + rec.br_blockcount;
4028 *last_block = 0;
4029 return 0;
4030 }
4031 ep = xfs_iext_get_ext(ifp, nextents - 1);
4032 *last_block = xfs_bmbt_get_startoff(ep) + xfs_bmbt_get_blockcount(ep);
4033 return 0; 3976 return 0;
4034} 3977}
4035 3978
@@ -4159,7 +4102,6 @@ xfs_bmap_read_extents(
4159 xfs_extnum_t num_recs; 4102 xfs_extnum_t num_recs;
4160 xfs_extnum_t start; 4103 xfs_extnum_t start;
4161 4104
4162
4163 num_recs = xfs_btree_get_numrecs(block); 4105 num_recs = xfs_btree_get_numrecs(block);
4164 if (unlikely(i + num_recs > room)) { 4106 if (unlikely(i + num_recs > room)) {
4165 ASSERT(i + num_recs <= room); 4107 ASSERT(i + num_recs <= room);
@@ -4282,9 +4224,8 @@ xfs_bmap_validate_ret(
4282 ASSERT(i == 0 || 4224 ASSERT(i == 0 ||
4283 mval[i - 1].br_startoff + mval[i - 1].br_blockcount == 4225 mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
4284 mval[i].br_startoff); 4226 mval[i].br_startoff);
4285 if ((flags & XFS_BMAPI_WRITE) && !(flags & XFS_BMAPI_DELAY)) 4227 ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
4286 ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && 4228 mval[i].br_startblock != HOLESTARTBLOCK);
4287 mval[i].br_startblock != HOLESTARTBLOCK);
4288 ASSERT(mval[i].br_state == XFS_EXT_NORM || 4229 ASSERT(mval[i].br_state == XFS_EXT_NORM ||
4289 mval[i].br_state == XFS_EXT_UNWRITTEN); 4230 mval[i].br_state == XFS_EXT_UNWRITTEN);
4290 } 4231 }
@@ -4293,66 +4234,609 @@ xfs_bmap_validate_ret(
4293 4234
4294 4235
4295/* 4236/*
4296 * Map file blocks to filesystem blocks. 4237 * Trim the returned map to the required bounds
4297 * File range is given by the bno/len pair. 4238 */
4298 * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) 4239STATIC void
4299 * into a hole or past eof. 4240xfs_bmapi_trim_map(
4300 * Only allocates blocks from a single allocation group, 4241 struct xfs_bmbt_irec *mval,
4301 * to avoid locking problems. 4242 struct xfs_bmbt_irec *got,
4243 xfs_fileoff_t *bno,
4244 xfs_filblks_t len,
4245 xfs_fileoff_t obno,
4246 xfs_fileoff_t end,
4247 int n,
4248 int flags)
4249{
4250 if ((flags & XFS_BMAPI_ENTIRE) ||
4251 got->br_startoff + got->br_blockcount <= obno) {
4252 *mval = *got;
4253 if (isnullstartblock(got->br_startblock))
4254 mval->br_startblock = DELAYSTARTBLOCK;
4255 return;
4256 }
4257
4258 if (obno > *bno)
4259 *bno = obno;
4260 ASSERT((*bno >= obno) || (n == 0));
4261 ASSERT(*bno < end);
4262 mval->br_startoff = *bno;
4263 if (isnullstartblock(got->br_startblock))
4264 mval->br_startblock = DELAYSTARTBLOCK;
4265 else
4266 mval->br_startblock = got->br_startblock +
4267 (*bno - got->br_startoff);
4268 /*
4269 * Return the minimum of what we got and what we asked for for
4270 * the length. We can use the len variable here because it is
4271 * modified below and we could have been there before coming
4272 * here if the first part of the allocation didn't overlap what
4273 * was asked for.
4274 */
4275 mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno,
4276 got->br_blockcount - (*bno - got->br_startoff));
4277 mval->br_state = got->br_state;
4278 ASSERT(mval->br_blockcount <= len);
4279 return;
4280}
4281
4282/*
4283 * Update and validate the extent map to return
4284 */
4285STATIC void
4286xfs_bmapi_update_map(
4287 struct xfs_bmbt_irec **map,
4288 xfs_fileoff_t *bno,
4289 xfs_filblks_t *len,
4290 xfs_fileoff_t obno,
4291 xfs_fileoff_t end,
4292 int *n,
4293 int flags)
4294{
4295 xfs_bmbt_irec_t *mval = *map;
4296
4297 ASSERT((flags & XFS_BMAPI_ENTIRE) ||
4298 ((mval->br_startoff + mval->br_blockcount) <= end));
4299 ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) ||
4300 (mval->br_startoff < obno));
4301
4302 *bno = mval->br_startoff + mval->br_blockcount;
4303 *len = end - *bno;
4304 if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) {
4305 /* update previous map with new information */
4306 ASSERT(mval->br_startblock == mval[-1].br_startblock);
4307 ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
4308 ASSERT(mval->br_state == mval[-1].br_state);
4309 mval[-1].br_blockcount = mval->br_blockcount;
4310 mval[-1].br_state = mval->br_state;
4311 } else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
4312 mval[-1].br_startblock != DELAYSTARTBLOCK &&
4313 mval[-1].br_startblock != HOLESTARTBLOCK &&
4314 mval->br_startblock == mval[-1].br_startblock +
4315 mval[-1].br_blockcount &&
4316 ((flags & XFS_BMAPI_IGSTATE) ||
4317 mval[-1].br_state == mval->br_state)) {
4318 ASSERT(mval->br_startoff ==
4319 mval[-1].br_startoff + mval[-1].br_blockcount);
4320 mval[-1].br_blockcount += mval->br_blockcount;
4321 } else if (*n > 0 &&
4322 mval->br_startblock == DELAYSTARTBLOCK &&
4323 mval[-1].br_startblock == DELAYSTARTBLOCK &&
4324 mval->br_startoff ==
4325 mval[-1].br_startoff + mval[-1].br_blockcount) {
4326 mval[-1].br_blockcount += mval->br_blockcount;
4327 mval[-1].br_state = mval->br_state;
4328 } else if (!((*n == 0) &&
4329 ((mval->br_startoff + mval->br_blockcount) <=
4330 obno))) {
4331 mval++;
4332 (*n)++;
4333 }
4334 *map = mval;
4335}
4336
4337/*
4338 * Map file blocks to filesystem blocks without allocation.
4339 */
4340int
4341xfs_bmapi_read(
4342 struct xfs_inode *ip,
4343 xfs_fileoff_t bno,
4344 xfs_filblks_t len,
4345 struct xfs_bmbt_irec *mval,
4346 int *nmap,
4347 int flags)
4348{
4349 struct xfs_mount *mp = ip->i_mount;
4350 struct xfs_ifork *ifp;
4351 struct xfs_bmbt_irec got;
4352 struct xfs_bmbt_irec prev;
4353 xfs_fileoff_t obno;
4354 xfs_fileoff_t end;
4355 xfs_extnum_t lastx;
4356 int error;
4357 int eof;
4358 int n = 0;
4359 int whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4360 XFS_ATTR_FORK : XFS_DATA_FORK;
4361
4362 ASSERT(*nmap >= 1);
4363 ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
4364 XFS_BMAPI_IGSTATE)));
4365
4366 if (unlikely(XFS_TEST_ERROR(
4367 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
4368 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
4369 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
4370 XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp);
4371 return XFS_ERROR(EFSCORRUPTED);
4372 }
4373
4374 if (XFS_FORCED_SHUTDOWN(mp))
4375 return XFS_ERROR(EIO);
4376
4377 XFS_STATS_INC(xs_blk_mapr);
4378
4379 ifp = XFS_IFORK_PTR(ip, whichfork);
4380 ASSERT(ifp->if_ext_max ==
4381 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
4382
4383 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
4384 error = xfs_iread_extents(NULL, ip, whichfork);
4385 if (error)
4386 return error;
4387 }
4388
4389 xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev);
4390 end = bno + len;
4391 obno = bno;
4392
4393 while (bno < end && n < *nmap) {
4394 /* Reading past eof, act as though there's a hole up to end. */
4395 if (eof)
4396 got.br_startoff = end;
4397 if (got.br_startoff > bno) {
4398 /* Reading in a hole. */
4399 mval->br_startoff = bno;
4400 mval->br_startblock = HOLESTARTBLOCK;
4401 mval->br_blockcount =
4402 XFS_FILBLKS_MIN(len, got.br_startoff - bno);
4403 mval->br_state = XFS_EXT_NORM;
4404 bno += mval->br_blockcount;
4405 len -= mval->br_blockcount;
4406 mval++;
4407 n++;
4408 continue;
4409 }
4410
4411 /* set up the extent map to return. */
4412 xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
4413 xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
4414
4415 /* If we're done, stop now. */
4416 if (bno >= end || n >= *nmap)
4417 break;
4418
4419 /* Else go on to the next record. */
4420 if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
4421 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
4422 else
4423 eof = 1;
4424 }
4425 *nmap = n;
4426 return 0;
4427}
4428
4429STATIC int
4430xfs_bmapi_reserve_delalloc(
4431 struct xfs_inode *ip,
4432 xfs_fileoff_t aoff,
4433 xfs_filblks_t len,
4434 struct xfs_bmbt_irec *got,
4435 struct xfs_bmbt_irec *prev,
4436 xfs_extnum_t *lastx,
4437 int eof)
4438{
4439 struct xfs_mount *mp = ip->i_mount;
4440 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
4441 xfs_extlen_t alen;
4442 xfs_extlen_t indlen;
4443 char rt = XFS_IS_REALTIME_INODE(ip);
4444 xfs_extlen_t extsz;
4445 int error;
4446
4447 alen = XFS_FILBLKS_MIN(len, MAXEXTLEN);
4448 if (!eof)
4449 alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
4450
4451 /* Figure out the extent size, adjust alen */
4452 extsz = xfs_get_extsz_hint(ip);
4453 if (extsz) {
4454 /*
4455 * Make sure we don't exceed a single extent length when we
4456 * align the extent by reducing length we are going to
4457 * allocate by the maximum amount extent size aligment may
4458 * require.
4459 */
4460 alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1));
4461 error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,
4462 1, 0, &aoff, &alen);
4463 ASSERT(!error);
4464 }
4465
4466 if (rt)
4467 extsz = alen / mp->m_sb.sb_rextsize;
4468
4469 /*
4470 * Make a transaction-less quota reservation for delayed allocation
4471 * blocks. This number gets adjusted later. We return if we haven't
4472 * allocated blocks already inside this loop.
4473 */
4474 error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0,
4475 rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
4476 if (error)
4477 return error;
4478
4479 /*
4480 * Split changing sb for alen and indlen since they could be coming
4481 * from different places.
4482 */
4483 indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
4484 ASSERT(indlen > 0);
4485
4486 if (rt) {
4487 error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
4488 -((int64_t)extsz), 0);
4489 } else {
4490 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
4491 -((int64_t)alen), 0);
4492 }
4493
4494 if (error)
4495 goto out_unreserve_quota;
4496
4497 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
4498 -((int64_t)indlen), 0);
4499 if (error)
4500 goto out_unreserve_blocks;
4501
4502
4503 ip->i_delayed_blks += alen;
4504
4505 got->br_startoff = aoff;
4506 got->br_startblock = nullstartblock(indlen);
4507 got->br_blockcount = alen;
4508 got->br_state = XFS_EXT_NORM;
4509 xfs_bmap_add_extent_hole_delay(ip, lastx, got);
4510
4511 /*
4512 * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay
4513 * might have merged it into one of the neighbouring ones.
4514 */
4515 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got);
4516
4517 ASSERT(got->br_startoff <= aoff);
4518 ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen);
4519 ASSERT(isnullstartblock(got->br_startblock));
4520 ASSERT(got->br_state == XFS_EXT_NORM);
4521 return 0;
4522
4523out_unreserve_blocks:
4524 if (rt)
4525 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0);
4526 else
4527 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0);
4528out_unreserve_quota:
4529 if (XFS_IS_QUOTA_ON(mp))
4530 xfs_trans_unreserve_quota_nblks(NULL, ip, alen, 0, rt ?
4531 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
4532 return error;
4533}
4534
4535/*
4536 * Map file blocks to filesystem blocks, adding delayed allocations as needed.
4537 */
4538int
4539xfs_bmapi_delay(
4540 struct xfs_inode *ip, /* incore inode */
4541 xfs_fileoff_t bno, /* starting file offs. mapped */
4542 xfs_filblks_t len, /* length to map in file */
4543 struct xfs_bmbt_irec *mval, /* output: map values */
4544 int *nmap, /* i/o: mval size/count */
4545 int flags) /* XFS_BMAPI_... */
4546{
4547 struct xfs_mount *mp = ip->i_mount;
4548 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
4549 struct xfs_bmbt_irec got; /* current file extent record */
4550 struct xfs_bmbt_irec prev; /* previous file extent record */
4551 xfs_fileoff_t obno; /* old block number (offset) */
4552 xfs_fileoff_t end; /* end of mapped file region */
4553 xfs_extnum_t lastx; /* last useful extent number */
4554 int eof; /* we've hit the end of extents */
4555 int n = 0; /* current extent index */
4556 int error = 0;
4557
4558 ASSERT(*nmap >= 1);
4559 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
4560 ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
4561
4562 if (unlikely(XFS_TEST_ERROR(
4563 (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
4564 XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
4565 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
4566 XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
4567 return XFS_ERROR(EFSCORRUPTED);
4568 }
4569
4570 if (XFS_FORCED_SHUTDOWN(mp))
4571 return XFS_ERROR(EIO);
4572
4573 XFS_STATS_INC(xs_blk_mapw);
4574
4575 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
4576 error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
4577 if (error)
4578 return error;
4579 }
4580
4581 xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev);
4582 end = bno + len;
4583 obno = bno;
4584
4585 while (bno < end && n < *nmap) {
4586 if (eof || got.br_startoff > bno) {
4587 error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got,
4588 &prev, &lastx, eof);
4589 if (error) {
4590 if (n == 0) {
4591 *nmap = 0;
4592 return error;
4593 }
4594 break;
4595 }
4596 }
4597
4598 /* set up the extent map to return. */
4599 xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
4600 xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
4601
4602 /* If we're done, stop now. */
4603 if (bno >= end || n >= *nmap)
4604 break;
4605
4606 /* Else go on to the next record. */
4607 prev = got;
4608 if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
4609 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
4610 else
4611 eof = 1;
4612 }
4613
4614 *nmap = n;
4615 return 0;
4616}
4617
4618
4619STATIC int
4620xfs_bmapi_allocate(
4621 struct xfs_bmalloca *bma,
4622 int flags)
4623{
4624 struct xfs_mount *mp = bma->ip->i_mount;
4625 int whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4626 XFS_ATTR_FORK : XFS_DATA_FORK;
4627 struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
4628 int tmp_logflags = 0;
4629 int error;
4630 int rt;
4631
4632 rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip);
4633
4634 /*
4635 * For the wasdelay case, we could also just allocate the stuff asked
4636 * for in this bmap call but that wouldn't be as good.
4637 */
4638 if (bma->wasdel) {
4639 bma->length = (xfs_extlen_t)bma->got.br_blockcount;
4640 bma->offset = bma->got.br_startoff;
4641 if (bma->idx != NULLEXTNUM && bma->idx) {
4642 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1),
4643 &bma->prev);
4644 }
4645 } else {
4646 bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
4647 if (!bma->eof)
4648 bma->length = XFS_FILBLKS_MIN(bma->length,
4649 bma->got.br_startoff - bma->offset);
4650 }
4651
4652 /*
4653 * Indicate if this is the first user data in the file, or just any
4654 * user data.
4655 */
4656 if (!(flags & XFS_BMAPI_METADATA)) {
4657 bma->userdata = (bma->offset == 0) ?
4658 XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
4659 }
4660
4661 bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
4662
4663 /*
4664 * Only want to do the alignment at the eof if it is userdata and
4665 * allocation length is larger than a stripe unit.
4666 */
4667 if (mp->m_dalign && bma->length >= mp->m_dalign &&
4668 !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
4669 error = xfs_bmap_isaeof(bma, whichfork);
4670 if (error)
4671 return error;
4672 }
4673
4674 error = xfs_bmap_alloc(bma);
4675 if (error)
4676 return error;
4677
4678 if (bma->flist->xbf_low)
4679 bma->minleft = 0;
4680 if (bma->cur)
4681 bma->cur->bc_private.b.firstblock = *bma->firstblock;
4682 if (bma->blkno == NULLFSBLOCK)
4683 return 0;
4684 if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
4685 bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
4686 bma->cur->bc_private.b.firstblock = *bma->firstblock;
4687 bma->cur->bc_private.b.flist = bma->flist;
4688 }
4689 /*
4690 * Bump the number of extents we've allocated
4691 * in this call.
4692 */
4693 bma->nallocs++;
4694
4695 if (bma->cur)
4696 bma->cur->bc_private.b.flags =
4697 bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
4698
4699 bma->got.br_startoff = bma->offset;
4700 bma->got.br_startblock = bma->blkno;
4701 bma->got.br_blockcount = bma->length;
4702 bma->got.br_state = XFS_EXT_NORM;
4703
4704 /*
4705 * A wasdelay extent has been initialized, so shouldn't be flagged
4706 * as unwritten.
4707 */
4708 if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) &&
4709 xfs_sb_version_hasextflgbit(&mp->m_sb))
4710 bma->got.br_state = XFS_EXT_UNWRITTEN;
4711
4712 if (bma->wasdel)
4713 error = xfs_bmap_add_extent_delay_real(bma);
4714 else
4715 error = xfs_bmap_add_extent_hole_real(bma, whichfork);
4716
4717 bma->logflags |= tmp_logflags;
4718 if (error)
4719 return error;
4720
4721 /*
4722 * Update our extent pointer, given that xfs_bmap_add_extent_delay_real
4723 * or xfs_bmap_add_extent_hole_real might have merged it into one of
4724 * the neighbouring ones.
4725 */
4726 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
4727
4728 ASSERT(bma->got.br_startoff <= bma->offset);
4729 ASSERT(bma->got.br_startoff + bma->got.br_blockcount >=
4730 bma->offset + bma->length);
4731 ASSERT(bma->got.br_state == XFS_EXT_NORM ||
4732 bma->got.br_state == XFS_EXT_UNWRITTEN);
4733 return 0;
4734}
4735
4736STATIC int
4737xfs_bmapi_convert_unwritten(
4738 struct xfs_bmalloca *bma,
4739 struct xfs_bmbt_irec *mval,
4740 xfs_filblks_t len,
4741 int flags)
4742{
4743 int whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4744 XFS_ATTR_FORK : XFS_DATA_FORK;
4745 struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
4746 int tmp_logflags = 0;
4747 int error;
4748
4749 /* check if we need to do unwritten->real conversion */
4750 if (mval->br_state == XFS_EXT_UNWRITTEN &&
4751 (flags & XFS_BMAPI_PREALLOC))
4752 return 0;
4753
4754 /* check if we need to do real->unwritten conversion */
4755 if (mval->br_state == XFS_EXT_NORM &&
4756 (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) !=
4757 (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
4758 return 0;
4759
4760 /*
4761 * Modify (by adding) the state flag, if writing.
4762 */
4763 ASSERT(mval->br_blockcount <= len);
4764 if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
4765 bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp,
4766 bma->ip, whichfork);
4767 bma->cur->bc_private.b.firstblock = *bma->firstblock;
4768 bma->cur->bc_private.b.flist = bma->flist;
4769 }
4770 mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
4771 ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
4772
4773 error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
4774 &bma->cur, mval, bma->firstblock, bma->flist,
4775 &tmp_logflags);
4776 bma->logflags |= tmp_logflags;
4777 if (error)
4778 return error;
4779
4780 /*
4781 * Update our extent pointer, given that
4782 * xfs_bmap_add_extent_unwritten_real might have merged it into one
4783 * of the neighbouring ones.
4784 */
4785 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
4786
4787 /*
4788 * We may have combined previously unwritten space with written space,
4789 * so generate another request.
4790 */
4791 if (mval->br_blockcount < len)
4792 return EAGAIN;
4793 return 0;
4794}
4795
4796/*
4797 * Map file blocks to filesystem blocks, and allocate blocks or convert the
4798 * extent state if necessary. Details behaviour is controlled by the flags
4799 * parameter. Only allocates blocks from a single allocation group, to avoid
4800 * locking problems.
4801 *
4302 * The returned value in "firstblock" from the first call in a transaction 4802 * The returned value in "firstblock" from the first call in a transaction
4303 * must be remembered and presented to subsequent calls in "firstblock". 4803 * must be remembered and presented to subsequent calls in "firstblock".
4304 * An upper bound for the number of blocks to be allocated is supplied to 4804 * An upper bound for the number of blocks to be allocated is supplied to
4305 * the first call in "total"; if no allocation group has that many free 4805 * the first call in "total"; if no allocation group has that many free
4306 * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). 4806 * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
4307 */ 4807 */
4308int /* error */ 4808int
4309xfs_bmapi( 4809xfs_bmapi_write(
4310 xfs_trans_t *tp, /* transaction pointer */ 4810 struct xfs_trans *tp, /* transaction pointer */
4311 xfs_inode_t *ip, /* incore inode */ 4811 struct xfs_inode *ip, /* incore inode */
4312 xfs_fileoff_t bno, /* starting file offs. mapped */ 4812 xfs_fileoff_t bno, /* starting file offs. mapped */
4313 xfs_filblks_t len, /* length to map in file */ 4813 xfs_filblks_t len, /* length to map in file */
4314 int flags, /* XFS_BMAPI_... */ 4814 int flags, /* XFS_BMAPI_... */
4315 xfs_fsblock_t *firstblock, /* first allocated block 4815 xfs_fsblock_t *firstblock, /* first allocated block
4316 controls a.g. for allocs */ 4816 controls a.g. for allocs */
4317 xfs_extlen_t total, /* total blocks needed */ 4817 xfs_extlen_t total, /* total blocks needed */
4318 xfs_bmbt_irec_t *mval, /* output: map values */ 4818 struct xfs_bmbt_irec *mval, /* output: map values */
4319 int *nmap, /* i/o: mval size/count */ 4819 int *nmap, /* i/o: mval size/count */
4320 xfs_bmap_free_t *flist) /* i/o: list extents to free */ 4820 struct xfs_bmap_free *flist) /* i/o: list extents to free */
4321{ 4821{
4322 xfs_fsblock_t abno; /* allocated block number */ 4822 struct xfs_mount *mp = ip->i_mount;
4323 xfs_extlen_t alen; /* allocated extent length */ 4823 struct xfs_ifork *ifp;
4324 xfs_fileoff_t aoff; /* allocated file offset */ 4824 struct xfs_bmalloca bma = { 0 }; /* args for xfs_bmap_alloc */
4325 xfs_bmalloca_t bma = { 0 }; /* args for xfs_bmap_alloc */ 4825 xfs_fileoff_t end; /* end of mapped file region */
4326 xfs_btree_cur_t *cur; /* bmap btree cursor */ 4826 int eof; /* after the end of extents */
4327 xfs_fileoff_t end; /* end of mapped file region */ 4827 int error; /* error return */
4328 int eof; /* we've hit the end of extents */ 4828 int n; /* current extent index */
4329 xfs_bmbt_rec_host_t *ep; /* extent record pointer */ 4829 xfs_fileoff_t obno; /* old block number (offset) */
4330 int error; /* error return */ 4830 int whichfork; /* data or attr fork */
4331 xfs_bmbt_irec_t got; /* current file extent record */ 4831 char inhole; /* current location is hole in file */
4332 xfs_ifork_t *ifp; /* inode fork pointer */ 4832 char wasdelay; /* old extent was delayed */
4333 xfs_extlen_t indlen; /* indirect blocks length */ 4833
4334 xfs_extnum_t lastx; /* last useful extent number */
4335 int logflags; /* flags for transaction logging */
4336 xfs_extlen_t minleft; /* min blocks left after allocation */
4337 xfs_extlen_t minlen; /* min allocation size */
4338 xfs_mount_t *mp; /* xfs mount structure */
4339 int n; /* current extent index */
4340 int nallocs; /* number of extents alloc'd */
4341 xfs_extnum_t nextents; /* number of extents in file */
4342 xfs_fileoff_t obno; /* old block number (offset) */
4343 xfs_bmbt_irec_t prev; /* previous file extent record */
4344 int tmp_logflags; /* temp flags holder */
4345 int whichfork; /* data or attr fork */
4346 char inhole; /* current location is hole in file */
4347 char wasdelay; /* old extent was delayed */
4348 char wr; /* this is a write request */
4349 char rt; /* this is a realtime file */
4350#ifdef DEBUG 4834#ifdef DEBUG
4351 xfs_fileoff_t orig_bno; /* original block number value */ 4835 xfs_fileoff_t orig_bno; /* original block number value */
4352 int orig_flags; /* original flags arg value */ 4836 int orig_flags; /* original flags arg value */
4353 xfs_filblks_t orig_len; /* original value of len arg */ 4837 xfs_filblks_t orig_len; /* original value of len arg */
4354 xfs_bmbt_irec_t *orig_mval; /* original value of mval */ 4838 struct xfs_bmbt_irec *orig_mval; /* original value of mval */
4355 int orig_nmap; /* original value of *nmap */ 4839 int orig_nmap; /* original value of *nmap */
4356 4840
4357 orig_bno = bno; 4841 orig_bno = bno;
4358 orig_len = len; 4842 orig_len = len;
@@ -4360,488 +4844,133 @@ xfs_bmapi(
4360 orig_mval = mval; 4844 orig_mval = mval;
4361 orig_nmap = *nmap; 4845 orig_nmap = *nmap;
4362#endif 4846#endif
4847
4363 ASSERT(*nmap >= 1); 4848 ASSERT(*nmap >= 1);
4364 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP || !(flags & XFS_BMAPI_WRITE)); 4849 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
4850 ASSERT(!(flags & XFS_BMAPI_IGSTATE));
4851 ASSERT(tp != NULL);
4852
4365 whichfork = (flags & XFS_BMAPI_ATTRFORK) ? 4853 whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4366 XFS_ATTR_FORK : XFS_DATA_FORK; 4854 XFS_ATTR_FORK : XFS_DATA_FORK;
4367 mp = ip->i_mount; 4855
4368 if (unlikely(XFS_TEST_ERROR( 4856 if (unlikely(XFS_TEST_ERROR(
4369 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && 4857 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
4370 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && 4858 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
4371 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL), 4859 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL),
4372 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { 4860 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
4373 XFS_ERROR_REPORT("xfs_bmapi", XFS_ERRLEVEL_LOW, mp); 4861 XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
4374 return XFS_ERROR(EFSCORRUPTED); 4862 return XFS_ERROR(EFSCORRUPTED);
4375 } 4863 }
4864
4376 if (XFS_FORCED_SHUTDOWN(mp)) 4865 if (XFS_FORCED_SHUTDOWN(mp))
4377 return XFS_ERROR(EIO); 4866 return XFS_ERROR(EIO);
4378 rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); 4867
4379 ifp = XFS_IFORK_PTR(ip, whichfork); 4868 ifp = XFS_IFORK_PTR(ip, whichfork);
4380 ASSERT(ifp->if_ext_max == 4869 ASSERT(ifp->if_ext_max ==
4381 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); 4870 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
4382 if ((wr = (flags & XFS_BMAPI_WRITE)) != 0) 4871
4383 XFS_STATS_INC(xs_blk_mapw); 4872 XFS_STATS_INC(xs_blk_mapw);
4384 else 4873
4385 XFS_STATS_INC(xs_blk_mapr);
4386 /*
4387 * IGSTATE flag is used to combine extents which
4388 * differ only due to the state of the extents.
4389 * This technique is used from xfs_getbmap()
4390 * when the caller does not wish to see the
4391 * separation (which is the default).
4392 *
4393 * This technique is also used when writing a
4394 * buffer which has been partially written,
4395 * (usually by being flushed during a chunkread),
4396 * to ensure one write takes place. This also
4397 * prevents a change in the xfs inode extents at
4398 * this time, intentionally. This change occurs
4399 * on completion of the write operation, in
4400 * xfs_strat_comp(), where the xfs_bmapi() call
4401 * is transactioned, and the extents combined.
4402 */
4403 if ((flags & XFS_BMAPI_IGSTATE) && wr) /* if writing unwritten space */
4404 wr = 0; /* no allocations are allowed */
4405 ASSERT(wr || !(flags & XFS_BMAPI_DELAY));
4406 logflags = 0;
4407 nallocs = 0;
4408 cur = NULL;
4409 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { 4874 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
4410 ASSERT(wr && tp); 4875 error = xfs_bmap_local_to_extents(tp, ip, firstblock, total,
4411 if ((error = xfs_bmap_local_to_extents(tp, ip, 4876 &bma.logflags, whichfork);
4412 firstblock, total, &logflags, whichfork))) 4877 if (error)
4413 goto error0; 4878 goto error0;
4414 } 4879 }
4415 if (wr && *firstblock == NULLFSBLOCK) { 4880
4881 if (*firstblock == NULLFSBLOCK) {
4416 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) 4882 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
4417 minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; 4883 bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
4418 else 4884 else
4419 minleft = 1; 4885 bma.minleft = 1;
4420 } else 4886 } else {
4421 minleft = 0; 4887 bma.minleft = 0;
4422 if (!(ifp->if_flags & XFS_IFEXTENTS) && 4888 }
4423 (error = xfs_iread_extents(tp, ip, whichfork))) 4889
4424 goto error0; 4890 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
4425 ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, 4891 error = xfs_iread_extents(tp, ip, whichfork);
4426 &prev); 4892 if (error)
4427 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4893 goto error0;
4894 }
4895
4896 xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got,
4897 &bma.prev);
4428 n = 0; 4898 n = 0;
4429 end = bno + len; 4899 end = bno + len;
4430 obno = bno; 4900 obno = bno;
4431 bma.ip = NULL; 4901
4902 bma.tp = tp;
4903 bma.ip = ip;
4904 bma.total = total;
4905 bma.userdata = 0;
4906 bma.flist = flist;
4907 bma.firstblock = firstblock;
4432 4908
4433 while (bno < end && n < *nmap) { 4909 while (bno < end && n < *nmap) {
4434 /* 4910 inhole = eof || bma.got.br_startoff > bno;
4435 * Reading past eof, act as though there's a hole 4911 wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
4436 * up to end. 4912
4437 */
4438 if (eof && !wr)
4439 got.br_startoff = end;
4440 inhole = eof || got.br_startoff > bno;
4441 wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) &&
4442 isnullstartblock(got.br_startblock);
4443 /* 4913 /*
4444 * First, deal with the hole before the allocated space 4914 * First, deal with the hole before the allocated space
4445 * that we found, if any. 4915 * that we found, if any.
4446 */ 4916 */
4447 if (wr && (inhole || wasdelay)) { 4917 if (inhole || wasdelay) {
4448 /* 4918 bma.eof = eof;
4449 * For the wasdelay case, we could also just 4919 bma.conv = !!(flags & XFS_BMAPI_CONVERT);
4450 * allocate the stuff asked for in this bmap call 4920 bma.wasdel = wasdelay;
4451 * but that wouldn't be as good. 4921 bma.length = len;
4452 */ 4922 bma.offset = bno;
4453 if (wasdelay) { 4923
4454 alen = (xfs_extlen_t)got.br_blockcount; 4924 error = xfs_bmapi_allocate(&bma, flags);
4455 aoff = got.br_startoff;
4456 if (lastx != NULLEXTNUM && lastx) {
4457 ep = xfs_iext_get_ext(ifp, lastx - 1);
4458 xfs_bmbt_get_all(ep, &prev);
4459 }
4460 } else {
4461 alen = (xfs_extlen_t)
4462 XFS_FILBLKS_MIN(len, MAXEXTLEN);
4463 if (!eof)
4464 alen = (xfs_extlen_t)
4465 XFS_FILBLKS_MIN(alen,
4466 got.br_startoff - bno);
4467 aoff = bno;
4468 }
4469 minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1;
4470 if (flags & XFS_BMAPI_DELAY) {
4471 xfs_extlen_t extsz;
4472
4473 /* Figure out the extent size, adjust alen */
4474 extsz = xfs_get_extsz_hint(ip);
4475 if (extsz) {
4476 /*
4477 * make sure we don't exceed a single
4478 * extent length when we align the
4479 * extent by reducing length we are
4480 * going to allocate by the maximum
4481 * amount extent size aligment may
4482 * require.
4483 */
4484 alen = XFS_FILBLKS_MIN(len,
4485 MAXEXTLEN - (2 * extsz - 1));
4486 error = xfs_bmap_extsize_align(mp,
4487 &got, &prev, extsz,
4488 rt, eof,
4489 flags&XFS_BMAPI_DELAY,
4490 flags&XFS_BMAPI_CONVERT,
4491 &aoff, &alen);
4492 ASSERT(!error);
4493 }
4494
4495 if (rt)
4496 extsz = alen / mp->m_sb.sb_rextsize;
4497
4498 /*
4499 * Make a transaction-less quota reservation for
4500 * delayed allocation blocks. This number gets
4501 * adjusted later. We return if we haven't
4502 * allocated blocks already inside this loop.
4503 */
4504 error = xfs_trans_reserve_quota_nblks(
4505 NULL, ip, (long)alen, 0,
4506 rt ? XFS_QMOPT_RES_RTBLKS :
4507 XFS_QMOPT_RES_REGBLKS);
4508 if (error) {
4509 if (n == 0) {
4510 *nmap = 0;
4511 ASSERT(cur == NULL);
4512 return error;
4513 }
4514 break;
4515 }
4516
4517 /*
4518 * Split changing sb for alen and indlen since
4519 * they could be coming from different places.
4520 */
4521 indlen = (xfs_extlen_t)
4522 xfs_bmap_worst_indlen(ip, alen);
4523 ASSERT(indlen > 0);
4524
4525 if (rt) {
4526 error = xfs_mod_incore_sb(mp,
4527 XFS_SBS_FREXTENTS,
4528 -((int64_t)extsz), 0);
4529 } else {
4530 error = xfs_icsb_modify_counters(mp,
4531 XFS_SBS_FDBLOCKS,
4532 -((int64_t)alen), 0);
4533 }
4534 if (!error) {
4535 error = xfs_icsb_modify_counters(mp,
4536 XFS_SBS_FDBLOCKS,
4537 -((int64_t)indlen), 0);
4538 if (error && rt)
4539 xfs_mod_incore_sb(mp,
4540 XFS_SBS_FREXTENTS,
4541 (int64_t)extsz, 0);
4542 else if (error)
4543 xfs_icsb_modify_counters(mp,
4544 XFS_SBS_FDBLOCKS,
4545 (int64_t)alen, 0);
4546 }
4547
4548 if (error) {
4549 if (XFS_IS_QUOTA_ON(mp))
4550 /* unreserve the blocks now */
4551 (void)
4552 xfs_trans_unreserve_quota_nblks(
4553 NULL, ip,
4554 (long)alen, 0, rt ?
4555 XFS_QMOPT_RES_RTBLKS :
4556 XFS_QMOPT_RES_REGBLKS);
4557 break;
4558 }
4559
4560 ip->i_delayed_blks += alen;
4561 abno = nullstartblock(indlen);
4562 } else {
4563 /*
4564 * If first time, allocate and fill in
4565 * once-only bma fields.
4566 */
4567 if (bma.ip == NULL) {
4568 bma.tp = tp;
4569 bma.ip = ip;
4570 bma.prevp = &prev;
4571 bma.gotp = &got;
4572 bma.total = total;
4573 bma.userdata = 0;
4574 }
4575 /* Indicate if this is the first user data
4576 * in the file, or just any user data.
4577 */
4578 if (!(flags & XFS_BMAPI_METADATA)) {
4579 bma.userdata = (aoff == 0) ?
4580 XFS_ALLOC_INITIAL_USER_DATA :
4581 XFS_ALLOC_USERDATA;
4582 }
4583 /*
4584 * Fill in changeable bma fields.
4585 */
4586 bma.eof = eof;
4587 bma.firstblock = *firstblock;
4588 bma.alen = alen;
4589 bma.off = aoff;
4590 bma.conv = !!(flags & XFS_BMAPI_CONVERT);
4591 bma.wasdel = wasdelay;
4592 bma.minlen = minlen;
4593 bma.low = flist->xbf_low;
4594 bma.minleft = minleft;
4595 /*
4596 * Only want to do the alignment at the
4597 * eof if it is userdata and allocation length
4598 * is larger than a stripe unit.
4599 */
4600 if (mp->m_dalign && alen >= mp->m_dalign &&
4601 (!(flags & XFS_BMAPI_METADATA)) &&
4602 (whichfork == XFS_DATA_FORK)) {
4603 if ((error = xfs_bmap_isaeof(ip, aoff,
4604 whichfork, &bma.aeof)))
4605 goto error0;
4606 } else
4607 bma.aeof = 0;
4608 /*
4609 * Call allocator.
4610 */
4611 if ((error = xfs_bmap_alloc(&bma)))
4612 goto error0;
4613 /*
4614 * Copy out result fields.
4615 */
4616 abno = bma.rval;
4617 if ((flist->xbf_low = bma.low))
4618 minleft = 0;
4619 alen = bma.alen;
4620 aoff = bma.off;
4621 ASSERT(*firstblock == NULLFSBLOCK ||
4622 XFS_FSB_TO_AGNO(mp, *firstblock) ==
4623 XFS_FSB_TO_AGNO(mp, bma.firstblock) ||
4624 (flist->xbf_low &&
4625 XFS_FSB_TO_AGNO(mp, *firstblock) <
4626 XFS_FSB_TO_AGNO(mp, bma.firstblock)));
4627 *firstblock = bma.firstblock;
4628 if (cur)
4629 cur->bc_private.b.firstblock =
4630 *firstblock;
4631 if (abno == NULLFSBLOCK)
4632 break;
4633 if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
4634 cur = xfs_bmbt_init_cursor(mp, tp,
4635 ip, whichfork);
4636 cur->bc_private.b.firstblock =
4637 *firstblock;
4638 cur->bc_private.b.flist = flist;
4639 }
4640 /*
4641 * Bump the number of extents we've allocated
4642 * in this call.
4643 */
4644 nallocs++;
4645 }
4646 if (cur)
4647 cur->bc_private.b.flags =
4648 wasdelay ? XFS_BTCUR_BPRV_WASDEL : 0;
4649 got.br_startoff = aoff;
4650 got.br_startblock = abno;
4651 got.br_blockcount = alen;
4652 got.br_state = XFS_EXT_NORM; /* assume normal */
4653 /*
4654 * Determine state of extent, and the filesystem.
4655 * A wasdelay extent has been initialized, so
4656 * shouldn't be flagged as unwritten.
4657 */
4658 if (wr && xfs_sb_version_hasextflgbit(&mp->m_sb)) {
4659 if (!wasdelay && (flags & XFS_BMAPI_PREALLOC))
4660 got.br_state = XFS_EXT_UNWRITTEN;
4661 }
4662 error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &got,
4663 firstblock, flist, &tmp_logflags,
4664 whichfork);
4665 logflags |= tmp_logflags;
4666 if (error) 4925 if (error)
4667 goto error0; 4926 goto error0;
4668 ep = xfs_iext_get_ext(ifp, lastx); 4927 if (bma.blkno == NULLFSBLOCK)
4669 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4928 break;
4670 xfs_bmbt_get_all(ep, &got);
4671 ASSERT(got.br_startoff <= aoff);
4672 ASSERT(got.br_startoff + got.br_blockcount >=
4673 aoff + alen);
4674#ifdef DEBUG
4675 if (flags & XFS_BMAPI_DELAY) {
4676 ASSERT(isnullstartblock(got.br_startblock));
4677 ASSERT(startblockval(got.br_startblock) > 0);
4678 }
4679 ASSERT(got.br_state == XFS_EXT_NORM ||
4680 got.br_state == XFS_EXT_UNWRITTEN);
4681#endif
4682 /*
4683 * Fall down into the found allocated space case.
4684 */
4685 } else if (inhole) {
4686 /*
4687 * Reading in a hole.
4688 */
4689 mval->br_startoff = bno;
4690 mval->br_startblock = HOLESTARTBLOCK;
4691 mval->br_blockcount =
4692 XFS_FILBLKS_MIN(len, got.br_startoff - bno);
4693 mval->br_state = XFS_EXT_NORM;
4694 bno += mval->br_blockcount;
4695 len -= mval->br_blockcount;
4696 mval++;
4697 n++;
4698 continue;
4699 }
4700 /*
4701 * Then deal with the allocated space we found.
4702 */
4703 ASSERT(ep != NULL);
4704 if (!(flags & XFS_BMAPI_ENTIRE) &&
4705 (got.br_startoff + got.br_blockcount > obno)) {
4706 if (obno > bno)
4707 bno = obno;
4708 ASSERT((bno >= obno) || (n == 0));
4709 ASSERT(bno < end);
4710 mval->br_startoff = bno;
4711 if (isnullstartblock(got.br_startblock)) {
4712 ASSERT(!wr || (flags & XFS_BMAPI_DELAY));
4713 mval->br_startblock = DELAYSTARTBLOCK;
4714 } else
4715 mval->br_startblock =
4716 got.br_startblock +
4717 (bno - got.br_startoff);
4718 /*
4719 * Return the minimum of what we got and what we
4720 * asked for for the length. We can use the len
4721 * variable here because it is modified below
4722 * and we could have been there before coming
4723 * here if the first part of the allocation
4724 * didn't overlap what was asked for.
4725 */
4726 mval->br_blockcount =
4727 XFS_FILBLKS_MIN(end - bno, got.br_blockcount -
4728 (bno - got.br_startoff));
4729 mval->br_state = got.br_state;
4730 ASSERT(mval->br_blockcount <= len);
4731 } else {
4732 *mval = got;
4733 if (isnullstartblock(mval->br_startblock)) {
4734 ASSERT(!wr || (flags & XFS_BMAPI_DELAY));
4735 mval->br_startblock = DELAYSTARTBLOCK;
4736 }
4737 } 4929 }
4738 4930
4739 /* 4931 /* Deal with the allocated space we found. */
4740 * Check if writing previously allocated but 4932 xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno,
4741 * unwritten extents. 4933 end, n, flags);
4742 */ 4934
4743 if (wr && 4935 /* Execute unwritten extent conversion if necessary */
4744 ((mval->br_state == XFS_EXT_UNWRITTEN && 4936 error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags);
4745 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) || 4937 if (error == EAGAIN)
4746 (mval->br_state == XFS_EXT_NORM && 4938 continue;
4747 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) == 4939 if (error)
4748 (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) { 4940 goto error0;
4749 /* 4941
4750 * Modify (by adding) the state flag, if writing. 4942 /* update the extent map to return */
4751 */ 4943 xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
4752 ASSERT(mval->br_blockcount <= len);
4753 if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
4754 cur = xfs_bmbt_init_cursor(mp,
4755 tp, ip, whichfork);
4756 cur->bc_private.b.firstblock =
4757 *firstblock;
4758 cur->bc_private.b.flist = flist;
4759 }
4760 mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
4761 ? XFS_EXT_NORM
4762 : XFS_EXT_UNWRITTEN;
4763 error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, mval,
4764 firstblock, flist, &tmp_logflags,
4765 whichfork);
4766 logflags |= tmp_logflags;
4767 if (error)
4768 goto error0;
4769 ep = xfs_iext_get_ext(ifp, lastx);
4770 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4771 xfs_bmbt_get_all(ep, &got);
4772 /*
4773 * We may have combined previously unwritten
4774 * space with written space, so generate
4775 * another request.
4776 */
4777 if (mval->br_blockcount < len)
4778 continue;
4779 }
4780 4944
4781 ASSERT((flags & XFS_BMAPI_ENTIRE) ||
4782 ((mval->br_startoff + mval->br_blockcount) <= end));
4783 ASSERT((flags & XFS_BMAPI_ENTIRE) ||
4784 (mval->br_blockcount <= len) ||
4785 (mval->br_startoff < obno));
4786 bno = mval->br_startoff + mval->br_blockcount;
4787 len = end - bno;
4788 if (n > 0 && mval->br_startoff == mval[-1].br_startoff) {
4789 ASSERT(mval->br_startblock == mval[-1].br_startblock);
4790 ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
4791 ASSERT(mval->br_state == mval[-1].br_state);
4792 mval[-1].br_blockcount = mval->br_blockcount;
4793 mval[-1].br_state = mval->br_state;
4794 } else if (n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
4795 mval[-1].br_startblock != DELAYSTARTBLOCK &&
4796 mval[-1].br_startblock != HOLESTARTBLOCK &&
4797 mval->br_startblock ==
4798 mval[-1].br_startblock + mval[-1].br_blockcount &&
4799 ((flags & XFS_BMAPI_IGSTATE) ||
4800 mval[-1].br_state == mval->br_state)) {
4801 ASSERT(mval->br_startoff ==
4802 mval[-1].br_startoff + mval[-1].br_blockcount);
4803 mval[-1].br_blockcount += mval->br_blockcount;
4804 } else if (n > 0 &&
4805 mval->br_startblock == DELAYSTARTBLOCK &&
4806 mval[-1].br_startblock == DELAYSTARTBLOCK &&
4807 mval->br_startoff ==
4808 mval[-1].br_startoff + mval[-1].br_blockcount) {
4809 mval[-1].br_blockcount += mval->br_blockcount;
4810 mval[-1].br_state = mval->br_state;
4811 } else if (!((n == 0) &&
4812 ((mval->br_startoff + mval->br_blockcount) <=
4813 obno))) {
4814 mval++;
4815 n++;
4816 }
4817 /* 4945 /*
4818 * If we're done, stop now. Stop when we've allocated 4946 * If we're done, stop now. Stop when we've allocated
4819 * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise 4947 * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise
4820 * the transaction may get too big. 4948 * the transaction may get too big.
4821 */ 4949 */
4822 if (bno >= end || n >= *nmap || nallocs >= *nmap) 4950 if (bno >= end || n >= *nmap || bma.nallocs >= *nmap)
4823 break; 4951 break;
4824 /* 4952
4825 * Else go on to the next record. 4953 /* Else go on to the next record. */
4826 */ 4954 bma.prev = bma.got;
4827 prev = got; 4955 if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) {
4828 if (++lastx < nextents) { 4956 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx),
4829 ep = xfs_iext_get_ext(ifp, lastx); 4957 &bma.got);
4830 xfs_bmbt_get_all(ep, &got); 4958 } else
4831 } else {
4832 eof = 1; 4959 eof = 1;
4833 }
4834 } 4960 }
4835 *nmap = n; 4961 *nmap = n;
4962
4836 /* 4963 /*
4837 * Transform from btree to extents, give it cur. 4964 * Transform from btree to extents, give it cur.
4838 */ 4965 */
4839 if (tp && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && 4966 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
4840 XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { 4967 XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
4841 ASSERT(wr && cur); 4968 int tmp_logflags = 0;
4842 error = xfs_bmap_btree_to_extents(tp, ip, cur, 4969
4970 ASSERT(bma.cur);
4971 error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
4843 &tmp_logflags, whichfork); 4972 &tmp_logflags, whichfork);
4844 logflags |= tmp_logflags; 4973 bma.logflags |= tmp_logflags;
4845 if (error) 4974 if (error)
4846 goto error0; 4975 goto error0;
4847 } 4976 }
@@ -4855,34 +4984,33 @@ error0:
4855 * Log everything. Do this after conversion, there's no point in 4984 * Log everything. Do this after conversion, there's no point in
4856 * logging the extent records if we've converted to btree format. 4985 * logging the extent records if we've converted to btree format.
4857 */ 4986 */
4858 if ((logflags & xfs_ilog_fext(whichfork)) && 4987 if ((bma.logflags & xfs_ilog_fext(whichfork)) &&
4859 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) 4988 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
4860 logflags &= ~xfs_ilog_fext(whichfork); 4989 bma.logflags &= ~xfs_ilog_fext(whichfork);
4861 else if ((logflags & xfs_ilog_fbroot(whichfork)) && 4990 else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) &&
4862 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) 4991 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
4863 logflags &= ~xfs_ilog_fbroot(whichfork); 4992 bma.logflags &= ~xfs_ilog_fbroot(whichfork);
4864 /* 4993 /*
4865 * Log whatever the flags say, even if error. Otherwise we might miss 4994 * Log whatever the flags say, even if error. Otherwise we might miss
4866 * detecting a case where the data is changed, there's an error, 4995 * detecting a case where the data is changed, there's an error,
4867 * and it's not logged so we don't shutdown when we should. 4996 * and it's not logged so we don't shutdown when we should.
4868 */ 4997 */
4869 if (logflags) { 4998 if (bma.logflags)
4870 ASSERT(tp && wr); 4999 xfs_trans_log_inode(tp, ip, bma.logflags);
4871 xfs_trans_log_inode(tp, ip, logflags); 5000
4872 } 5001 if (bma.cur) {
4873 if (cur) {
4874 if (!error) { 5002 if (!error) {
4875 ASSERT(*firstblock == NULLFSBLOCK || 5003 ASSERT(*firstblock == NULLFSBLOCK ||
4876 XFS_FSB_TO_AGNO(mp, *firstblock) == 5004 XFS_FSB_TO_AGNO(mp, *firstblock) ==
4877 XFS_FSB_TO_AGNO(mp, 5005 XFS_FSB_TO_AGNO(mp,
4878 cur->bc_private.b.firstblock) || 5006 bma.cur->bc_private.b.firstblock) ||
4879 (flist->xbf_low && 5007 (flist->xbf_low &&
4880 XFS_FSB_TO_AGNO(mp, *firstblock) < 5008 XFS_FSB_TO_AGNO(mp, *firstblock) <
4881 XFS_FSB_TO_AGNO(mp, 5009 XFS_FSB_TO_AGNO(mp,
4882 cur->bc_private.b.firstblock))); 5010 bma.cur->bc_private.b.firstblock)));
4883 *firstblock = cur->bc_private.b.firstblock; 5011 *firstblock = bma.cur->bc_private.b.firstblock;
4884 } 5012 }
4885 xfs_btree_del_cursor(cur, 5013 xfs_btree_del_cursor(bma.cur,
4886 error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); 5014 error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
4887 } 5015 }
4888 if (!error) 5016 if (!error)
@@ -4892,58 +5020,6 @@ error0:
4892} 5020}
4893 5021
4894/* 5022/*
4895 * Map file blocks to filesystem blocks, simple version.
4896 * One block (extent) only, read-only.
4897 * For flags, only the XFS_BMAPI_ATTRFORK flag is examined.
4898 * For the other flag values, the effect is as if XFS_BMAPI_METADATA
4899 * was set and all the others were clear.
4900 */
4901int /* error */
4902xfs_bmapi_single(
4903 xfs_trans_t *tp, /* transaction pointer */
4904 xfs_inode_t *ip, /* incore inode */
4905 int whichfork, /* data or attr fork */
4906 xfs_fsblock_t *fsb, /* output: mapped block */
4907 xfs_fileoff_t bno) /* starting file offs. mapped */
4908{
4909 int eof; /* we've hit the end of extents */
4910 int error; /* error return */
4911 xfs_bmbt_irec_t got; /* current file extent record */
4912 xfs_ifork_t *ifp; /* inode fork pointer */
4913 xfs_extnum_t lastx; /* last useful extent number */
4914 xfs_bmbt_irec_t prev; /* previous file extent record */
4915
4916 ifp = XFS_IFORK_PTR(ip, whichfork);
4917 if (unlikely(
4918 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
4919 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)) {
4920 XFS_ERROR_REPORT("xfs_bmapi_single", XFS_ERRLEVEL_LOW,
4921 ip->i_mount);
4922 return XFS_ERROR(EFSCORRUPTED);
4923 }
4924 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
4925 return XFS_ERROR(EIO);
4926 XFS_STATS_INC(xs_blk_mapr);
4927 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
4928 (error = xfs_iread_extents(tp, ip, whichfork)))
4929 return error;
4930 (void)xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
4931 &prev);
4932 /*
4933 * Reading past eof, act as though there's a hole
4934 * up to end.
4935 */
4936 if (eof || got.br_startoff > bno) {
4937 *fsb = NULLFSBLOCK;
4938 return 0;
4939 }
4940 ASSERT(!isnullstartblock(got.br_startblock));
4941 ASSERT(bno < got.br_startoff + got.br_blockcount);
4942 *fsb = got.br_startblock + (bno - got.br_startoff);
4943 return 0;
4944}
4945
4946/*
4947 * Unmap (remove) blocks from a file. 5023 * Unmap (remove) blocks from a file.
4948 * If nexts is nonzero then the number of extents to remove is limited to 5024 * If nexts is nonzero then the number of extents to remove is limited to
4949 * that value. If not all extents in the block range can be removed then 5025 * that value. If not all extents in the block range can be removed then
@@ -5114,9 +5190,9 @@ xfs_bunmapi(
5114 del.br_blockcount = mod; 5190 del.br_blockcount = mod;
5115 } 5191 }
5116 del.br_state = XFS_EXT_UNWRITTEN; 5192 del.br_state = XFS_EXT_UNWRITTEN;
5117 error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &del, 5193 error = xfs_bmap_add_extent_unwritten_real(tp, ip,
5118 firstblock, flist, &logflags, 5194 &lastx, &cur, &del, firstblock, flist,
5119 XFS_DATA_FORK); 5195 &logflags);
5120 if (error) 5196 if (error)
5121 goto error0; 5197 goto error0;
5122 goto nodelete; 5198 goto nodelete;
@@ -5172,18 +5248,18 @@ xfs_bunmapi(
5172 } 5248 }
5173 prev.br_state = XFS_EXT_UNWRITTEN; 5249 prev.br_state = XFS_EXT_UNWRITTEN;
5174 lastx--; 5250 lastx--;
5175 error = xfs_bmap_add_extent(tp, ip, &lastx, 5251 error = xfs_bmap_add_extent_unwritten_real(tp,
5176 &cur, &prev, firstblock, flist, 5252 ip, &lastx, &cur, &prev,
5177 &logflags, XFS_DATA_FORK); 5253 firstblock, flist, &logflags);
5178 if (error) 5254 if (error)
5179 goto error0; 5255 goto error0;
5180 goto nodelete; 5256 goto nodelete;
5181 } else { 5257 } else {
5182 ASSERT(del.br_state == XFS_EXT_NORM); 5258 ASSERT(del.br_state == XFS_EXT_NORM);
5183 del.br_state = XFS_EXT_UNWRITTEN; 5259 del.br_state = XFS_EXT_UNWRITTEN;
5184 error = xfs_bmap_add_extent(tp, ip, &lastx, 5260 error = xfs_bmap_add_extent_unwritten_real(tp,
5185 &cur, &del, firstblock, flist, 5261 ip, &lastx, &cur, &del,
5186 &logflags, XFS_DATA_FORK); 5262 firstblock, flist, &logflags);
5187 if (error) 5263 if (error)
5188 goto error0; 5264 goto error0;
5189 goto nodelete; 5265 goto nodelete;
@@ -5505,10 +5581,9 @@ xfs_getbmap(
5505 5581
5506 do { 5582 do {
5507 nmap = (nexleft > subnex) ? subnex : nexleft; 5583 nmap = (nexleft > subnex) ? subnex : nexleft;
5508 error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), 5584 error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
5509 XFS_BB_TO_FSB(mp, bmv->bmv_length), 5585 XFS_BB_TO_FSB(mp, bmv->bmv_length),
5510 bmapi_flags, NULL, 0, map, &nmap, 5586 map, &nmap, bmapi_flags);
5511 NULL);
5512 if (error) 5587 if (error)
5513 goto out_free_map; 5588 goto out_free_map;
5514 ASSERT(nmap <= subnex); 5589 ASSERT(nmap <= subnex);
@@ -5582,89 +5657,6 @@ xfs_getbmap(
5582 return error; 5657 return error;
5583} 5658}
5584 5659
5585/*
5586 * Check the last inode extent to determine whether this allocation will result
5587 * in blocks being allocated at the end of the file. When we allocate new data
5588 * blocks at the end of the file which do not start at the previous data block,
5589 * we will try to align the new blocks at stripe unit boundaries.
5590 */
5591STATIC int /* error */
5592xfs_bmap_isaeof(
5593 xfs_inode_t *ip, /* incore inode pointer */
5594 xfs_fileoff_t off, /* file offset in fsblocks */
5595 int whichfork, /* data or attribute fork */
5596 char *aeof) /* return value */
5597{
5598 int error; /* error return value */
5599 xfs_ifork_t *ifp; /* inode fork pointer */
5600 xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */
5601 xfs_extnum_t nextents; /* number of file extents */
5602 xfs_bmbt_irec_t s; /* expanded extent record */
5603
5604 ASSERT(whichfork == XFS_DATA_FORK);
5605 ifp = XFS_IFORK_PTR(ip, whichfork);
5606 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
5607 (error = xfs_iread_extents(NULL, ip, whichfork)))
5608 return error;
5609 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
5610 if (nextents == 0) {
5611 *aeof = 1;
5612 return 0;
5613 }
5614 /*
5615 * Go to the last extent
5616 */
5617 lastrec = xfs_iext_get_ext(ifp, nextents - 1);
5618 xfs_bmbt_get_all(lastrec, &s);
5619 /*
5620 * Check we are allocating in the last extent (for delayed allocations)
5621 * or past the last extent for non-delayed allocations.
5622 */
5623 *aeof = (off >= s.br_startoff &&
5624 off < s.br_startoff + s.br_blockcount &&
5625 isnullstartblock(s.br_startblock)) ||
5626 off >= s.br_startoff + s.br_blockcount;
5627 return 0;
5628}
5629
5630/*
5631 * Check if the endoff is outside the last extent. If so the caller will grow
5632 * the allocation to a stripe unit boundary.
5633 */
5634int /* error */
5635xfs_bmap_eof(
5636 xfs_inode_t *ip, /* incore inode pointer */
5637 xfs_fileoff_t endoff, /* file offset in fsblocks */
5638 int whichfork, /* data or attribute fork */
5639 int *eof) /* result value */
5640{
5641 xfs_fsblock_t blockcount; /* extent block count */
5642 int error; /* error return value */
5643 xfs_ifork_t *ifp; /* inode fork pointer */
5644 xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */
5645 xfs_extnum_t nextents; /* number of file extents */
5646 xfs_fileoff_t startoff; /* extent starting file offset */
5647
5648 ASSERT(whichfork == XFS_DATA_FORK);
5649 ifp = XFS_IFORK_PTR(ip, whichfork);
5650 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
5651 (error = xfs_iread_extents(NULL, ip, whichfork)))
5652 return error;
5653 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
5654 if (nextents == 0) {
5655 *eof = 1;
5656 return 0;
5657 }
5658 /*
5659 * Go to the last extent
5660 */
5661 lastrec = xfs_iext_get_ext(ifp, nextents - 1);
5662 startoff = xfs_bmbt_get_startoff(lastrec);
5663 blockcount = xfs_bmbt_get_blockcount(lastrec);
5664 *eof = endoff >= startoff + blockcount;
5665 return 0;
5666}
5667
5668#ifdef DEBUG 5660#ifdef DEBUG
5669STATIC struct xfs_buf * 5661STATIC struct xfs_buf *
5670xfs_bmap_get_bp( 5662xfs_bmap_get_bp(
@@ -6099,9 +6091,8 @@ xfs_bmap_punch_delalloc_range(
6099 * trying to remove a real extent (which requires a 6091 * trying to remove a real extent (which requires a
6100 * transaction) or a hole, which is probably a bad idea... 6092 * transaction) or a hole, which is probably a bad idea...
6101 */ 6093 */
6102 error = xfs_bmapi(NULL, ip, start_fsb, 1, 6094 error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
6103 XFS_BMAPI_ENTIRE, NULL, 0, &imap, 6095 XFS_BMAPI_ENTIRE);
6104 &nimaps, NULL);
6105 6096
6106 if (error) { 6097 if (error) {
6107 /* something screwed, just bail */ 6098 /* something screwed, just bail */
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index c62234bde05..89ee672d378 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -62,27 +62,23 @@ typedef struct xfs_bmap_free
62#define XFS_BMAP_MAX_NMAP 4 62#define XFS_BMAP_MAX_NMAP 4
63 63
64/* 64/*
65 * Flags for xfs_bmapi 65 * Flags for xfs_bmapi_*
66 */ 66 */
67#define XFS_BMAPI_WRITE 0x001 /* write operation: allocate space */ 67#define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */
68#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */ 68#define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */
69#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ 69#define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */
70#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ 70#define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */
71#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ 71#define XFS_BMAPI_IGSTATE 0x010 /* Ignore state - */
72#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */
73#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */
74 /* combine contig. space */ 72 /* combine contig. space */
75#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */ 73#define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */
76/* 74/*
77 * unwritten extent conversion - this needs write cache flushing and no additional 75 * unwritten extent conversion - this needs write cache flushing and no additional
78 * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts 76 * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
79 * from written to unwritten, otherwise convert from unwritten to written. 77 * from written to unwritten, otherwise convert from unwritten to written.
80 */ 78 */
81#define XFS_BMAPI_CONVERT 0x200 79#define XFS_BMAPI_CONVERT 0x040
82 80
83#define XFS_BMAPI_FLAGS \ 81#define XFS_BMAPI_FLAGS \
84 { XFS_BMAPI_WRITE, "WRITE" }, \
85 { XFS_BMAPI_DELAY, "DELAY" }, \
86 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ 82 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \
87 { XFS_BMAPI_METADATA, "METADATA" }, \ 83 { XFS_BMAPI_METADATA, "METADATA" }, \
88 { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ 84 { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \
@@ -113,21 +109,28 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
113 * Argument structure for xfs_bmap_alloc. 109 * Argument structure for xfs_bmap_alloc.
114 */ 110 */
115typedef struct xfs_bmalloca { 111typedef struct xfs_bmalloca {
116 xfs_fsblock_t firstblock; /* i/o first block allocated */ 112 xfs_fsblock_t *firstblock; /* i/o first block allocated */
117 xfs_fsblock_t rval; /* starting block of new extent */ 113 struct xfs_bmap_free *flist; /* bmap freelist */
118 xfs_fileoff_t off; /* offset in file filling in */
119 struct xfs_trans *tp; /* transaction pointer */ 114 struct xfs_trans *tp; /* transaction pointer */
120 struct xfs_inode *ip; /* incore inode pointer */ 115 struct xfs_inode *ip; /* incore inode pointer */
121 struct xfs_bmbt_irec *prevp; /* extent before the new one */ 116 struct xfs_bmbt_irec prev; /* extent before the new one */
122 struct xfs_bmbt_irec *gotp; /* extent after, or delayed */ 117 struct xfs_bmbt_irec got; /* extent after, or delayed */
123 xfs_extlen_t alen; /* i/o length asked/allocated */ 118
119 xfs_fileoff_t offset; /* offset in file filling in */
120 xfs_extlen_t length; /* i/o length asked/allocated */
121 xfs_fsblock_t blkno; /* starting block of new extent */
122
123 struct xfs_btree_cur *cur; /* btree cursor */
124 xfs_extnum_t idx; /* current extent index */
125 int nallocs;/* number of extents alloc'd */
126 int logflags;/* flags for transaction logging */
127
124 xfs_extlen_t total; /* total blocks needed for xaction */ 128 xfs_extlen_t total; /* total blocks needed for xaction */
125 xfs_extlen_t minlen; /* minimum allocation size (blocks) */ 129 xfs_extlen_t minlen; /* minimum allocation size (blocks) */
126 xfs_extlen_t minleft; /* amount must be left after alloc */ 130 xfs_extlen_t minleft; /* amount must be left after alloc */
127 char eof; /* set if allocating past last extent */ 131 char eof; /* set if allocating past last extent */
128 char wasdel; /* replacing a delayed allocation */ 132 char wasdel; /* replacing a delayed allocation */
129 char userdata;/* set if is user data */ 133 char userdata;/* set if is user data */
130 char low; /* low on space, using seq'l ags */
131 char aeof; /* allocated space at eof */ 134 char aeof; /* allocated space at eof */
132 char conv; /* overwriting unwritten extents */ 135 char conv; /* overwriting unwritten extents */
133} xfs_bmalloca_t; 136} xfs_bmalloca_t;
@@ -152,251 +155,62 @@ typedef struct xfs_bmalloca {
152 { BMAP_RIGHT_FILLING, "RF" }, \ 155 { BMAP_RIGHT_FILLING, "RF" }, \
153 { BMAP_ATTRFORK, "ATTR" } 156 { BMAP_ATTRFORK, "ATTR" }
154 157
155/*
156 * Add bmap trace insert entries for all the contents of the extent list.
157 *
158 * Quite excessive tracing. Only do this for debug builds.
159 */
160#if defined(__KERNEL) && defined(DEBUG) 158#if defined(__KERNEL) && defined(DEBUG)
161void 159void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
162xfs_bmap_trace_exlist( 160 int whichfork, unsigned long caller_ip);
163 struct xfs_inode *ip, /* incore inode pointer */
164 xfs_extnum_t cnt, /* count of entries in list */
165 int whichfork,
166 unsigned long caller_ip); /* data or attr fork */
167#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ 161#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \
168 xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_) 162 xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_)
169#else 163#else
170#define XFS_BMAP_TRACE_EXLIST(ip,c,w) 164#define XFS_BMAP_TRACE_EXLIST(ip,c,w)
171#endif 165#endif
172 166
173/* 167int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
174 * Convert inode from non-attributed to attributed. 168void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
175 * Must not be in a transaction, ip must not be locked. 169 struct xfs_bmap_free *flist, struct xfs_mount *mp);
176 */ 170void xfs_bmap_cancel(struct xfs_bmap_free *flist);
177int /* error code */ 171void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
178xfs_bmap_add_attrfork( 172int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
179 struct xfs_inode *ip, /* incore inode pointer */ 173 xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
180 int size, /* space needed for new attribute */ 174int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip,
181 int rsvd); /* flag for reserved block allocation */ 175 xfs_fileoff_t *last_block, int whichfork);
182 176int xfs_bmap_last_offset(struct xfs_trans *tp, struct xfs_inode *ip,
183/* 177 xfs_fileoff_t *unused, int whichfork);
184 * Add the extent to the list of extents to be free at transaction end. 178int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork);
185 * The list is maintained sorted (by block number). 179int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
186 */ 180 int whichfork);
187void 181int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
188xfs_bmap_add_free( 182 xfs_filblks_t len, struct xfs_bmbt_irec *mval,
189 xfs_fsblock_t bno, /* fs block number of extent */ 183 int *nmap, int flags);
190 xfs_filblks_t len, /* length of extent */ 184int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno,
191 xfs_bmap_free_t *flist, /* list of extents */ 185 xfs_filblks_t len, struct xfs_bmbt_irec *mval,
192 struct xfs_mount *mp); /* mount point structure */ 186 int *nmap, int flags);
193 187int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
194/* 188 xfs_fileoff_t bno, xfs_filblks_t len, int flags,
195 * Routine to clean up the free list data structure when 189 xfs_fsblock_t *firstblock, xfs_extlen_t total,
196 * an error occurs during a transaction. 190 struct xfs_bmbt_irec *mval, int *nmap,
197 */ 191 struct xfs_bmap_free *flist);
198void 192int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
199xfs_bmap_cancel( 193 xfs_fileoff_t bno, xfs_filblks_t len, int flags,
200 xfs_bmap_free_t *flist); /* free list to clean up */ 194 xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
201 195 struct xfs_bmap_free *flist, int *done);
202/* 196int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
203 * Compute and fill in the value of the maximum depth of a bmap btree 197 xfs_extnum_t num);
204 * in this filesystem. Done once, during mount. 198uint xfs_default_attroffset(struct xfs_inode *ip);
205 */
206void
207xfs_bmap_compute_maxlevels(
208 struct xfs_mount *mp, /* file system mount structure */
209 int whichfork); /* data or attr fork */
210
211/*
212 * Returns the file-relative block number of the first unused block in the file.
213 * This is the lowest-address hole if the file has holes, else the first block
214 * past the end of file.
215 */
216int /* error */
217xfs_bmap_first_unused(
218 struct xfs_trans *tp, /* transaction pointer */
219 struct xfs_inode *ip, /* incore inode */
220 xfs_extlen_t len, /* size of hole to find */
221 xfs_fileoff_t *unused, /* unused block num */
222 int whichfork); /* data or attr fork */
223
224/*
225 * Returns the file-relative block number of the last block + 1 before
226 * last_block (input value) in the file.
227 * This is not based on i_size, it is based on the extent list.
228 * Returns 0 for local files, as they do not have an extent list.
229 */
230int /* error */
231xfs_bmap_last_before(
232 struct xfs_trans *tp, /* transaction pointer */
233 struct xfs_inode *ip, /* incore inode */
234 xfs_fileoff_t *last_block, /* last block */
235 int whichfork); /* data or attr fork */
236
237/*
238 * Returns the file-relative block number of the first block past eof in
239 * the file. This is not based on i_size, it is based on the extent list.
240 * Returns 0 for local files, as they do not have an extent list.
241 */
242int /* error */
243xfs_bmap_last_offset(
244 struct xfs_trans *tp, /* transaction pointer */
245 struct xfs_inode *ip, /* incore inode */
246 xfs_fileoff_t *unused, /* last block num */
247 int whichfork); /* data or attr fork */
248
249/*
250 * Returns whether the selected fork of the inode has exactly one
251 * block or not. For the data fork we check this matches di_size,
252 * implying the file's range is 0..bsize-1.
253 */
254int
255xfs_bmap_one_block(
256 struct xfs_inode *ip, /* incore inode */
257 int whichfork); /* data or attr fork */
258
259/*
260 * Read in the extents to iu_extents.
261 * All inode fields are set up by caller, we just traverse the btree
262 * and copy the records in.
263 */
264int /* error */
265xfs_bmap_read_extents(
266 struct xfs_trans *tp, /* transaction pointer */
267 struct xfs_inode *ip, /* incore inode */
268 int whichfork); /* data or attr fork */
269
270/*
271 * Map file blocks to filesystem blocks.
272 * File range is given by the bno/len pair.
273 * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set)
274 * into a hole or past eof.
275 * Only allocates blocks from a single allocation group,
276 * to avoid locking problems.
277 * The returned value in "firstblock" from the first call in a transaction
278 * must be remembered and presented to subsequent calls in "firstblock".
279 * An upper bound for the number of blocks to be allocated is supplied to
280 * the first call in "total"; if no allocation group has that many free
281 * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
282 */
283int /* error */
284xfs_bmapi(
285 struct xfs_trans *tp, /* transaction pointer */
286 struct xfs_inode *ip, /* incore inode */
287 xfs_fileoff_t bno, /* starting file offs. mapped */
288 xfs_filblks_t len, /* length to map in file */
289 int flags, /* XFS_BMAPI_... */
290 xfs_fsblock_t *firstblock, /* first allocated block
291 controls a.g. for allocs */
292 xfs_extlen_t total, /* total blocks needed */
293 struct xfs_bmbt_irec *mval, /* output: map values */
294 int *nmap, /* i/o: mval size/count */
295 xfs_bmap_free_t *flist); /* i/o: list extents to free */
296
297/*
298 * Map file blocks to filesystem blocks, simple version.
299 * One block only, read-only.
300 * For flags, only the XFS_BMAPI_ATTRFORK flag is examined.
301 * For the other flag values, the effect is as if XFS_BMAPI_METADATA
302 * was set and all the others were clear.
303 */
304int /* error */
305xfs_bmapi_single(
306 struct xfs_trans *tp, /* transaction pointer */
307 struct xfs_inode *ip, /* incore inode */
308 int whichfork, /* data or attr fork */
309 xfs_fsblock_t *fsb, /* output: mapped block */
310 xfs_fileoff_t bno); /* starting file offs. mapped */
311
312/*
313 * Unmap (remove) blocks from a file.
314 * If nexts is nonzero then the number of extents to remove is limited to
315 * that value. If not all extents in the block range can be removed then
316 * *done is set.
317 */
318int /* error */
319xfs_bunmapi(
320 struct xfs_trans *tp, /* transaction pointer */
321 struct xfs_inode *ip, /* incore inode */
322 xfs_fileoff_t bno, /* starting offset to unmap */
323 xfs_filblks_t len, /* length to unmap in file */
324 int flags, /* XFS_BMAPI_... */
325 xfs_extnum_t nexts, /* number of extents max */
326 xfs_fsblock_t *firstblock, /* first allocated block
327 controls a.g. for allocs */
328 xfs_bmap_free_t *flist, /* i/o: list extents to free */
329 int *done); /* set if not done yet */
330
331/*
332 * Check an extent list, which has just been read, for
333 * any bit in the extent flag field.
334 */
335int
336xfs_check_nostate_extents(
337 struct xfs_ifork *ifp,
338 xfs_extnum_t idx,
339 xfs_extnum_t num);
340
341uint
342xfs_default_attroffset(
343 struct xfs_inode *ip);
344 199
345#ifdef __KERNEL__ 200#ifdef __KERNEL__
346
347/*
348 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
349 * caller. Frees all the extents that need freeing, which must be done
350 * last due to locking considerations.
351 *
352 * Return 1 if the given transaction was committed and a new one allocated,
353 * and 0 otherwise.
354 */
355int /* error */
356xfs_bmap_finish(
357 struct xfs_trans **tp, /* transaction pointer addr */
358 xfs_bmap_free_t *flist, /* i/o: list extents to free */
359 int *committed); /* xact committed or not */
360
361/* bmap to userspace formatter - copy to user & advance pointer */ 201/* bmap to userspace formatter - copy to user & advance pointer */
362typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *); 202typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
363 203
364/* 204int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
365 * Get inode's extents as described in bmv, and format for output. 205 int *committed);
366 */ 206int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
367int /* error code */ 207 xfs_bmap_format_t formatter, void *arg);
368xfs_getbmap( 208int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
369 xfs_inode_t *ip, 209 int whichfork, int *eof);
370 struct getbmapx *bmv, /* user bmap structure */ 210int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
371 xfs_bmap_format_t formatter, /* format to user */ 211 int whichfork, int *count);
372 void *arg); /* formatter arg */ 212int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
373 213 xfs_fileoff_t start_fsb, xfs_fileoff_t length);
374/*
375 * Check if the endoff is outside the last extent. If so the caller will grow
376 * the allocation to a stripe unit boundary
377 */
378int
379xfs_bmap_eof(
380 struct xfs_inode *ip,
381 xfs_fileoff_t endoff,
382 int whichfork,
383 int *eof);
384
385/*
386 * Count fsblocks of the given fork.
387 */
388int
389xfs_bmap_count_blocks(
390 xfs_trans_t *tp,
391 struct xfs_inode *ip,
392 int whichfork,
393 int *count);
394
395int
396xfs_bmap_punch_delalloc_range(
397 struct xfs_inode *ip,
398 xfs_fileoff_t start_fsb,
399 xfs_fileoff_t length);
400#endif /* __KERNEL__ */ 214#endif /* __KERNEL__ */
401 215
402#endif /* __XFS_BMAP_H__ */ 216#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 2b9fd385e27..1f19f03af9d 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -631,7 +631,7 @@ xfs_btree_read_bufl(
631 } 631 }
632 ASSERT(!xfs_buf_geterror(bp)); 632 ASSERT(!xfs_buf_geterror(bp));
633 if (bp) 633 if (bp)
634 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); 634 xfs_buf_set_ref(bp, refval);
635 *bpp = bp; 635 *bpp = bp;
636 return 0; 636 return 0;
637} 637}
@@ -939,13 +939,13 @@ xfs_btree_set_refs(
939 switch (cur->bc_btnum) { 939 switch (cur->bc_btnum) {
940 case XFS_BTNUM_BNO: 940 case XFS_BTNUM_BNO:
941 case XFS_BTNUM_CNT: 941 case XFS_BTNUM_CNT:
942 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF); 942 xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
943 break; 943 break;
944 case XFS_BTNUM_INO: 944 case XFS_BTNUM_INO:
945 XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF); 945 xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
946 break; 946 break;
947 case XFS_BTNUM_BMAP: 947 case XFS_BTNUM_BMAP:
948 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF); 948 xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
949 break; 949 break;
950 default: 950 default:
951 ASSERT(0); 951 ASSERT(0);
@@ -970,7 +970,8 @@ xfs_btree_get_buf_block(
970 *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, 970 *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
971 mp->m_bsize, flags); 971 mp->m_bsize, flags);
972 972
973 ASSERT(!xfs_buf_geterror(*bpp)); 973 if (!*bpp)
974 return ENOMEM;
974 975
975 *block = XFS_BUF_TO_BLOCK(*bpp); 976 *block = XFS_BUF_TO_BLOCK(*bpp);
976 return 0; 977 return 0;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index c57836dc778..cf0ac056815 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -43,7 +43,6 @@
43 43
44static kmem_zone_t *xfs_buf_zone; 44static kmem_zone_t *xfs_buf_zone;
45STATIC int xfsbufd(void *); 45STATIC int xfsbufd(void *);
46STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
47 46
48static struct workqueue_struct *xfslogd_workqueue; 47static struct workqueue_struct *xfslogd_workqueue;
49struct workqueue_struct *xfsdatad_workqueue; 48struct workqueue_struct *xfsdatad_workqueue;
@@ -66,10 +65,6 @@ struct workqueue_struct *xfsconvertd_workqueue;
66#define xb_to_km(flags) \ 65#define xb_to_km(flags) \
67 (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) 66 (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
68 67
69#define xfs_buf_allocate(flags) \
70 kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags))
71#define xfs_buf_deallocate(bp) \
72 kmem_zone_free(xfs_buf_zone, (bp));
73 68
74static inline int 69static inline int
75xfs_buf_is_vmapped( 70xfs_buf_is_vmapped(
@@ -152,6 +147,7 @@ xfs_buf_stale(
152 struct xfs_buf *bp) 147 struct xfs_buf *bp)
153{ 148{
154 bp->b_flags |= XBF_STALE; 149 bp->b_flags |= XBF_STALE;
150 xfs_buf_delwri_dequeue(bp);
155 atomic_set(&(bp)->b_lru_ref, 0); 151 atomic_set(&(bp)->b_lru_ref, 0);
156 if (!list_empty(&bp->b_lru)) { 152 if (!list_empty(&bp->b_lru)) {
157 struct xfs_buftarg *btp = bp->b_target; 153 struct xfs_buftarg *btp = bp->b_target;
@@ -167,14 +163,19 @@ xfs_buf_stale(
167 ASSERT(atomic_read(&bp->b_hold) >= 1); 163 ASSERT(atomic_read(&bp->b_hold) >= 1);
168} 164}
169 165
170STATIC void 166struct xfs_buf *
171_xfs_buf_initialize( 167xfs_buf_alloc(
172 xfs_buf_t *bp, 168 struct xfs_buftarg *target,
173 xfs_buftarg_t *target,
174 xfs_off_t range_base, 169 xfs_off_t range_base,
175 size_t range_length, 170 size_t range_length,
176 xfs_buf_flags_t flags) 171 xfs_buf_flags_t flags)
177{ 172{
173 struct xfs_buf *bp;
174
175 bp = kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags));
176 if (unlikely(!bp))
177 return NULL;
178
178 /* 179 /*
179 * We don't want certain flags to appear in b_flags. 180 * We don't want certain flags to appear in b_flags.
180 */ 181 */
@@ -203,8 +204,9 @@ _xfs_buf_initialize(
203 init_waitqueue_head(&bp->b_waiters); 204 init_waitqueue_head(&bp->b_waiters);
204 205
205 XFS_STATS_INC(xb_create); 206 XFS_STATS_INC(xb_create);
206
207 trace_xfs_buf_init(bp, _RET_IP_); 207 trace_xfs_buf_init(bp, _RET_IP_);
208
209 return bp;
208} 210}
209 211
210/* 212/*
@@ -277,7 +279,7 @@ xfs_buf_free(
277 } else if (bp->b_flags & _XBF_KMEM) 279 } else if (bp->b_flags & _XBF_KMEM)
278 kmem_free(bp->b_addr); 280 kmem_free(bp->b_addr);
279 _xfs_buf_free_pages(bp); 281 _xfs_buf_free_pages(bp);
280 xfs_buf_deallocate(bp); 282 kmem_zone_free(xfs_buf_zone, bp);
281} 283}
282 284
283/* 285/*
@@ -416,10 +418,7 @@ _xfs_buf_map_pages(
416/* 418/*
417 * Look up, and creates if absent, a lockable buffer for 419 * Look up, and creates if absent, a lockable buffer for
418 * a given range of an inode. The buffer is returned 420 * a given range of an inode. The buffer is returned
419 * locked. If other overlapping buffers exist, they are 421 * locked. No I/O is implied by this call.
420 * released before the new buffer is created and locked,
421 * which may imply that this call will block until those buffers
422 * are unlocked. No I/O is implied by this call.
423 */ 422 */
424xfs_buf_t * 423xfs_buf_t *
425_xfs_buf_find( 424_xfs_buf_find(
@@ -481,8 +480,6 @@ _xfs_buf_find(
481 480
482 /* No match found */ 481 /* No match found */
483 if (new_bp) { 482 if (new_bp) {
484 _xfs_buf_initialize(new_bp, btp, range_base,
485 range_length, flags);
486 rb_link_node(&new_bp->b_rbnode, parent, rbp); 483 rb_link_node(&new_bp->b_rbnode, parent, rbp);
487 rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); 484 rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
488 /* the buffer keeps the perag reference until it is freed */ 485 /* the buffer keeps the perag reference until it is freed */
@@ -525,35 +522,51 @@ found:
525} 522}
526 523
527/* 524/*
528 * Assembles a buffer covering the specified range. 525 * Assembles a buffer covering the specified range. The code is optimised for
529 * Storage in memory for all portions of the buffer will be allocated, 526 * cache hits, as metadata intensive workloads will see 3 orders of magnitude
530 * although backing storage may not be. 527 * more hits than misses.
531 */ 528 */
532xfs_buf_t * 529struct xfs_buf *
533xfs_buf_get( 530xfs_buf_get(
534 xfs_buftarg_t *target,/* target for buffer */ 531 xfs_buftarg_t *target,/* target for buffer */
535 xfs_off_t ioff, /* starting offset of range */ 532 xfs_off_t ioff, /* starting offset of range */
536 size_t isize, /* length of range */ 533 size_t isize, /* length of range */
537 xfs_buf_flags_t flags) 534 xfs_buf_flags_t flags)
538{ 535{
539 xfs_buf_t *bp, *new_bp; 536 struct xfs_buf *bp;
537 struct xfs_buf *new_bp;
540 int error = 0; 538 int error = 0;
541 539
542 new_bp = xfs_buf_allocate(flags); 540 bp = _xfs_buf_find(target, ioff, isize, flags, NULL);
541 if (likely(bp))
542 goto found;
543
544 new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT,
545 flags);
543 if (unlikely(!new_bp)) 546 if (unlikely(!new_bp))
544 return NULL; 547 return NULL;
545 548
546 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); 549 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
550 if (!bp) {
551 kmem_zone_free(xfs_buf_zone, new_bp);
552 return NULL;
553 }
554
547 if (bp == new_bp) { 555 if (bp == new_bp) {
548 error = xfs_buf_allocate_memory(bp, flags); 556 error = xfs_buf_allocate_memory(bp, flags);
549 if (error) 557 if (error)
550 goto no_buffer; 558 goto no_buffer;
551 } else { 559 } else
552 xfs_buf_deallocate(new_bp); 560 kmem_zone_free(xfs_buf_zone, new_bp);
553 if (unlikely(bp == NULL))
554 return NULL;
555 }
556 561
562 /*
563 * Now we have a workable buffer, fill in the block number so
564 * that we can do IO on it.
565 */
566 bp->b_bn = ioff;
567 bp->b_count_desired = bp->b_buffer_length;
568
569found:
557 if (!(bp->b_flags & XBF_MAPPED)) { 570 if (!(bp->b_flags & XBF_MAPPED)) {
558 error = _xfs_buf_map_pages(bp, flags); 571 error = _xfs_buf_map_pages(bp, flags);
559 if (unlikely(error)) { 572 if (unlikely(error)) {
@@ -564,18 +577,10 @@ xfs_buf_get(
564 } 577 }
565 578
566 XFS_STATS_INC(xb_get); 579 XFS_STATS_INC(xb_get);
567
568 /*
569 * Always fill in the block number now, the mapped cases can do
570 * their own overlay of this later.
571 */
572 bp->b_bn = ioff;
573 bp->b_count_desired = bp->b_buffer_length;
574
575 trace_xfs_buf_get(bp, flags, _RET_IP_); 580 trace_xfs_buf_get(bp, flags, _RET_IP_);
576 return bp; 581 return bp;
577 582
578 no_buffer: 583no_buffer:
579 if (flags & (XBF_LOCK | XBF_TRYLOCK)) 584 if (flags & (XBF_LOCK | XBF_TRYLOCK))
580 xfs_buf_unlock(bp); 585 xfs_buf_unlock(bp);
581 xfs_buf_rele(bp); 586 xfs_buf_rele(bp);
@@ -689,19 +694,6 @@ xfs_buf_read_uncached(
689 return bp; 694 return bp;
690} 695}
691 696
692xfs_buf_t *
693xfs_buf_get_empty(
694 size_t len,
695 xfs_buftarg_t *target)
696{
697 xfs_buf_t *bp;
698
699 bp = xfs_buf_allocate(0);
700 if (bp)
701 _xfs_buf_initialize(bp, target, 0, len, 0);
702 return bp;
703}
704
705/* 697/*
706 * Return a buffer allocated as an empty buffer and associated to external 698 * Return a buffer allocated as an empty buffer and associated to external
707 * memory via xfs_buf_associate_memory() back to it's empty state. 699 * memory via xfs_buf_associate_memory() back to it's empty state.
@@ -787,10 +779,9 @@ xfs_buf_get_uncached(
787 int error, i; 779 int error, i;
788 xfs_buf_t *bp; 780 xfs_buf_t *bp;
789 781
790 bp = xfs_buf_allocate(0); 782 bp = xfs_buf_alloc(target, 0, len, 0);
791 if (unlikely(bp == NULL)) 783 if (unlikely(bp == NULL))
792 goto fail; 784 goto fail;
793 _xfs_buf_initialize(bp, target, 0, len, 0);
794 785
795 error = _xfs_buf_get_pages(bp, page_count, 0); 786 error = _xfs_buf_get_pages(bp, page_count, 0);
796 if (error) 787 if (error)
@@ -818,7 +809,7 @@ xfs_buf_get_uncached(
818 __free_page(bp->b_pages[i]); 809 __free_page(bp->b_pages[i]);
819 _xfs_buf_free_pages(bp); 810 _xfs_buf_free_pages(bp);
820 fail_free_buf: 811 fail_free_buf:
821 xfs_buf_deallocate(bp); 812 kmem_zone_free(xfs_buf_zone, bp);
822 fail: 813 fail:
823 return NULL; 814 return NULL;
824} 815}
@@ -937,12 +928,6 @@ void
937xfs_buf_unlock( 928xfs_buf_unlock(
938 struct xfs_buf *bp) 929 struct xfs_buf *bp)
939{ 930{
940 if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) {
941 atomic_inc(&bp->b_hold);
942 bp->b_flags |= XBF_ASYNC;
943 xfs_buf_delwri_queue(bp, 0);
944 }
945
946 XB_CLEAR_OWNER(bp); 931 XB_CLEAR_OWNER(bp);
947 up(&bp->b_sema); 932 up(&bp->b_sema);
948 933
@@ -1019,9 +1004,19 @@ xfs_buf_ioerror(
1019 trace_xfs_buf_ioerror(bp, error, _RET_IP_); 1004 trace_xfs_buf_ioerror(bp, error, _RET_IP_);
1020} 1005}
1021 1006
1007void
1008xfs_buf_ioerror_alert(
1009 struct xfs_buf *bp,
1010 const char *func)
1011{
1012 xfs_alert(bp->b_target->bt_mount,
1013"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd",
1014 (__uint64_t)XFS_BUF_ADDR(bp), func,
1015 bp->b_error, XFS_BUF_COUNT(bp));
1016}
1017
1022int 1018int
1023xfs_bwrite( 1019xfs_bwrite(
1024 struct xfs_mount *mp,
1025 struct xfs_buf *bp) 1020 struct xfs_buf *bp)
1026{ 1021{
1027 int error; 1022 int error;
@@ -1033,25 +1028,13 @@ xfs_bwrite(
1033 xfs_bdstrat_cb(bp); 1028 xfs_bdstrat_cb(bp);
1034 1029
1035 error = xfs_buf_iowait(bp); 1030 error = xfs_buf_iowait(bp);
1036 if (error) 1031 if (error) {
1037 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1032 xfs_force_shutdown(bp->b_target->bt_mount,
1038 xfs_buf_relse(bp); 1033 SHUTDOWN_META_IO_ERROR);
1034 }
1039 return error; 1035 return error;
1040} 1036}
1041 1037
1042void
1043xfs_bdwrite(
1044 void *mp,
1045 struct xfs_buf *bp)
1046{
1047 trace_xfs_buf_bdwrite(bp, _RET_IP_);
1048
1049 bp->b_flags &= ~XBF_READ;
1050 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
1051
1052 xfs_buf_delwri_queue(bp, 1);
1053}
1054
1055/* 1038/*
1056 * Called when we want to stop a buffer from getting written or read. 1039 * Called when we want to stop a buffer from getting written or read.
1057 * We attach the EIO error, muck with its flags, and call xfs_buf_ioend 1040 * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
@@ -1074,9 +1057,8 @@ xfs_bioerror(
1074 * We're calling xfs_buf_ioend, so delete XBF_DONE flag. 1057 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
1075 */ 1058 */
1076 XFS_BUF_UNREAD(bp); 1059 XFS_BUF_UNREAD(bp);
1077 XFS_BUF_UNDELAYWRITE(bp);
1078 XFS_BUF_UNDONE(bp); 1060 XFS_BUF_UNDONE(bp);
1079 XFS_BUF_STALE(bp); 1061 xfs_buf_stale(bp);
1080 1062
1081 xfs_buf_ioend(bp, 0); 1063 xfs_buf_ioend(bp, 0);
1082 1064
@@ -1103,9 +1085,8 @@ xfs_bioerror_relse(
1103 * change that interface. 1085 * change that interface.
1104 */ 1086 */
1105 XFS_BUF_UNREAD(bp); 1087 XFS_BUF_UNREAD(bp);
1106 XFS_BUF_UNDELAYWRITE(bp);
1107 XFS_BUF_DONE(bp); 1088 XFS_BUF_DONE(bp);
1108 XFS_BUF_STALE(bp); 1089 xfs_buf_stale(bp);
1109 bp->b_iodone = NULL; 1090 bp->b_iodone = NULL;
1110 if (!(fl & XBF_ASYNC)) { 1091 if (!(fl & XBF_ASYNC)) {
1111 /* 1092 /*
@@ -1115,7 +1096,7 @@ xfs_bioerror_relse(
1115 * ASYNC buffers. 1096 * ASYNC buffers.
1116 */ 1097 */
1117 xfs_buf_ioerror(bp, EIO); 1098 xfs_buf_ioerror(bp, EIO);
1118 XFS_BUF_FINISH_IOWAIT(bp); 1099 complete(&bp->b_iowait);
1119 } else { 1100 } else {
1120 xfs_buf_relse(bp); 1101 xfs_buf_relse(bp);
1121 } 1102 }
@@ -1275,15 +1256,10 @@ xfs_buf_iorequest(
1275{ 1256{
1276 trace_xfs_buf_iorequest(bp, _RET_IP_); 1257 trace_xfs_buf_iorequest(bp, _RET_IP_);
1277 1258
1278 if (bp->b_flags & XBF_DELWRI) { 1259 ASSERT(!(bp->b_flags & XBF_DELWRI));
1279 xfs_buf_delwri_queue(bp, 1);
1280 return 0;
1281 }
1282 1260
1283 if (bp->b_flags & XBF_WRITE) { 1261 if (bp->b_flags & XBF_WRITE)
1284 xfs_buf_wait_unpin(bp); 1262 xfs_buf_wait_unpin(bp);
1285 }
1286
1287 xfs_buf_hold(bp); 1263 xfs_buf_hold(bp);
1288 1264
1289 /* Set the count to 1 initially, this will stop an I/O 1265 /* Set the count to 1 initially, this will stop an I/O
@@ -1481,9 +1457,13 @@ xfs_setsize_buftarg_flags(
1481 btp->bt_smask = sectorsize - 1; 1457 btp->bt_smask = sectorsize - 1;
1482 1458
1483 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1459 if (set_blocksize(btp->bt_bdev, sectorsize)) {
1460 char name[BDEVNAME_SIZE];
1461
1462 bdevname(btp->bt_bdev, name);
1463
1484 xfs_warn(btp->bt_mount, 1464 xfs_warn(btp->bt_mount,
1485 "Cannot set_blocksize to %u on device %s\n", 1465 "Cannot set_blocksize to %u on device %s\n",
1486 sectorsize, xfs_buf_target_name(btp)); 1466 sectorsize, name);
1487 return EINVAL; 1467 return EINVAL;
1488 } 1468 }
1489 1469
@@ -1514,12 +1494,12 @@ xfs_setsize_buftarg(
1514} 1494}
1515 1495
1516STATIC int 1496STATIC int
1517xfs_alloc_delwrite_queue( 1497xfs_alloc_delwri_queue(
1518 xfs_buftarg_t *btp, 1498 xfs_buftarg_t *btp,
1519 const char *fsname) 1499 const char *fsname)
1520{ 1500{
1521 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1501 INIT_LIST_HEAD(&btp->bt_delwri_queue);
1522 spin_lock_init(&btp->bt_delwrite_lock); 1502 spin_lock_init(&btp->bt_delwri_lock);
1523 btp->bt_flags = 0; 1503 btp->bt_flags = 0;
1524 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); 1504 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1525 if (IS_ERR(btp->bt_task)) 1505 if (IS_ERR(btp->bt_task))
@@ -1549,7 +1529,7 @@ xfs_alloc_buftarg(
1549 spin_lock_init(&btp->bt_lru_lock); 1529 spin_lock_init(&btp->bt_lru_lock);
1550 if (xfs_setsize_buftarg_early(btp, bdev)) 1530 if (xfs_setsize_buftarg_early(btp, bdev))
1551 goto error; 1531 goto error;
1552 if (xfs_alloc_delwrite_queue(btp, fsname)) 1532 if (xfs_alloc_delwri_queue(btp, fsname))
1553 goto error; 1533 goto error;
1554 btp->bt_shrinker.shrink = xfs_buftarg_shrink; 1534 btp->bt_shrinker.shrink = xfs_buftarg_shrink;
1555 btp->bt_shrinker.seeks = DEFAULT_SEEKS; 1535 btp->bt_shrinker.seeks = DEFAULT_SEEKS;
@@ -1565,56 +1545,48 @@ error:
1565/* 1545/*
1566 * Delayed write buffer handling 1546 * Delayed write buffer handling
1567 */ 1547 */
1568STATIC void 1548void
1569xfs_buf_delwri_queue( 1549xfs_buf_delwri_queue(
1570 xfs_buf_t *bp, 1550 xfs_buf_t *bp)
1571 int unlock)
1572{ 1551{
1573 struct list_head *dwq = &bp->b_target->bt_delwrite_queue; 1552 struct xfs_buftarg *btp = bp->b_target;
1574 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock;
1575 1553
1576 trace_xfs_buf_delwri_queue(bp, _RET_IP_); 1554 trace_xfs_buf_delwri_queue(bp, _RET_IP_);
1577 1555
1578 ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC)); 1556 ASSERT(!(bp->b_flags & XBF_READ));
1579 1557
1580 spin_lock(dwlk); 1558 spin_lock(&btp->bt_delwri_lock);
1581 /* If already in the queue, dequeue and place at tail */
1582 if (!list_empty(&bp->b_list)) { 1559 if (!list_empty(&bp->b_list)) {
1560 /* if already in the queue, move it to the tail */
1583 ASSERT(bp->b_flags & _XBF_DELWRI_Q); 1561 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1584 if (unlock) 1562 list_move_tail(&bp->b_list, &btp->bt_delwri_queue);
1585 atomic_dec(&bp->b_hold); 1563 } else {
1586 list_del(&bp->b_list);
1587 }
1588
1589 if (list_empty(dwq)) {
1590 /* start xfsbufd as it is about to have something to do */ 1564 /* start xfsbufd as it is about to have something to do */
1591 wake_up_process(bp->b_target->bt_task); 1565 if (list_empty(&btp->bt_delwri_queue))
1592 } 1566 wake_up_process(bp->b_target->bt_task);
1593 1567
1594 bp->b_flags |= _XBF_DELWRI_Q; 1568 atomic_inc(&bp->b_hold);
1595 list_add_tail(&bp->b_list, dwq); 1569 bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC;
1570 list_add_tail(&bp->b_list, &btp->bt_delwri_queue);
1571 }
1596 bp->b_queuetime = jiffies; 1572 bp->b_queuetime = jiffies;
1597 spin_unlock(dwlk); 1573 spin_unlock(&btp->bt_delwri_lock);
1598
1599 if (unlock)
1600 xfs_buf_unlock(bp);
1601} 1574}
1602 1575
1603void 1576void
1604xfs_buf_delwri_dequeue( 1577xfs_buf_delwri_dequeue(
1605 xfs_buf_t *bp) 1578 xfs_buf_t *bp)
1606{ 1579{
1607 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock;
1608 int dequeued = 0; 1580 int dequeued = 0;
1609 1581
1610 spin_lock(dwlk); 1582 spin_lock(&bp->b_target->bt_delwri_lock);
1611 if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { 1583 if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
1612 ASSERT(bp->b_flags & _XBF_DELWRI_Q); 1584 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1613 list_del_init(&bp->b_list); 1585 list_del_init(&bp->b_list);
1614 dequeued = 1; 1586 dequeued = 1;
1615 } 1587 }
1616 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); 1588 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
1617 spin_unlock(dwlk); 1589 spin_unlock(&bp->b_target->bt_delwri_lock);
1618 1590
1619 if (dequeued) 1591 if (dequeued)
1620 xfs_buf_rele(bp); 1592 xfs_buf_rele(bp);
@@ -1646,16 +1618,9 @@ xfs_buf_delwri_promote(
1646 if (bp->b_queuetime < jiffies - age) 1618 if (bp->b_queuetime < jiffies - age)
1647 return; 1619 return;
1648 bp->b_queuetime = jiffies - age; 1620 bp->b_queuetime = jiffies - age;
1649 spin_lock(&btp->bt_delwrite_lock); 1621 spin_lock(&btp->bt_delwri_lock);
1650 list_move(&bp->b_list, &btp->bt_delwrite_queue); 1622 list_move(&bp->b_list, &btp->bt_delwri_queue);
1651 spin_unlock(&btp->bt_delwrite_lock); 1623 spin_unlock(&btp->bt_delwri_lock);
1652}
1653
1654STATIC void
1655xfs_buf_runall_queues(
1656 struct workqueue_struct *queue)
1657{
1658 flush_workqueue(queue);
1659} 1624}
1660 1625
1661/* 1626/*
@@ -1669,15 +1634,13 @@ xfs_buf_delwri_split(
1669 unsigned long age) 1634 unsigned long age)
1670{ 1635{
1671 xfs_buf_t *bp, *n; 1636 xfs_buf_t *bp, *n;
1672 struct list_head *dwq = &target->bt_delwrite_queue;
1673 spinlock_t *dwlk = &target->bt_delwrite_lock;
1674 int skipped = 0; 1637 int skipped = 0;
1675 int force; 1638 int force;
1676 1639
1677 force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); 1640 force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1678 INIT_LIST_HEAD(list); 1641 INIT_LIST_HEAD(list);
1679 spin_lock(dwlk); 1642 spin_lock(&target->bt_delwri_lock);
1680 list_for_each_entry_safe(bp, n, dwq, b_list) { 1643 list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) {
1681 ASSERT(bp->b_flags & XBF_DELWRI); 1644 ASSERT(bp->b_flags & XBF_DELWRI);
1682 1645
1683 if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) { 1646 if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) {
@@ -1694,10 +1657,9 @@ xfs_buf_delwri_split(
1694 } else 1657 } else
1695 skipped++; 1658 skipped++;
1696 } 1659 }
1697 spin_unlock(dwlk);
1698 1660
1661 spin_unlock(&target->bt_delwri_lock);
1699 return skipped; 1662 return skipped;
1700
1701} 1663}
1702 1664
1703/* 1665/*
@@ -1747,7 +1709,7 @@ xfsbufd(
1747 } 1709 }
1748 1710
1749 /* sleep for a long time if there is nothing to do. */ 1711 /* sleep for a long time if there is nothing to do. */
1750 if (list_empty(&target->bt_delwrite_queue)) 1712 if (list_empty(&target->bt_delwri_queue))
1751 tout = MAX_SCHEDULE_TIMEOUT; 1713 tout = MAX_SCHEDULE_TIMEOUT;
1752 schedule_timeout_interruptible(tout); 1714 schedule_timeout_interruptible(tout);
1753 1715
@@ -1783,9 +1745,7 @@ xfs_flush_buftarg(
1783 LIST_HEAD(wait_list); 1745 LIST_HEAD(wait_list);
1784 struct blk_plug plug; 1746 struct blk_plug plug;
1785 1747
1786 xfs_buf_runall_queues(xfsconvertd_workqueue); 1748 flush_workqueue(xfslogd_workqueue);
1787 xfs_buf_runall_queues(xfsdatad_workqueue);
1788 xfs_buf_runall_queues(xfslogd_workqueue);
1789 1749
1790 set_bit(XBT_FORCE_FLUSH, &target->bt_flags); 1750 set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1791 pincount = xfs_buf_delwri_split(target, &tmp_list, 0); 1751 pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
@@ -1866,11 +1826,3 @@ xfs_buf_terminate(void)
1866 destroy_workqueue(xfslogd_workqueue); 1826 destroy_workqueue(xfslogd_workqueue);
1867 kmem_zone_destroy(xfs_buf_zone); 1827 kmem_zone_destroy(xfs_buf_zone);
1868} 1828}
1869
1870#ifdef CONFIG_KDB_MODULES
1871struct list_head *
1872xfs_get_buftarg_list(void)
1873{
1874 return &xfs_buftarg_list;
1875}
1876#endif
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 620972b8094..5bab046e859 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -105,8 +105,8 @@ typedef struct xfs_buftarg {
105 105
106 /* per device delwri queue */ 106 /* per device delwri queue */
107 struct task_struct *bt_task; 107 struct task_struct *bt_task;
108 struct list_head bt_delwrite_queue; 108 struct list_head bt_delwri_queue;
109 spinlock_t bt_delwrite_lock; 109 spinlock_t bt_delwri_lock;
110 unsigned long bt_flags; 110 unsigned long bt_flags;
111 111
112 /* LRU control structures */ 112 /* LRU control structures */
@@ -175,7 +175,8 @@ extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t,
175extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, 175extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
176 xfs_buf_flags_t); 176 xfs_buf_flags_t);
177 177
178extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); 178struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *, xfs_off_t, size_t,
179 xfs_buf_flags_t);
179extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len); 180extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len);
180extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); 181extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
181extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); 182extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
@@ -197,14 +198,14 @@ extern void xfs_buf_unlock(xfs_buf_t *);
197 ((bp)->b_sema.count <= 0) 198 ((bp)->b_sema.count <= 0)
198 199
199/* Buffer Read and Write Routines */ 200/* Buffer Read and Write Routines */
200extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); 201extern int xfs_bwrite(struct xfs_buf *bp);
201extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
202 202
203extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); 203extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
204extern int xfs_bdstrat_cb(struct xfs_buf *); 204extern int xfs_bdstrat_cb(struct xfs_buf *);
205 205
206extern void xfs_buf_ioend(xfs_buf_t *, int); 206extern void xfs_buf_ioend(xfs_buf_t *, int);
207extern void xfs_buf_ioerror(xfs_buf_t *, int); 207extern void xfs_buf_ioerror(xfs_buf_t *, int);
208extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
208extern int xfs_buf_iorequest(xfs_buf_t *); 209extern int xfs_buf_iorequest(xfs_buf_t *);
209extern int xfs_buf_iowait(xfs_buf_t *); 210extern int xfs_buf_iowait(xfs_buf_t *);
210extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, 211extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
@@ -221,38 +222,22 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp)
221extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); 222extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
222 223
223/* Delayed Write Buffer Routines */ 224/* Delayed Write Buffer Routines */
224extern void xfs_buf_delwri_dequeue(xfs_buf_t *); 225extern void xfs_buf_delwri_queue(struct xfs_buf *);
225extern void xfs_buf_delwri_promote(xfs_buf_t *); 226extern void xfs_buf_delwri_dequeue(struct xfs_buf *);
227extern void xfs_buf_delwri_promote(struct xfs_buf *);
226 228
227/* Buffer Daemon Setup Routines */ 229/* Buffer Daemon Setup Routines */
228extern int xfs_buf_init(void); 230extern int xfs_buf_init(void);
229extern void xfs_buf_terminate(void); 231extern void xfs_buf_terminate(void);
230 232
231static inline const char *
232xfs_buf_target_name(struct xfs_buftarg *target)
233{
234 static char __b[BDEVNAME_SIZE];
235
236 return bdevname(target->bt_bdev, __b);
237}
238
239
240#define XFS_BUF_ZEROFLAGS(bp) \ 233#define XFS_BUF_ZEROFLAGS(bp) \
241 ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \ 234 ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \
242 XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) 235 XBF_SYNCIO|XBF_FUA|XBF_FLUSH))
243 236
244void xfs_buf_stale(struct xfs_buf *bp); 237void xfs_buf_stale(struct xfs_buf *bp);
245#define XFS_BUF_STALE(bp) xfs_buf_stale(bp);
246#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) 238#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
247#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) 239#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
248#define XFS_BUF_SUPER_STALE(bp) do { \ 240
249 XFS_BUF_STALE(bp); \
250 xfs_buf_delwri_dequeue(bp); \
251 XFS_BUF_DONE(bp); \
252 } while (0)
253
254#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI)
255#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp)
256#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) 241#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
257 242
258#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) 243#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE)
@@ -280,23 +265,16 @@ void xfs_buf_stale(struct xfs_buf *bp);
280#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) 265#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
281#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) 266#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
282 267
283static inline void 268static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
284xfs_buf_set_ref(
285 struct xfs_buf *bp,
286 int lru_ref)
287{ 269{
288 atomic_set(&bp->b_lru_ref, lru_ref); 270 atomic_set(&bp->b_lru_ref, lru_ref);
289} 271}
290#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref)
291#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
292 272
293static inline int xfs_buf_ispinned(struct xfs_buf *bp) 273static inline int xfs_buf_ispinned(struct xfs_buf *bp)
294{ 274{
295 return atomic_read(&bp->b_pin_count); 275 return atomic_read(&bp->b_pin_count);
296} 276}
297 277
298#define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait);
299
300static inline void xfs_buf_relse(xfs_buf_t *bp) 278static inline void xfs_buf_relse(xfs_buf_t *bp)
301{ 279{
302 xfs_buf_unlock(bp); 280 xfs_buf_unlock(bp);
@@ -313,14 +291,7 @@ extern void xfs_wait_buftarg(xfs_buftarg_t *);
313extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 291extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
314extern int xfs_flush_buftarg(xfs_buftarg_t *, int); 292extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
315 293
316#ifdef CONFIG_KDB_MODULES
317extern struct list_head *xfs_get_buftarg_list(void);
318#endif
319
320#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) 294#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
321#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) 295#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
322 296
323#define xfs_binval(buftarg) xfs_flush_buftarg(buftarg, 1)
324#define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1)
325
326#endif /* __XFS_BUF_H__ */ 297#endif /* __XFS_BUF_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index ef43fce519a..eac97ef81e2 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -656,7 +656,7 @@ xfs_buf_item_committing(
656/* 656/*
657 * This is the ops vector shared by all buf log items. 657 * This is the ops vector shared by all buf log items.
658 */ 658 */
659static struct xfs_item_ops xfs_buf_item_ops = { 659static const struct xfs_item_ops xfs_buf_item_ops = {
660 .iop_size = xfs_buf_item_size, 660 .iop_size = xfs_buf_item_size,
661 .iop_format = xfs_buf_item_format, 661 .iop_format = xfs_buf_item_format,
662 .iop_pin = xfs_buf_item_pin, 662 .iop_pin = xfs_buf_item_pin,
@@ -967,7 +967,8 @@ xfs_buf_iodone_callbacks(
967 * I/O errors, there's no point in giving this a retry. 967 * I/O errors, there's no point in giving this a retry.
968 */ 968 */
969 if (XFS_FORCED_SHUTDOWN(mp)) { 969 if (XFS_FORCED_SHUTDOWN(mp)) {
970 XFS_BUF_SUPER_STALE(bp); 970 xfs_buf_stale(bp);
971 XFS_BUF_DONE(bp);
971 trace_xfs_buf_item_iodone(bp, _RET_IP_); 972 trace_xfs_buf_item_iodone(bp, _RET_IP_);
972 goto do_callbacks; 973 goto do_callbacks;
973 } 974 }
@@ -975,9 +976,7 @@ xfs_buf_iodone_callbacks(
975 if (bp->b_target != lasttarg || 976 if (bp->b_target != lasttarg ||
976 time_after(jiffies, (lasttime + 5*HZ))) { 977 time_after(jiffies, (lasttime + 5*HZ))) {
977 lasttime = jiffies; 978 lasttime = jiffies;
978 xfs_alert(mp, "Device %s: metadata write error block 0x%llx", 979 xfs_buf_ioerror_alert(bp, __func__);
979 xfs_buf_target_name(bp->b_target),
980 (__uint64_t)XFS_BUF_ADDR(bp));
981 } 980 }
982 lasttarg = bp->b_target; 981 lasttarg = bp->b_target;
983 982
@@ -993,7 +992,7 @@ xfs_buf_iodone_callbacks(
993 xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ 992 xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
994 993
995 if (!XFS_BUF_ISSTALE(bp)) { 994 if (!XFS_BUF_ISSTALE(bp)) {
996 XFS_BUF_DELAYWRITE(bp); 995 xfs_buf_delwri_queue(bp);
997 XFS_BUF_DONE(bp); 996 XFS_BUF_DONE(bp);
998 } 997 }
999 ASSERT(bp->b_iodone != NULL); 998 ASSERT(bp->b_iodone != NULL);
@@ -1006,9 +1005,8 @@ xfs_buf_iodone_callbacks(
1006 * If the write of the buffer was synchronous, we want to make 1005 * If the write of the buffer was synchronous, we want to make
1007 * sure to return the error to the caller of xfs_bwrite(). 1006 * sure to return the error to the caller of xfs_bwrite().
1008 */ 1007 */
1009 XFS_BUF_STALE(bp); 1008 xfs_buf_stale(bp);
1010 XFS_BUF_DONE(bp); 1009 XFS_BUF_DONE(bp);
1011 XFS_BUF_UNDELAYWRITE(bp);
1012 1010
1013 trace_xfs_buf_error_relse(bp, _RET_IP_); 1011 trace_xfs_buf_error_relse(bp, _RET_IP_);
1014 1012
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index ee9d5427fcd..77c74257c2a 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1578,9 +1578,8 @@ xfs_da_grow_inode_int(
1578 */ 1578 */
1579 nmap = 1; 1579 nmap = 1;
1580 ASSERT(args->firstblock != NULL); 1580 ASSERT(args->firstblock != NULL);
1581 error = xfs_bmapi(tp, dp, *bno, count, 1581 error = xfs_bmapi_write(tp, dp, *bno, count,
1582 xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| 1582 xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
1583 XFS_BMAPI_CONTIG,
1584 args->firstblock, args->total, &map, &nmap, 1583 args->firstblock, args->total, &map, &nmap,
1585 args->flist); 1584 args->flist);
1586 if (error) 1585 if (error)
@@ -1602,9 +1601,8 @@ xfs_da_grow_inode_int(
1602 for (b = *bno, mapi = 0; b < *bno + count; ) { 1601 for (b = *bno, mapi = 0; b < *bno + count; ) {
1603 nmap = MIN(XFS_BMAP_MAX_NMAP, count); 1602 nmap = MIN(XFS_BMAP_MAX_NMAP, count);
1604 c = (int)(*bno + count - b); 1603 c = (int)(*bno + count - b);
1605 error = xfs_bmapi(tp, dp, b, c, 1604 error = xfs_bmapi_write(tp, dp, b, c,
1606 xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE| 1605 xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
1607 XFS_BMAPI_METADATA,
1608 args->firstblock, args->total, 1606 args->firstblock, args->total,
1609 &mapp[mapi], &nmap, args->flist); 1607 &mapp[mapi], &nmap, args->flist);
1610 if (error) 1608 if (error)
@@ -1975,33 +1973,16 @@ xfs_da_do_buf(
1975 /* 1973 /*
1976 * Optimize the one-block case. 1974 * Optimize the one-block case.
1977 */ 1975 */
1978 if (nfsb == 1) { 1976 if (nfsb == 1)
1979 xfs_fsblock_t fsb;
1980
1981 if ((error =
1982 xfs_bmapi_single(trans, dp, whichfork, &fsb,
1983 (xfs_fileoff_t)bno))) {
1984 return error;
1985 }
1986 mapp = &map; 1977 mapp = &map;
1987 if (fsb == NULLFSBLOCK) { 1978 else
1988 nmap = 0;
1989 } else {
1990 map.br_startblock = fsb;
1991 map.br_startoff = (xfs_fileoff_t)bno;
1992 map.br_blockcount = 1;
1993 nmap = 1;
1994 }
1995 } else {
1996 mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP); 1979 mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP);
1997 nmap = nfsb; 1980
1998 if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno, 1981 nmap = nfsb;
1999 nfsb, 1982 error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, mapp,
2000 XFS_BMAPI_METADATA | 1983 &nmap, xfs_bmapi_aflag(whichfork));
2001 xfs_bmapi_aflag(whichfork), 1984 if (error)
2002 NULL, 0, mapp, &nmap, NULL))) 1985 goto exit0;
2003 goto exit0;
2004 }
2005 } else { 1986 } else {
2006 map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); 1987 map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
2007 map.br_startoff = (xfs_fileoff_t)bno; 1988 map.br_startoff = (xfs_fileoff_t)bno;
@@ -2072,13 +2053,10 @@ xfs_da_do_buf(
2072 if (!bp) 2053 if (!bp)
2073 continue; 2054 continue;
2074 if (caller == 1) { 2055 if (caller == 1) {
2075 if (whichfork == XFS_ATTR_FORK) { 2056 if (whichfork == XFS_ATTR_FORK)
2076 XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE, 2057 xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
2077 XFS_ATTR_BTREE_REF); 2058 else
2078 } else { 2059 xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
2079 XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE,
2080 XFS_DIR_BTREE_REF);
2081 }
2082 } 2060 }
2083 if (bplist) { 2061 if (bplist) {
2084 bplist[nbplist++] = bp; 2062 bplist[nbplist++] = bp;
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 9a84a85c03b..654dc6f05ba 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -425,8 +425,8 @@ xfs_swap_extents(
425 } 425 }
426 426
427 427
428 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 428 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
429 xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 429 xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
430 430
431 xfs_trans_log_inode(tp, ip, ilf_fields); 431 xfs_trans_log_inode(tp, ip, ilf_fields);
432 xfs_trans_log_inode(tp, tip, tilf_fields); 432 xfs_trans_log_inode(tp, tip, tilf_fields);
@@ -438,7 +438,7 @@ xfs_swap_extents(
438 if (mp->m_flags & XFS_MOUNT_WSYNC) 438 if (mp->m_flags & XFS_MOUNT_WSYNC)
439 xfs_trans_set_sync(tp); 439 xfs_trans_set_sync(tp);
440 440
441 error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); 441 error = xfs_trans_commit(tp, 0);
442 442
443 trace_xfs_swap_extent_after(ip, 0); 443 trace_xfs_swap_extent_after(ip, 0);
444 trace_xfs_swap_extent_after(tip, 1); 444 trace_xfs_swap_extent_after(tip, 1);
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index ca2386d82cd..66e108f561a 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -888,12 +888,10 @@ xfs_dir2_leaf_getdents(
888 * we already have in the table. 888 * we already have in the table.
889 */ 889 */
890 nmap = map_size - map_valid; 890 nmap = map_size - map_valid;
891 error = xfs_bmapi(NULL, dp, 891 error = xfs_bmapi_read(dp, map_off,
892 map_off,
893 xfs_dir2_byte_to_da(mp, 892 xfs_dir2_byte_to_da(mp,
894 XFS_DIR2_LEAF_OFFSET) - map_off, 893 XFS_DIR2_LEAF_OFFSET) - map_off,
895 XFS_BMAPI_METADATA, NULL, 0, 894 &map[map_valid], &nmap, 0);
896 &map[map_valid], &nmap, NULL);
897 /* 895 /*
898 * Don't know if we should ignore this or 896 * Don't know if we should ignore this or
899 * try to return an error. 897 * try to return an error.
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 244e797dae3..8a24f0c6c86 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -38,7 +38,7 @@ xfs_trim_extents(
38 struct xfs_mount *mp, 38 struct xfs_mount *mp,
39 xfs_agnumber_t agno, 39 xfs_agnumber_t agno,
40 xfs_fsblock_t start, 40 xfs_fsblock_t start,
41 xfs_fsblock_t len, 41 xfs_fsblock_t end,
42 xfs_fsblock_t minlen, 42 xfs_fsblock_t minlen,
43 __uint64_t *blocks_trimmed) 43 __uint64_t *blocks_trimmed)
44{ 44{
@@ -100,7 +100,7 @@ xfs_trim_extents(
100 * down partially overlapping ranges for now. 100 * down partially overlapping ranges for now.
101 */ 101 */
102 if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || 102 if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
103 XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) { 103 XFS_AGB_TO_FSB(mp, agno, fbno) > end) {
104 trace_xfs_discard_exclude(mp, agno, fbno, flen); 104 trace_xfs_discard_exclude(mp, agno, fbno, flen);
105 goto next_extent; 105 goto next_extent;
106 } 106 }
@@ -145,7 +145,7 @@ xfs_ioc_trim(
145 struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; 145 struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
146 unsigned int granularity = q->limits.discard_granularity; 146 unsigned int granularity = q->limits.discard_granularity;
147 struct fstrim_range range; 147 struct fstrim_range range;
148 xfs_fsblock_t start, len, minlen; 148 xfs_fsblock_t start, end, minlen;
149 xfs_agnumber_t start_agno, end_agno, agno; 149 xfs_agnumber_t start_agno, end_agno, agno;
150 __uint64_t blocks_trimmed = 0; 150 __uint64_t blocks_trimmed = 0;
151 int error, last_error = 0; 151 int error, last_error = 0;
@@ -165,19 +165,19 @@ xfs_ioc_trim(
165 * matter as trimming blocks is an advisory interface. 165 * matter as trimming blocks is an advisory interface.
166 */ 166 */
167 start = XFS_B_TO_FSBT(mp, range.start); 167 start = XFS_B_TO_FSBT(mp, range.start);
168 len = XFS_B_TO_FSBT(mp, range.len); 168 end = start + XFS_B_TO_FSBT(mp, range.len) - 1;
169 minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); 169 minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
170 170
171 start_agno = XFS_FSB_TO_AGNO(mp, start); 171 if (start >= mp->m_sb.sb_dblocks)
172 if (start_agno >= mp->m_sb.sb_agcount)
173 return -XFS_ERROR(EINVAL); 172 return -XFS_ERROR(EINVAL);
173 if (end > mp->m_sb.sb_dblocks - 1)
174 end = mp->m_sb.sb_dblocks - 1;
174 175
175 end_agno = XFS_FSB_TO_AGNO(mp, start + len); 176 start_agno = XFS_FSB_TO_AGNO(mp, start);
176 if (end_agno >= mp->m_sb.sb_agcount) 177 end_agno = XFS_FSB_TO_AGNO(mp, end);
177 end_agno = mp->m_sb.sb_agcount - 1;
178 178
179 for (agno = start_agno; agno <= end_agno; agno++) { 179 for (agno = start_agno; agno <= end_agno; agno++) {
180 error = -xfs_trim_extents(mp, agno, start, len, minlen, 180 error = -xfs_trim_extents(mp, agno, start, end, minlen,
181 &blocks_trimmed); 181 &blocks_trimmed);
182 if (error) 182 if (error)
183 last_error = error; 183 last_error = error;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index db62959bed1..25d7280e9f6 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -377,16 +377,14 @@ xfs_qm_dqalloc(
377 return (ESRCH); 377 return (ESRCH);
378 } 378 }
379 379
380 xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL); 380 xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
381 nmaps = 1; 381 nmaps = 1;
382 if ((error = xfs_bmapi(tp, quotip, 382 error = xfs_bmapi_write(tp, quotip, offset_fsb,
383 offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, 383 XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
384 XFS_BMAPI_METADATA | XFS_BMAPI_WRITE, 384 &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp),
385 &firstblock, 385 &map, &nmaps, &flist);
386 XFS_QM_DQALLOC_SPACE_RES(mp), 386 if (error)
387 &map, &nmaps, &flist))) {
388 goto error0; 387 goto error0;
389 }
390 ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); 388 ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
391 ASSERT(nmaps == 1); 389 ASSERT(nmaps == 1);
392 ASSERT((map.br_startblock != DELAYSTARTBLOCK) && 390 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
@@ -402,8 +400,11 @@ xfs_qm_dqalloc(
402 dqp->q_blkno, 400 dqp->q_blkno,
403 mp->m_quotainfo->qi_dqchunklen, 401 mp->m_quotainfo->qi_dqchunklen,
404 0); 402 0);
405 if (!bp || (error = xfs_buf_geterror(bp))) 403
404 error = xfs_buf_geterror(bp);
405 if (error)
406 goto error1; 406 goto error1;
407
407 /* 408 /*
408 * Make a chunk of dquots out of this buffer and log 409 * Make a chunk of dquots out of this buffer and log
409 * the entire thing. 410 * the entire thing.
@@ -485,9 +486,8 @@ xfs_qm_dqtobp(
485 /* 486 /*
486 * Find the block map; no allocations yet 487 * Find the block map; no allocations yet
487 */ 488 */
488 error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, 489 error = xfs_bmapi_read(quotip, dqp->q_fileoffset,
489 XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 490 XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
490 NULL, 0, &map, &nmaps, NULL);
491 491
492 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 492 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
493 if (error) 493 if (error)
@@ -605,7 +605,7 @@ xfs_qm_dqread(
605 dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); 605 dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount);
606 606
607 /* Mark the buf so that this will stay incore a little longer */ 607 /* Mark the buf so that this will stay incore a little longer */
608 XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF); 608 xfs_buf_set_ref(bp, XFS_DQUOT_REF);
609 609
610 /* 610 /*
611 * We got the buffer with a xfs_trans_read_buf() (in dqtobp()) 611 * We got the buffer with a xfs_trans_read_buf() (in dqtobp())
@@ -1242,9 +1242,11 @@ xfs_qm_dqflush(
1242 } 1242 }
1243 1243
1244 if (flags & SYNC_WAIT) 1244 if (flags & SYNC_WAIT)
1245 error = xfs_bwrite(mp, bp); 1245 error = xfs_bwrite(bp);
1246 else 1246 else
1247 xfs_bdwrite(mp, bp); 1247 xfs_buf_delwri_queue(bp);
1248
1249 xfs_buf_relse(bp);
1248 1250
1249 trace_xfs_dqflush_done(dqp); 1251 trace_xfs_dqflush_done(dqp);
1250 1252
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index bb3f71d236d..0dee0b71029 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -295,7 +295,7 @@ xfs_qm_dquot_logitem_committing(
295/* 295/*
296 * This is the ops vector for dquots 296 * This is the ops vector for dquots
297 */ 297 */
298static struct xfs_item_ops xfs_dquot_item_ops = { 298static const struct xfs_item_ops xfs_dquot_item_ops = {
299 .iop_size = xfs_qm_dquot_logitem_size, 299 .iop_size = xfs_qm_dquot_logitem_size,
300 .iop_format = xfs_qm_dquot_logitem_format, 300 .iop_format = xfs_qm_dquot_logitem_format,
301 .iop_pin = xfs_qm_dquot_logitem_pin, 301 .iop_pin = xfs_qm_dquot_logitem_pin,
@@ -483,7 +483,7 @@ xfs_qm_qoff_logitem_committing(
483{ 483{
484} 484}
485 485
486static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { 486static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
487 .iop_size = xfs_qm_qoff_logitem_size, 487 .iop_size = xfs_qm_qoff_logitem_size,
488 .iop_format = xfs_qm_qoff_logitem_format, 488 .iop_format = xfs_qm_qoff_logitem_format,
489 .iop_pin = xfs_qm_qoff_logitem_pin, 489 .iop_pin = xfs_qm_qoff_logitem_pin,
@@ -498,7 +498,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
498/* 498/*
499 * This is the ops vector shared by all quotaoff-start log items. 499 * This is the ops vector shared by all quotaoff-start log items.
500 */ 500 */
501static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { 501static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
502 .iop_size = xfs_qm_qoff_logitem_size, 502 .iop_size = xfs_qm_qoff_logitem_size,
503 .iop_format = xfs_qm_qoff_logitem_format, 503 .iop_format = xfs_qm_qoff_logitem_format,
504 .iop_pin = xfs_qm_qoff_logitem_pin, 504 .iop_pin = xfs_qm_qoff_logitem_pin,
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 75e5d322e48..da108977b21 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -229,16 +229,16 @@ xfs_fs_nfs_commit_metadata(
229{ 229{
230 struct xfs_inode *ip = XFS_I(inode); 230 struct xfs_inode *ip = XFS_I(inode);
231 struct xfs_mount *mp = ip->i_mount; 231 struct xfs_mount *mp = ip->i_mount;
232 int error = 0; 232 xfs_lsn_t lsn = 0;
233 233
234 xfs_ilock(ip, XFS_ILOCK_SHARED); 234 xfs_ilock(ip, XFS_ILOCK_SHARED);
235 if (xfs_ipincount(ip)) { 235 if (xfs_ipincount(ip))
236 error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn, 236 lsn = ip->i_itemp->ili_last_lsn;
237 XFS_LOG_SYNC, NULL);
238 }
239 xfs_iunlock(ip, XFS_ILOCK_SHARED); 237 xfs_iunlock(ip, XFS_ILOCK_SHARED);
240 238
241 return error; 239 if (!lsn)
240 return 0;
241 return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
242} 242}
243 243
244const struct export_operations xfs_export_operations = { 244const struct export_operations xfs_export_operations = {
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index d22e6262343..35c2aff38b2 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -217,7 +217,7 @@ xfs_efi_item_committing(
217/* 217/*
218 * This is the ops vector shared by all efi log items. 218 * This is the ops vector shared by all efi log items.
219 */ 219 */
220static struct xfs_item_ops xfs_efi_item_ops = { 220static const struct xfs_item_ops xfs_efi_item_ops = {
221 .iop_size = xfs_efi_item_size, 221 .iop_size = xfs_efi_item_size,
222 .iop_format = xfs_efi_item_format, 222 .iop_format = xfs_efi_item_format,
223 .iop_pin = xfs_efi_item_pin, 223 .iop_pin = xfs_efi_item_pin,
@@ -477,7 +477,7 @@ xfs_efd_item_committing(
477/* 477/*
478 * This is the ops vector shared by all efd log items. 478 * This is the ops vector shared by all efd log items.
479 */ 479 */
480static struct xfs_item_ops xfs_efd_item_ops = { 480static const struct xfs_item_ops xfs_efd_item_ops = {
481 .iop_size = xfs_efd_item_size, 481 .iop_size = xfs_efd_item_size,
482 .iop_format = xfs_efd_item_format, 482 .iop_format = xfs_efd_item_format,
483 .iop_pin = xfs_efd_item_pin, 483 .iop_pin = xfs_efd_item_pin,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7f7b42469ea..753ed9b5c70 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -124,6 +124,35 @@ xfs_iozero(
124 return (-status); 124 return (-status);
125} 125}
126 126
127/*
128 * Fsync operations on directories are much simpler than on regular files,
129 * as there is no file data to flush, and thus also no need for explicit
130 * cache flush operations, and there are no non-transaction metadata updates
131 * on directories either.
132 */
133STATIC int
134xfs_dir_fsync(
135 struct file *file,
136 loff_t start,
137 loff_t end,
138 int datasync)
139{
140 struct xfs_inode *ip = XFS_I(file->f_mapping->host);
141 struct xfs_mount *mp = ip->i_mount;
142 xfs_lsn_t lsn = 0;
143
144 trace_xfs_dir_fsync(ip);
145
146 xfs_ilock(ip, XFS_ILOCK_SHARED);
147 if (xfs_ipincount(ip))
148 lsn = ip->i_itemp->ili_last_lsn;
149 xfs_iunlock(ip, XFS_ILOCK_SHARED);
150
151 if (!lsn)
152 return 0;
153 return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
154}
155
127STATIC int 156STATIC int
128xfs_file_fsync( 157xfs_file_fsync(
129 struct file *file, 158 struct file *file,
@@ -137,6 +166,7 @@ xfs_file_fsync(
137 struct xfs_trans *tp; 166 struct xfs_trans *tp;
138 int error = 0; 167 int error = 0;
139 int log_flushed = 0; 168 int log_flushed = 0;
169 xfs_lsn_t lsn = 0;
140 170
141 trace_xfs_file_fsync(ip); 171 trace_xfs_file_fsync(ip);
142 172
@@ -149,10 +179,6 @@ xfs_file_fsync(
149 179
150 xfs_iflags_clear(ip, XFS_ITRUNCATED); 180 xfs_iflags_clear(ip, XFS_ITRUNCATED);
151 181
152 xfs_ilock(ip, XFS_IOLOCK_SHARED);
153 xfs_ioend_wait(ip);
154 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
155
156 if (mp->m_flags & XFS_MOUNT_BARRIER) { 182 if (mp->m_flags & XFS_MOUNT_BARRIER) {
157 /* 183 /*
158 * If we have an RT and/or log subvolume we need to make sure 184 * If we have an RT and/or log subvolume we need to make sure
@@ -216,11 +242,11 @@ xfs_file_fsync(
216 * transaction. So we play it safe and fire off the 242 * transaction. So we play it safe and fire off the
217 * transaction anyway. 243 * transaction anyway.
218 */ 244 */
219 xfs_trans_ijoin(tp, ip); 245 xfs_trans_ijoin(tp, ip, 0);
220 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 246 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
221 xfs_trans_set_sync(tp); 247 error = xfs_trans_commit(tp, 0);
222 error = _xfs_trans_commit(tp, 0, &log_flushed);
223 248
249 lsn = ip->i_itemp->ili_last_lsn;
224 xfs_iunlock(ip, XFS_ILOCK_EXCL); 250 xfs_iunlock(ip, XFS_ILOCK_EXCL);
225 } else { 251 } else {
226 /* 252 /*
@@ -231,14 +257,14 @@ xfs_file_fsync(
231 * disk yet, the inode will be still be pinned. If it is, 257 * disk yet, the inode will be still be pinned. If it is,
232 * force the log. 258 * force the log.
233 */ 259 */
234 if (xfs_ipincount(ip)) { 260 if (xfs_ipincount(ip))
235 error = _xfs_log_force_lsn(mp, 261 lsn = ip->i_itemp->ili_last_lsn;
236 ip->i_itemp->ili_last_lsn,
237 XFS_LOG_SYNC, &log_flushed);
238 }
239 xfs_iunlock(ip, XFS_ILOCK_SHARED); 262 xfs_iunlock(ip, XFS_ILOCK_SHARED);
240 } 263 }
241 264
265 if (!error && lsn)
266 error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
267
242 /* 268 /*
243 * If we only have a single device, and the log force about was 269 * If we only have a single device, and the log force about was
244 * a no-op we might have to flush the data device cache here. 270 * a no-op we might have to flush the data device cache here.
@@ -317,7 +343,19 @@ xfs_file_aio_read(
317 if (XFS_FORCED_SHUTDOWN(mp)) 343 if (XFS_FORCED_SHUTDOWN(mp))
318 return -EIO; 344 return -EIO;
319 345
320 if (unlikely(ioflags & IO_ISDIRECT)) { 346 /*
347 * Locking is a bit tricky here. If we take an exclusive lock
348 * for direct IO, we effectively serialise all new concurrent
349 * read IO to this file and block it behind IO that is currently in
350 * progress because IO in progress holds the IO lock shared. We only
351 * need to hold the lock exclusive to blow away the page cache, so
352 * only take lock exclusively if the page cache needs invalidation.
353 * This allows the normal direct IO case of no page cache pages to
354 * proceeed concurrently without serialisation.
355 */
356 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
357 if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) {
358 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
321 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); 359 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
322 360
323 if (inode->i_mapping->nrpages) { 361 if (inode->i_mapping->nrpages) {
@@ -330,8 +368,7 @@ xfs_file_aio_read(
330 } 368 }
331 } 369 }
332 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); 370 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
333 } else 371 }
334 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
335 372
336 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); 373 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
337 374
@@ -407,11 +444,13 @@ xfs_aio_write_isize_update(
407 */ 444 */
408STATIC void 445STATIC void
409xfs_aio_write_newsize_update( 446xfs_aio_write_newsize_update(
410 struct xfs_inode *ip) 447 struct xfs_inode *ip,
448 xfs_fsize_t new_size)
411{ 449{
412 if (ip->i_new_size) { 450 if (new_size == ip->i_new_size) {
413 xfs_rw_ilock(ip, XFS_ILOCK_EXCL); 451 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
414 ip->i_new_size = 0; 452 if (new_size == ip->i_new_size)
453 ip->i_new_size = 0;
415 if (ip->i_d.di_size > ip->i_size) 454 if (ip->i_d.di_size > ip->i_size)
416 ip->i_d.di_size = ip->i_size; 455 ip->i_d.di_size = ip->i_size;
417 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); 456 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
@@ -462,7 +501,7 @@ xfs_file_splice_write(
462 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); 501 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
463 502
464 xfs_aio_write_isize_update(inode, ppos, ret); 503 xfs_aio_write_isize_update(inode, ppos, ret);
465 xfs_aio_write_newsize_update(ip); 504 xfs_aio_write_newsize_update(ip, new_size);
466 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 505 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
467 return ret; 506 return ret;
468} 507}
@@ -500,11 +539,9 @@ xfs_zero_last_block(
500 539
501 last_fsb = XFS_B_TO_FSBT(mp, isize); 540 last_fsb = XFS_B_TO_FSBT(mp, isize);
502 nimaps = 1; 541 nimaps = 1;
503 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, 542 error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
504 &nimaps, NULL); 543 if (error)
505 if (error) {
506 return error; 544 return error;
507 }
508 ASSERT(nimaps > 0); 545 ASSERT(nimaps > 0);
509 /* 546 /*
510 * If the block underlying isize is just a hole, then there 547 * If the block underlying isize is just a hole, then there
@@ -595,8 +632,8 @@ xfs_zero_eof(
595 while (start_zero_fsb <= end_zero_fsb) { 632 while (start_zero_fsb <= end_zero_fsb) {
596 nimaps = 1; 633 nimaps = 1;
597 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; 634 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
598 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, 635 error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
599 0, NULL, 0, &imap, &nimaps, NULL); 636 &imap, &nimaps, 0);
600 if (error) { 637 if (error) {
601 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); 638 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
602 return error; 639 return error;
@@ -659,6 +696,7 @@ xfs_file_aio_write_checks(
659 struct file *file, 696 struct file *file,
660 loff_t *pos, 697 loff_t *pos,
661 size_t *count, 698 size_t *count,
699 xfs_fsize_t *new_sizep,
662 int *iolock) 700 int *iolock)
663{ 701{
664 struct inode *inode = file->f_mapping->host; 702 struct inode *inode = file->f_mapping->host;
@@ -666,6 +704,9 @@ xfs_file_aio_write_checks(
666 xfs_fsize_t new_size; 704 xfs_fsize_t new_size;
667 int error = 0; 705 int error = 0;
668 706
707 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
708 *new_sizep = 0;
709restart:
669 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); 710 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
670 if (error) { 711 if (error) {
671 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); 712 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
@@ -673,20 +714,41 @@ xfs_file_aio_write_checks(
673 return error; 714 return error;
674 } 715 }
675 716
676 new_size = *pos + *count;
677 if (new_size > ip->i_size)
678 ip->i_new_size = new_size;
679
680 if (likely(!(file->f_mode & FMODE_NOCMTIME))) 717 if (likely(!(file->f_mode & FMODE_NOCMTIME)))
681 file_update_time(file); 718 file_update_time(file);
682 719
683 /* 720 /*
684 * If the offset is beyond the size of the file, we need to zero any 721 * If the offset is beyond the size of the file, we need to zero any
685 * blocks that fall between the existing EOF and the start of this 722 * blocks that fall between the existing EOF and the start of this
686 * write. 723 * write. There is no need to issue zeroing if another in-flght IO ends
724 * at or before this one If zeronig is needed and we are currently
725 * holding the iolock shared, we need to update it to exclusive which
726 * involves dropping all locks and relocking to maintain correct locking
727 * order. If we do this, restart the function to ensure all checks and
728 * values are still valid.
687 */ 729 */
688 if (*pos > ip->i_size) 730 if ((ip->i_new_size && *pos > ip->i_new_size) ||
731 (!ip->i_new_size && *pos > ip->i_size)) {
732 if (*iolock == XFS_IOLOCK_SHARED) {
733 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
734 *iolock = XFS_IOLOCK_EXCL;
735 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
736 goto restart;
737 }
689 error = -xfs_zero_eof(ip, *pos, ip->i_size); 738 error = -xfs_zero_eof(ip, *pos, ip->i_size);
739 }
740
741 /*
742 * If this IO extends beyond EOF, we may need to update ip->i_new_size.
743 * We have already zeroed space beyond EOF (if necessary). Only update
744 * ip->i_new_size if this IO ends beyond any other in-flight writes.
745 */
746 new_size = *pos + *count;
747 if (new_size > ip->i_size) {
748 if (new_size > ip->i_new_size)
749 ip->i_new_size = new_size;
750 *new_sizep = new_size;
751 }
690 752
691 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); 753 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
692 if (error) 754 if (error)
@@ -721,7 +783,7 @@ xfs_file_aio_write_checks(
721 * the dio layer. To avoid the problem with aio, we also need to wait for 783 * the dio layer. To avoid the problem with aio, we also need to wait for
722 * outstanding IOs to complete so that unwritten extent conversion is completed 784 * outstanding IOs to complete so that unwritten extent conversion is completed
723 * before we try to map the overlapping block. This is currently implemented by 785 * before we try to map the overlapping block. This is currently implemented by
724 * hitting it with a big hammer (i.e. xfs_ioend_wait()). 786 * hitting it with a big hammer (i.e. inode_dio_wait()).
725 * 787 *
726 * Returns with locks held indicated by @iolock and errors indicated by 788 * Returns with locks held indicated by @iolock and errors indicated by
727 * negative return values. 789 * negative return values.
@@ -733,6 +795,7 @@ xfs_file_dio_aio_write(
733 unsigned long nr_segs, 795 unsigned long nr_segs,
734 loff_t pos, 796 loff_t pos,
735 size_t ocount, 797 size_t ocount,
798 xfs_fsize_t *new_size,
736 int *iolock) 799 int *iolock)
737{ 800{
738 struct file *file = iocb->ki_filp; 801 struct file *file = iocb->ki_filp;
@@ -753,18 +816,35 @@ xfs_file_dio_aio_write(
753 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) 816 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
754 unaligned_io = 1; 817 unaligned_io = 1;
755 818
756 if (unaligned_io || mapping->nrpages || pos > ip->i_size) 819 /*
820 * We don't need to take an exclusive lock unless there page cache needs
821 * to be invalidated or unaligned IO is being executed. We don't need to
822 * consider the EOF extension case here because
823 * xfs_file_aio_write_checks() will relock the inode as necessary for
824 * EOF zeroing cases and fill out the new inode size as appropriate.
825 */
826 if (unaligned_io || mapping->nrpages)
757 *iolock = XFS_IOLOCK_EXCL; 827 *iolock = XFS_IOLOCK_EXCL;
758 else 828 else
759 *iolock = XFS_IOLOCK_SHARED; 829 *iolock = XFS_IOLOCK_SHARED;
760 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); 830 xfs_rw_ilock(ip, *iolock);
761 831
762 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); 832 /*
833 * Recheck if there are cached pages that need invalidate after we got
834 * the iolock to protect against other threads adding new pages while
835 * we were waiting for the iolock.
836 */
837 if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) {
838 xfs_rw_iunlock(ip, *iolock);
839 *iolock = XFS_IOLOCK_EXCL;
840 xfs_rw_ilock(ip, *iolock);
841 }
842
843 ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
763 if (ret) 844 if (ret)
764 return ret; 845 return ret;
765 846
766 if (mapping->nrpages) { 847 if (mapping->nrpages) {
767 WARN_ON(*iolock != XFS_IOLOCK_EXCL);
768 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, 848 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
769 FI_REMAPF_LOCKED); 849 FI_REMAPF_LOCKED);
770 if (ret) 850 if (ret)
@@ -776,7 +856,7 @@ xfs_file_dio_aio_write(
776 * otherwise demote the lock if we had to flush cached pages 856 * otherwise demote the lock if we had to flush cached pages
777 */ 857 */
778 if (unaligned_io) 858 if (unaligned_io)
779 xfs_ioend_wait(ip); 859 inode_dio_wait(inode);
780 else if (*iolock == XFS_IOLOCK_EXCL) { 860 else if (*iolock == XFS_IOLOCK_EXCL) {
781 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); 861 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
782 *iolock = XFS_IOLOCK_SHARED; 862 *iolock = XFS_IOLOCK_SHARED;
@@ -798,6 +878,7 @@ xfs_file_buffered_aio_write(
798 unsigned long nr_segs, 878 unsigned long nr_segs,
799 loff_t pos, 879 loff_t pos,
800 size_t ocount, 880 size_t ocount,
881 xfs_fsize_t *new_size,
801 int *iolock) 882 int *iolock)
802{ 883{
803 struct file *file = iocb->ki_filp; 884 struct file *file = iocb->ki_filp;
@@ -809,9 +890,9 @@ xfs_file_buffered_aio_write(
809 size_t count = ocount; 890 size_t count = ocount;
810 891
811 *iolock = XFS_IOLOCK_EXCL; 892 *iolock = XFS_IOLOCK_EXCL;
812 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); 893 xfs_rw_ilock(ip, *iolock);
813 894
814 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); 895 ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
815 if (ret) 896 if (ret)
816 return ret; 897 return ret;
817 898
@@ -851,6 +932,7 @@ xfs_file_aio_write(
851 ssize_t ret; 932 ssize_t ret;
852 int iolock; 933 int iolock;
853 size_t ocount = 0; 934 size_t ocount = 0;
935 xfs_fsize_t new_size = 0;
854 936
855 XFS_STATS_INC(xs_write_calls); 937 XFS_STATS_INC(xs_write_calls);
856 938
@@ -870,10 +952,10 @@ xfs_file_aio_write(
870 952
871 if (unlikely(file->f_flags & O_DIRECT)) 953 if (unlikely(file->f_flags & O_DIRECT))
872 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, 954 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
873 ocount, &iolock); 955 ocount, &new_size, &iolock);
874 else 956 else
875 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, 957 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
876 ocount, &iolock); 958 ocount, &new_size, &iolock);
877 959
878 xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); 960 xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
879 961
@@ -894,7 +976,7 @@ xfs_file_aio_write(
894 } 976 }
895 977
896out_unlock: 978out_unlock:
897 xfs_aio_write_newsize_update(ip); 979 xfs_aio_write_newsize_update(ip, new_size);
898 xfs_rw_iunlock(ip, iolock); 980 xfs_rw_iunlock(ip, iolock);
899 return ret; 981 return ret;
900} 982}
@@ -1087,7 +1169,7 @@ const struct file_operations xfs_dir_file_operations = {
1087#ifdef CONFIG_COMPAT 1169#ifdef CONFIG_COMPAT
1088 .compat_ioctl = xfs_file_compat_ioctl, 1170 .compat_ioctl = xfs_file_compat_ioctl,
1089#endif 1171#endif
1090 .fsync = xfs_file_fsync, 1172 .fsync = xfs_dir_fsync,
1091}; 1173};
1092 1174
1093static const struct vm_operations_struct xfs_file_vm_ops = { 1175static const struct vm_operations_struct xfs_file_vm_ops = {
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 3ff3d9e23de..5170306a100 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -682,7 +682,7 @@ xfs_filestream_new_ag(
682 ip = ap->ip; 682 ip = ap->ip;
683 mp = ip->i_mount; 683 mp = ip->i_mount;
684 cache = mp->m_filestream; 684 cache = mp->m_filestream;
685 minlen = ap->alen; 685 minlen = ap->length;
686 *agp = NULLAGNUMBER; 686 *agp = NULLAGNUMBER;
687 687
688 /* 688 /*
@@ -761,7 +761,7 @@ xfs_filestream_new_ag(
761 */ 761 */
762 ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount; 762 ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount;
763 flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | 763 flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
764 (ap->low ? XFS_PICK_LOWSPACE : 0); 764 (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0);
765 765
766 err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen); 766 err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen);
767 if (err || *agp == NULLAGNUMBER) 767 if (err || *agp == NULLAGNUMBER)
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 9153d2c77ca..1c6fdeb702f 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -194,6 +194,10 @@ xfs_growfs_data_private(
194 bp = xfs_buf_get(mp->m_ddev_targp, 194 bp = xfs_buf_get(mp->m_ddev_targp,
195 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), 195 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
196 XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); 196 XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
197 if (!bp) {
198 error = ENOMEM;
199 goto error0;
200 }
197 agf = XFS_BUF_TO_AGF(bp); 201 agf = XFS_BUF_TO_AGF(bp);
198 memset(agf, 0, mp->m_sb.sb_sectsize); 202 memset(agf, 0, mp->m_sb.sb_sectsize);
199 agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); 203 agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
@@ -216,16 +220,21 @@ xfs_growfs_data_private(
216 tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); 220 tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
217 agf->agf_freeblks = cpu_to_be32(tmpsize); 221 agf->agf_freeblks = cpu_to_be32(tmpsize);
218 agf->agf_longest = cpu_to_be32(tmpsize); 222 agf->agf_longest = cpu_to_be32(tmpsize);
219 error = xfs_bwrite(mp, bp); 223 error = xfs_bwrite(bp);
220 if (error) { 224 xfs_buf_relse(bp);
225 if (error)
221 goto error0; 226 goto error0;
222 } 227
223 /* 228 /*
224 * AG inode header block 229 * AG inode header block
225 */ 230 */
226 bp = xfs_buf_get(mp->m_ddev_targp, 231 bp = xfs_buf_get(mp->m_ddev_targp,
227 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), 232 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
228 XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); 233 XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
234 if (!bp) {
235 error = ENOMEM;
236 goto error0;
237 }
229 agi = XFS_BUF_TO_AGI(bp); 238 agi = XFS_BUF_TO_AGI(bp);
230 memset(agi, 0, mp->m_sb.sb_sectsize); 239 memset(agi, 0, mp->m_sb.sb_sectsize);
231 agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); 240 agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
@@ -240,10 +249,11 @@ xfs_growfs_data_private(
240 agi->agi_dirino = cpu_to_be32(NULLAGINO); 249 agi->agi_dirino = cpu_to_be32(NULLAGINO);
241 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) 250 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
242 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); 251 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
243 error = xfs_bwrite(mp, bp); 252 error = xfs_bwrite(bp);
244 if (error) { 253 xfs_buf_relse(bp);
254 if (error)
245 goto error0; 255 goto error0;
246 } 256
247 /* 257 /*
248 * BNO btree root block 258 * BNO btree root block
249 */ 259 */
@@ -251,6 +261,10 @@ xfs_growfs_data_private(
251 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), 261 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
252 BTOBB(mp->m_sb.sb_blocksize), 262 BTOBB(mp->m_sb.sb_blocksize),
253 XBF_LOCK | XBF_MAPPED); 263 XBF_LOCK | XBF_MAPPED);
264 if (!bp) {
265 error = ENOMEM;
266 goto error0;
267 }
254 block = XFS_BUF_TO_BLOCK(bp); 268 block = XFS_BUF_TO_BLOCK(bp);
255 memset(block, 0, mp->m_sb.sb_blocksize); 269 memset(block, 0, mp->m_sb.sb_blocksize);
256 block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); 270 block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
@@ -262,10 +276,11 @@ xfs_growfs_data_private(
262 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); 276 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
263 arec->ar_blockcount = cpu_to_be32( 277 arec->ar_blockcount = cpu_to_be32(
264 agsize - be32_to_cpu(arec->ar_startblock)); 278 agsize - be32_to_cpu(arec->ar_startblock));
265 error = xfs_bwrite(mp, bp); 279 error = xfs_bwrite(bp);
266 if (error) { 280 xfs_buf_relse(bp);
281 if (error)
267 goto error0; 282 goto error0;
268 } 283
269 /* 284 /*
270 * CNT btree root block 285 * CNT btree root block
271 */ 286 */
@@ -273,6 +288,10 @@ xfs_growfs_data_private(
273 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), 288 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
274 BTOBB(mp->m_sb.sb_blocksize), 289 BTOBB(mp->m_sb.sb_blocksize),
275 XBF_LOCK | XBF_MAPPED); 290 XBF_LOCK | XBF_MAPPED);
291 if (!bp) {
292 error = ENOMEM;
293 goto error0;
294 }
276 block = XFS_BUF_TO_BLOCK(bp); 295 block = XFS_BUF_TO_BLOCK(bp);
277 memset(block, 0, mp->m_sb.sb_blocksize); 296 memset(block, 0, mp->m_sb.sb_blocksize);
278 block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); 297 block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
@@ -285,10 +304,11 @@ xfs_growfs_data_private(
285 arec->ar_blockcount = cpu_to_be32( 304 arec->ar_blockcount = cpu_to_be32(
286 agsize - be32_to_cpu(arec->ar_startblock)); 305 agsize - be32_to_cpu(arec->ar_startblock));
287 nfree += be32_to_cpu(arec->ar_blockcount); 306 nfree += be32_to_cpu(arec->ar_blockcount);
288 error = xfs_bwrite(mp, bp); 307 error = xfs_bwrite(bp);
289 if (error) { 308 xfs_buf_relse(bp);
309 if (error)
290 goto error0; 310 goto error0;
291 } 311
292 /* 312 /*
293 * INO btree root block 313 * INO btree root block
294 */ 314 */
@@ -296,6 +316,10 @@ xfs_growfs_data_private(
296 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), 316 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
297 BTOBB(mp->m_sb.sb_blocksize), 317 BTOBB(mp->m_sb.sb_blocksize),
298 XBF_LOCK | XBF_MAPPED); 318 XBF_LOCK | XBF_MAPPED);
319 if (!bp) {
320 error = ENOMEM;
321 goto error0;
322 }
299 block = XFS_BUF_TO_BLOCK(bp); 323 block = XFS_BUF_TO_BLOCK(bp);
300 memset(block, 0, mp->m_sb.sb_blocksize); 324 memset(block, 0, mp->m_sb.sb_blocksize);
301 block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); 325 block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
@@ -303,10 +327,10 @@ xfs_growfs_data_private(
303 block->bb_numrecs = 0; 327 block->bb_numrecs = 0;
304 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); 328 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
305 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); 329 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
306 error = xfs_bwrite(mp, bp); 330 error = xfs_bwrite(bp);
307 if (error) { 331 xfs_buf_relse(bp);
332 if (error)
308 goto error0; 333 goto error0;
309 }
310 } 334 }
311 xfs_trans_agblocks_delta(tp, nfree); 335 xfs_trans_agblocks_delta(tp, nfree);
312 /* 336 /*
@@ -396,9 +420,9 @@ xfs_growfs_data_private(
396 * just issue a warning and continue. The real work is 420 * just issue a warning and continue. The real work is
397 * already done and committed. 421 * already done and committed.
398 */ 422 */
399 if (!(error = xfs_bwrite(mp, bp))) { 423 error = xfs_bwrite(bp);
400 continue; 424 xfs_buf_relse(bp);
401 } else { 425 if (error) {
402 xfs_warn(mp, 426 xfs_warn(mp,
403 "write error %d updating secondary superblock for ag %d", 427 "write error %d updating secondary superblock for ag %d",
404 error, agno); 428 error, agno);
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 9f24ec28283..169380e6605 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -150,7 +150,7 @@ xfs_check_agi_freecount(
150/* 150/*
151 * Initialise a new set of inodes. 151 * Initialise a new set of inodes.
152 */ 152 */
153STATIC void 153STATIC int
154xfs_ialloc_inode_init( 154xfs_ialloc_inode_init(
155 struct xfs_mount *mp, 155 struct xfs_mount *mp,
156 struct xfs_trans *tp, 156 struct xfs_trans *tp,
@@ -202,8 +202,8 @@ xfs_ialloc_inode_init(
202 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, 202 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
203 mp->m_bsize * blks_per_cluster, 203 mp->m_bsize * blks_per_cluster,
204 XBF_LOCK); 204 XBF_LOCK);
205 ASSERT(!xfs_buf_geterror(fbuf)); 205 if (!fbuf)
206 206 return ENOMEM;
207 /* 207 /*
208 * Initialize all inodes in this buffer and then log them. 208 * Initialize all inodes in this buffer and then log them.
209 * 209 *
@@ -225,6 +225,7 @@ xfs_ialloc_inode_init(
225 } 225 }
226 xfs_trans_inode_alloc_buf(tp, fbuf); 226 xfs_trans_inode_alloc_buf(tp, fbuf);
227 } 227 }
228 return 0;
228} 229}
229 230
230/* 231/*
@@ -369,9 +370,11 @@ xfs_ialloc_ag_alloc(
369 * rather than a linear progression to prevent the next generation 370 * rather than a linear progression to prevent the next generation
370 * number from being easily guessable. 371 * number from being easily guessable.
371 */ 372 */
372 xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len, 373 error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno,
373 random32()); 374 args.len, random32());
374 375
376 if (error)
377 return error;
375 /* 378 /*
376 * Convert the results. 379 * Convert the results.
377 */ 380 */
@@ -1502,7 +1505,7 @@ xfs_read_agi(
1502 return XFS_ERROR(EFSCORRUPTED); 1505 return XFS_ERROR(EFSCORRUPTED);
1503 } 1506 }
1504 1507
1505 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF); 1508 xfs_buf_set_ref(*bpp, XFS_AGI_REF);
1506 1509
1507 xfs_check_agi_unlinked(agi); 1510 xfs_check_agi_unlinked(agi);
1508 return 0; 1511 return 0;
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 7759812c1bb..0fa98b1c70e 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -75,7 +75,6 @@ xfs_inode_alloc(
75 return NULL; 75 return NULL;
76 } 76 }
77 77
78 ASSERT(atomic_read(&ip->i_iocount) == 0);
79 ASSERT(atomic_read(&ip->i_pincount) == 0); 78 ASSERT(atomic_read(&ip->i_pincount) == 0);
80 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 79 ASSERT(!spin_is_locked(&ip->i_flags_lock));
81 ASSERT(completion_done(&ip->i_flush)); 80 ASSERT(completion_done(&ip->i_flush));
@@ -150,7 +149,6 @@ xfs_inode_free(
150 } 149 }
151 150
152 /* asserts to verify all state is correct here */ 151 /* asserts to verify all state is correct here */
153 ASSERT(atomic_read(&ip->i_iocount) == 0);
154 ASSERT(atomic_read(&ip->i_pincount) == 0); 152 ASSERT(atomic_read(&ip->i_pincount) == 0);
155 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 153 ASSERT(!spin_is_locked(&ip->i_flags_lock));
156 ASSERT(completion_done(&ip->i_flush)); 154 ASSERT(completion_done(&ip->i_flush));
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0239a7c7c88..c0237c602f1 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -190,12 +190,6 @@ xfs_imap_to_bp(
190 } 190 }
191 191
192 xfs_inobp_check(mp, bp); 192 xfs_inobp_check(mp, bp);
193
194 /*
195 * Mark the buffer as an inode buffer now that it looks good
196 */
197 XFS_BUF_SET_VTYPE(bp, B_FS_INO);
198
199 *bpp = bp; 193 *bpp = bp;
200 return 0; 194 return 0;
201} 195}
@@ -1152,7 +1146,7 @@ xfs_ialloc(
1152 /* 1146 /*
1153 * Log the new values stuffed into the inode. 1147 * Log the new values stuffed into the inode.
1154 */ 1148 */
1155 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); 1149 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1156 xfs_trans_log_inode(tp, ip, flags); 1150 xfs_trans_log_inode(tp, ip, flags);
1157 1151
1158 /* now that we have an i_mode we can setup inode ops and unlock */ 1152 /* now that we have an i_mode we can setup inode ops and unlock */
@@ -1187,6 +1181,7 @@ xfs_isize_check(
1187 xfs_fileoff_t map_first; 1181 xfs_fileoff_t map_first;
1188 int nimaps; 1182 int nimaps;
1189 xfs_bmbt_irec_t imaps[2]; 1183 xfs_bmbt_irec_t imaps[2];
1184 int error;
1190 1185
1191 if (!S_ISREG(ip->i_d.di_mode)) 1186 if (!S_ISREG(ip->i_d.di_mode))
1192 return; 1187 return;
@@ -1203,13 +1198,12 @@ xfs_isize_check(
1203 * The filesystem could be shutting down, so bmapi may return 1198 * The filesystem could be shutting down, so bmapi may return
1204 * an error. 1199 * an error.
1205 */ 1200 */
1206 if (xfs_bmapi(NULL, ip, map_first, 1201 error = xfs_bmapi_read(ip, map_first,
1207 (XFS_B_TO_FSB(mp, 1202 (XFS_B_TO_FSB(mp,
1208 (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - 1203 (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first),
1209 map_first), 1204 imaps, &nimaps, XFS_BMAPI_ENTIRE);
1210 XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps, 1205 if (error)
1211 NULL)) 1206 return;
1212 return;
1213 ASSERT(nimaps == 1); 1207 ASSERT(nimaps == 1);
1214 ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); 1208 ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
1215} 1209}
@@ -1297,7 +1291,7 @@ xfs_itruncate_extents(
1297 */ 1291 */
1298 error = xfs_bmap_finish(&tp, &free_list, &committed); 1292 error = xfs_bmap_finish(&tp, &free_list, &committed);
1299 if (committed) 1293 if (committed)
1300 xfs_trans_ijoin(tp, ip); 1294 xfs_trans_ijoin(tp, ip, 0);
1301 if (error) 1295 if (error)
1302 goto out_bmap_cancel; 1296 goto out_bmap_cancel;
1303 1297
@@ -1313,7 +1307,7 @@ xfs_itruncate_extents(
1313 error = xfs_trans_commit(tp, 0); 1307 error = xfs_trans_commit(tp, 0);
1314 tp = ntp; 1308 tp = ntp;
1315 1309
1316 xfs_trans_ijoin(tp, ip); 1310 xfs_trans_ijoin(tp, ip, 0);
1317 1311
1318 if (error) 1312 if (error)
1319 goto out; 1313 goto out;
@@ -1644,7 +1638,7 @@ xfs_iunlink_remove(
1644 * inodes that are in memory - they all must be marked stale and attached to 1638 * inodes that are in memory - they all must be marked stale and attached to
1645 * the cluster buffer. 1639 * the cluster buffer.
1646 */ 1640 */
1647STATIC void 1641STATIC int
1648xfs_ifree_cluster( 1642xfs_ifree_cluster(
1649 xfs_inode_t *free_ip, 1643 xfs_inode_t *free_ip,
1650 xfs_trans_t *tp, 1644 xfs_trans_t *tp,
@@ -1690,6 +1684,8 @@ xfs_ifree_cluster(
1690 mp->m_bsize * blks_per_cluster, 1684 mp->m_bsize * blks_per_cluster,
1691 XBF_LOCK); 1685 XBF_LOCK);
1692 1686
1687 if (!bp)
1688 return ENOMEM;
1693 /* 1689 /*
1694 * Walk the inodes already attached to the buffer and mark them 1690 * Walk the inodes already attached to the buffer and mark them
1695 * stale. These will all have the flush locks held, so an 1691 * stale. These will all have the flush locks held, so an
@@ -1799,6 +1795,7 @@ retry:
1799 } 1795 }
1800 1796
1801 xfs_perag_put(pag); 1797 xfs_perag_put(pag);
1798 return 0;
1802} 1799}
1803 1800
1804/* 1801/*
@@ -1878,10 +1875,10 @@ xfs_ifree(
1878 dip->di_mode = 0; 1875 dip->di_mode = 0;
1879 1876
1880 if (delete) { 1877 if (delete) {
1881 xfs_ifree_cluster(ip, tp, first_ino); 1878 error = xfs_ifree_cluster(ip, tp, first_ino);
1882 } 1879 }
1883 1880
1884 return 0; 1881 return error;
1885} 1882}
1886 1883
1887/* 1884/*
@@ -2472,11 +2469,11 @@ cluster_corrupt_out:
2472 */ 2469 */
2473 if (bp->b_iodone) { 2470 if (bp->b_iodone) {
2474 XFS_BUF_UNDONE(bp); 2471 XFS_BUF_UNDONE(bp);
2475 XFS_BUF_STALE(bp); 2472 xfs_buf_stale(bp);
2476 xfs_buf_ioerror(bp, EIO); 2473 xfs_buf_ioerror(bp, EIO);
2477 xfs_buf_ioend(bp, 0); 2474 xfs_buf_ioend(bp, 0);
2478 } else { 2475 } else {
2479 XFS_BUF_STALE(bp); 2476 xfs_buf_stale(bp);
2480 xfs_buf_relse(bp); 2477 xfs_buf_relse(bp);
2481 } 2478 }
2482 } 2479 }
@@ -2597,9 +2594,11 @@ xfs_iflush(
2597 goto cluster_corrupt_out; 2594 goto cluster_corrupt_out;
2598 2595
2599 if (flags & SYNC_WAIT) 2596 if (flags & SYNC_WAIT)
2600 error = xfs_bwrite(mp, bp); 2597 error = xfs_bwrite(bp);
2601 else 2598 else
2602 xfs_bdwrite(mp, bp); 2599 xfs_buf_delwri_queue(bp);
2600
2601 xfs_buf_relse(bp);
2603 return error; 2602 return error;
2604 2603
2605corrupt_out: 2604corrupt_out:
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 2380a4bcbec..760140d1dd6 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -257,7 +257,6 @@ typedef struct xfs_inode {
257 257
258 xfs_fsize_t i_size; /* in-memory size */ 258 xfs_fsize_t i_size; /* in-memory size */
259 xfs_fsize_t i_new_size; /* size when write completes */ 259 xfs_fsize_t i_new_size; /* size when write completes */
260 atomic_t i_iocount; /* outstanding I/O count */
261 260
262 /* VFS inode */ 261 /* VFS inode */
263 struct inode i_vnode; /* embedded VFS inode */ 262 struct inode i_vnode; /* embedded VFS inode */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 836ad80d4f2..abaafdbb3e6 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -658,10 +658,8 @@ xfs_inode_item_unlock(
658 658
659 lock_flags = iip->ili_lock_flags; 659 lock_flags = iip->ili_lock_flags;
660 iip->ili_lock_flags = 0; 660 iip->ili_lock_flags = 0;
661 if (lock_flags) { 661 if (lock_flags)
662 xfs_iunlock(ip, lock_flags); 662 xfs_iunlock(ip, lock_flags);
663 IRELE(ip);
664 }
665} 663}
666 664
667/* 665/*
@@ -797,7 +795,7 @@ xfs_inode_item_committing(
797/* 795/*
798 * This is the ops vector shared by all buf log items. 796 * This is the ops vector shared by all buf log items.
799 */ 797 */
800static struct xfs_item_ops xfs_inode_item_ops = { 798static const struct xfs_item_ops xfs_inode_item_ops = {
801 .iop_size = xfs_inode_item_size, 799 .iop_size = xfs_inode_item_size,
802 .iop_format = xfs_inode_item_format, 800 .iop_format = xfs_inode_item_format,
803 .iop_pin = xfs_inode_item_pin, 801 .iop_pin = xfs_inode_item_pin,
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index f7ce7debe14..d99a9051890 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1069,7 +1069,7 @@ xfs_ioctl_setattr(
1069 } 1069 }
1070 } 1070 }
1071 1071
1072 xfs_trans_ijoin(tp, ip); 1072 xfs_trans_ijoin(tp, ip, 0);
1073 1073
1074 /* 1074 /*
1075 * Change file ownership. Must be the owner or privileged. 1075 * Change file ownership. Must be the owner or privileged.
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 091d82b94c4..9afa282aa93 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -208,22 +208,20 @@ xfs_iomap_write_direct(
208 if (error) 208 if (error)
209 goto error1; 209 goto error1;
210 210
211 xfs_trans_ijoin(tp, ip); 211 xfs_trans_ijoin(tp, ip, 0);
212 212
213 bmapi_flag = XFS_BMAPI_WRITE; 213 bmapi_flag = 0;
214 if (offset < ip->i_size || extsz) 214 if (offset < ip->i_size || extsz)
215 bmapi_flag |= XFS_BMAPI_PREALLOC; 215 bmapi_flag |= XFS_BMAPI_PREALLOC;
216 216
217 /* 217 /*
218 * Issue the xfs_bmapi() call to allocate the blocks.
219 *
220 * From this point onwards we overwrite the imap pointer that the 218 * From this point onwards we overwrite the imap pointer that the
221 * caller gave to us. 219 * caller gave to us.
222 */ 220 */
223 xfs_bmap_init(&free_list, &firstfsb); 221 xfs_bmap_init(&free_list, &firstfsb);
224 nimaps = 1; 222 nimaps = 1;
225 error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag, 223 error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag,
226 &firstfsb, 0, imap, &nimaps, &free_list); 224 &firstfsb, 0, imap, &nimaps, &free_list);
227 if (error) 225 if (error)
228 goto error0; 226 goto error0;
229 227
@@ -300,8 +298,8 @@ xfs_iomap_eof_want_preallocate(
300 while (count_fsb > 0) { 298 while (count_fsb > 0) {
301 imaps = nimaps; 299 imaps = nimaps;
302 firstblock = NULLFSBLOCK; 300 firstblock = NULLFSBLOCK;
303 error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0, 301 error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps,
304 &firstblock, 0, imap, &imaps, NULL); 302 0);
305 if (error) 303 if (error)
306 return error; 304 return error;
307 for (n = 0; n < imaps; n++) { 305 for (n = 0; n < imaps; n++) {
@@ -381,7 +379,6 @@ xfs_iomap_write_delay(
381 xfs_fileoff_t last_fsb; 379 xfs_fileoff_t last_fsb;
382 xfs_off_t aligned_offset; 380 xfs_off_t aligned_offset;
383 xfs_fileoff_t ioalign; 381 xfs_fileoff_t ioalign;
384 xfs_fsblock_t firstblock;
385 xfs_extlen_t extsz; 382 xfs_extlen_t extsz;
386 int nimaps; 383 int nimaps;
387 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; 384 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
@@ -425,12 +422,8 @@ retry:
425 } 422 }
426 423
427 nimaps = XFS_WRITE_IMAPS; 424 nimaps = XFS_WRITE_IMAPS;
428 firstblock = NULLFSBLOCK; 425 error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb,
429 error = xfs_bmapi(NULL, ip, offset_fsb, 426 imap, &nimaps, XFS_BMAPI_ENTIRE);
430 (xfs_filblks_t)(last_fsb - offset_fsb),
431 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
432 XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
433 &nimaps, NULL);
434 switch (error) { 427 switch (error) {
435 case 0: 428 case 0:
436 case ENOSPC: 429 case ENOSPC:
@@ -535,7 +528,7 @@ xfs_iomap_write_allocate(
535 return XFS_ERROR(error); 528 return XFS_ERROR(error);
536 } 529 }
537 xfs_ilock(ip, XFS_ILOCK_EXCL); 530 xfs_ilock(ip, XFS_ILOCK_EXCL);
538 xfs_trans_ijoin(tp, ip); 531 xfs_trans_ijoin(tp, ip, 0);
539 532
540 xfs_bmap_init(&free_list, &first_block); 533 xfs_bmap_init(&free_list, &first_block);
541 534
@@ -587,14 +580,12 @@ xfs_iomap_write_allocate(
587 } 580 }
588 581
589 /* 582 /*
590 * Go get the actual blocks.
591 *
592 * From this point onwards we overwrite the imap 583 * From this point onwards we overwrite the imap
593 * pointer that the caller gave to us. 584 * pointer that the caller gave to us.
594 */ 585 */
595 error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb, 586 error = xfs_bmapi_write(tp, ip, map_start_fsb,
596 XFS_BMAPI_WRITE, &first_block, 1, 587 count_fsb, 0, &first_block, 1,
597 imap, &nimaps, &free_list); 588 imap, &nimaps, &free_list);
598 if (error) 589 if (error)
599 goto trans_cancel; 590 goto trans_cancel;
600 591
@@ -701,15 +692,15 @@ xfs_iomap_write_unwritten(
701 } 692 }
702 693
703 xfs_ilock(ip, XFS_ILOCK_EXCL); 694 xfs_ilock(ip, XFS_ILOCK_EXCL);
704 xfs_trans_ijoin(tp, ip); 695 xfs_trans_ijoin(tp, ip, 0);
705 696
706 /* 697 /*
707 * Modify the unwritten extent state of the buffer. 698 * Modify the unwritten extent state of the buffer.
708 */ 699 */
709 xfs_bmap_init(&free_list, &firstfsb); 700 xfs_bmap_init(&free_list, &firstfsb);
710 nimaps = 1; 701 nimaps = 1;
711 error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, 702 error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
712 XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb, 703 XFS_BMAPI_CONVERT, &firstfsb,
713 1, &imap, &nimaps, &free_list); 704 1, &imap, &nimaps, &free_list);
714 if (error) 705 if (error)
715 goto error_on_bmapi_transaction; 706 goto error_on_bmapi_transaction;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 28856accb4f..23ce927973a 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -466,7 +466,7 @@ xfs_vn_getattr(
466 trace_xfs_getattr(ip); 466 trace_xfs_getattr(ip);
467 467
468 if (XFS_FORCED_SHUTDOWN(mp)) 468 if (XFS_FORCED_SHUTDOWN(mp))
469 return XFS_ERROR(EIO); 469 return -XFS_ERROR(EIO);
470 470
471 stat->size = XFS_ISIZE(ip); 471 stat->size = XFS_ISIZE(ip);
472 stat->dev = inode->i_sb->s_dev; 472 stat->dev = inode->i_sb->s_dev;
@@ -612,7 +612,7 @@ xfs_setattr_nonsize(
612 } 612 }
613 } 613 }
614 614
615 xfs_trans_ijoin(tp, ip); 615 xfs_trans_ijoin(tp, ip, 0);
616 616
617 /* 617 /*
618 * Change file ownership. Must be the owner or privileged. 618 * Change file ownership. Must be the owner or privileged.
@@ -834,16 +834,16 @@ xfs_setattr_size(
834 * care about here. 834 * care about here.
835 */ 835 */
836 if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { 836 if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) {
837 error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 837 error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0,
838 XBF_ASYNC, FI_NONE); 838 FI_NONE);
839 if (error) 839 if (error)
840 goto out_unlock; 840 goto out_unlock;
841 } 841 }
842 842
843 /* 843 /*
844 * Wait for all I/O to complete. 844 * Wait for all direct I/O to complete.
845 */ 845 */
846 xfs_ioend_wait(ip); 846 inode_dio_wait(inode);
847 847
848 error = -block_truncate_page(inode->i_mapping, iattr->ia_size, 848 error = -block_truncate_page(inode->i_mapping, iattr->ia_size,
849 xfs_get_blocks); 849 xfs_get_blocks);
@@ -864,7 +864,7 @@ xfs_setattr_size(
864 864
865 xfs_ilock(ip, XFS_ILOCK_EXCL); 865 xfs_ilock(ip, XFS_ILOCK_EXCL);
866 866
867 xfs_trans_ijoin(tp, ip); 867 xfs_trans_ijoin(tp, ip, 0);
868 868
869 /* 869 /*
870 * Only change the c/mtime if we are changing the size or we are 870 * Only change the c/mtime if we are changing the size or we are
@@ -1153,7 +1153,7 @@ xfs_setup_inode(
1153 hlist_add_fake(&inode->i_hash); 1153 hlist_add_fake(&inode->i_hash);
1154 1154
1155 inode->i_mode = ip->i_d.di_mode; 1155 inode->i_mode = ip->i_d.di_mode;
1156 inode->i_nlink = ip->i_d.di_nlink; 1156 set_nlink(inode, ip->i_d.di_nlink);
1157 inode->i_uid = ip->i_d.di_uid; 1157 inode->i_uid = ip->i_d.di_uid;
1158 inode->i_gid = ip->i_d.di_gid; 1158 inode->i_gid = ip->i_d.di_gid;
1159 1159
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3a8d4f66d70..a14cd89fe46 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -626,7 +626,7 @@ xfs_log_item_init(
626 struct xfs_mount *mp, 626 struct xfs_mount *mp,
627 struct xfs_log_item *item, 627 struct xfs_log_item *item,
628 int type, 628 int type,
629 struct xfs_item_ops *ops) 629 const struct xfs_item_ops *ops)
630{ 630{
631 item->li_mountp = mp; 631 item->li_mountp = mp;
632 item->li_ailp = mp->m_ail; 632 item->li_ailp = mp->m_ail;
@@ -880,8 +880,8 @@ xlog_iodone(xfs_buf_t *bp)
880 */ 880 */
881 if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp, 881 if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp,
882 XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { 882 XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
883 xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp)); 883 xfs_buf_ioerror_alert(bp, __func__);
884 XFS_BUF_STALE(bp); 884 xfs_buf_stale(bp);
885 xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); 885 xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR);
886 /* 886 /*
887 * This flag will be propagated to the trans-committed 887 * This flag will be propagated to the trans-committed
@@ -1047,7 +1047,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1047 xlog_get_iclog_buffer_size(mp, log); 1047 xlog_get_iclog_buffer_size(mp, log);
1048 1048
1049 error = ENOMEM; 1049 error = ENOMEM;
1050 bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); 1050 bp = xfs_buf_alloc(mp->m_logdev_targp, 0, log->l_iclog_size, 0);
1051 if (!bp) 1051 if (!bp)
1052 goto out_free_log; 1052 goto out_free_log;
1053 bp->b_iodone = xlog_iodone; 1053 bp->b_iodone = xlog_iodone;
@@ -1247,7 +1247,7 @@ xlog_bdstrat(
1247 1247
1248 if (iclog->ic_state & XLOG_STATE_IOERROR) { 1248 if (iclog->ic_state & XLOG_STATE_IOERROR) {
1249 xfs_buf_ioerror(bp, EIO); 1249 xfs_buf_ioerror(bp, EIO);
1250 XFS_BUF_STALE(bp); 1250 xfs_buf_stale(bp);
1251 xfs_buf_ioend(bp, 0); 1251 xfs_buf_ioend(bp, 0);
1252 /* 1252 /*
1253 * It would seem logical to return EIO here, but we rely on 1253 * It would seem logical to return EIO here, but we rely on
@@ -1387,9 +1387,9 @@ xlog_sync(xlog_t *log,
1387 */ 1387 */
1388 XFS_BUF_WRITE(bp); 1388 XFS_BUF_WRITE(bp);
1389 1389
1390 if ((error = xlog_bdstrat(bp))) { 1390 error = xlog_bdstrat(bp);
1391 xfs_ioerror_alert("xlog_sync", log->l_mp, bp, 1391 if (error) {
1392 XFS_BUF_ADDR(bp)); 1392 xfs_buf_ioerror_alert(bp, "xlog_sync");
1393 return error; 1393 return error;
1394 } 1394 }
1395 if (split) { 1395 if (split) {
@@ -1423,9 +1423,9 @@ xlog_sync(xlog_t *log,
1423 /* account for internal log which doesn't start at block #0 */ 1423 /* account for internal log which doesn't start at block #0 */
1424 XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); 1424 XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
1425 XFS_BUF_WRITE(bp); 1425 XFS_BUF_WRITE(bp);
1426 if ((error = xlog_bdstrat(bp))) { 1426 error = xlog_bdstrat(bp);
1427 xfs_ioerror_alert("xlog_sync (split)", log->l_mp, 1427 if (error) {
1428 bp, XFS_BUF_ADDR(bp)); 1428 xfs_buf_ioerror_alert(bp, "xlog_sync (split)");
1429 return error; 1429 return error;
1430 } 1430 }
1431 } 1431 }
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 78c9039994a..3f7bf451c03 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -137,7 +137,7 @@ struct xfs_trans;
137void xfs_log_item_init(struct xfs_mount *mp, 137void xfs_log_item_init(struct xfs_mount *mp,
138 struct xfs_log_item *item, 138 struct xfs_log_item *item,
139 int type, 139 int type,
140 struct xfs_item_ops *ops); 140 const struct xfs_item_ops *ops);
141 141
142xfs_lsn_t xfs_log_done(struct xfs_mount *mp, 142xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
143 struct xlog_ticket *ticket, 143 struct xlog_ticket *ticket,
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index a199dbcee7d..541a508adea 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -183,8 +183,7 @@ xlog_bread_noalign(
183 xfsbdstrat(log->l_mp, bp); 183 xfsbdstrat(log->l_mp, bp);
184 error = xfs_buf_iowait(bp); 184 error = xfs_buf_iowait(bp);
185 if (error) 185 if (error)
186 xfs_ioerror_alert("xlog_bread", log->l_mp, 186 xfs_buf_ioerror_alert(bp, __func__);
187 bp, XFS_BUF_ADDR(bp));
188 return error; 187 return error;
189} 188}
190 189
@@ -268,9 +267,10 @@ xlog_bwrite(
268 xfs_buf_lock(bp); 267 xfs_buf_lock(bp);
269 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); 268 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
270 269
271 if ((error = xfs_bwrite(log->l_mp, bp))) 270 error = xfs_bwrite(bp);
272 xfs_ioerror_alert("xlog_bwrite", log->l_mp, 271 if (error)
273 bp, XFS_BUF_ADDR(bp)); 272 xfs_buf_ioerror_alert(bp, __func__);
273 xfs_buf_relse(bp);
274 return error; 274 return error;
275} 275}
276 276
@@ -361,9 +361,7 @@ xlog_recover_iodone(
361 * We're not going to bother about retrying 361 * We're not going to bother about retrying
362 * this during recovery. One strike! 362 * this during recovery. One strike!
363 */ 363 */
364 xfs_ioerror_alert("xlog_recover_iodone", 364 xfs_buf_ioerror_alert(bp, __func__);
365 bp->b_target->bt_mount, bp,
366 XFS_BUF_ADDR(bp));
367 xfs_force_shutdown(bp->b_target->bt_mount, 365 xfs_force_shutdown(bp->b_target->bt_mount,
368 SHUTDOWN_META_IO_ERROR); 366 SHUTDOWN_META_IO_ERROR);
369 } 367 }
@@ -2135,8 +2133,7 @@ xlog_recover_buffer_pass2(
2135 return XFS_ERROR(ENOMEM); 2133 return XFS_ERROR(ENOMEM);
2136 error = bp->b_error; 2134 error = bp->b_error;
2137 if (error) { 2135 if (error) {
2138 xfs_ioerror_alert("xlog_recover_do..(read#1)", mp, 2136 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
2139 bp, buf_f->blf_blkno);
2140 xfs_buf_relse(bp); 2137 xfs_buf_relse(bp);
2141 return error; 2138 return error;
2142 } 2139 }
@@ -2171,15 +2168,16 @@ xlog_recover_buffer_pass2(
2171 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && 2168 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2172 (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, 2169 (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
2173 (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { 2170 (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
2174 XFS_BUF_STALE(bp); 2171 xfs_buf_stale(bp);
2175 error = xfs_bwrite(mp, bp); 2172 error = xfs_bwrite(bp);
2176 } else { 2173 } else {
2177 ASSERT(bp->b_target->bt_mount == mp); 2174 ASSERT(bp->b_target->bt_mount == mp);
2178 bp->b_iodone = xlog_recover_iodone; 2175 bp->b_iodone = xlog_recover_iodone;
2179 xfs_bdwrite(mp, bp); 2176 xfs_buf_delwri_queue(bp);
2180 } 2177 }
2181 2178
2182 return (error); 2179 xfs_buf_relse(bp);
2180 return error;
2183} 2181}
2184 2182
2185STATIC int 2183STATIC int
@@ -2230,8 +2228,7 @@ xlog_recover_inode_pass2(
2230 } 2228 }
2231 error = bp->b_error; 2229 error = bp->b_error;
2232 if (error) { 2230 if (error) {
2233 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, 2231 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
2234 bp, in_f->ilf_blkno);
2235 xfs_buf_relse(bp); 2232 xfs_buf_relse(bp);
2236 goto error; 2233 goto error;
2237 } 2234 }
@@ -2439,7 +2436,8 @@ xlog_recover_inode_pass2(
2439write_inode_buffer: 2436write_inode_buffer:
2440 ASSERT(bp->b_target->bt_mount == mp); 2437 ASSERT(bp->b_target->bt_mount == mp);
2441 bp->b_iodone = xlog_recover_iodone; 2438 bp->b_iodone = xlog_recover_iodone;
2442 xfs_bdwrite(mp, bp); 2439 xfs_buf_delwri_queue(bp);
2440 xfs_buf_relse(bp);
2443error: 2441error:
2444 if (need_free) 2442 if (need_free)
2445 kmem_free(in_f); 2443 kmem_free(in_f);
@@ -2537,8 +2535,7 @@ xlog_recover_dquot_pass2(
2537 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 2535 XFS_FSB_TO_BB(mp, dq_f->qlf_len),
2538 0, &bp); 2536 0, &bp);
2539 if (error) { 2537 if (error) {
2540 xfs_ioerror_alert("xlog_recover_do..(read#3)", mp, 2538 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#3)");
2541 bp, dq_f->qlf_blkno);
2542 return error; 2539 return error;
2543 } 2540 }
2544 ASSERT(bp); 2541 ASSERT(bp);
@@ -2561,7 +2558,8 @@ xlog_recover_dquot_pass2(
2561 ASSERT(dq_f->qlf_size == 2); 2558 ASSERT(dq_f->qlf_size == 2);
2562 ASSERT(bp->b_target->bt_mount == mp); 2559 ASSERT(bp->b_target->bt_mount == mp);
2563 bp->b_iodone = xlog_recover_iodone; 2560 bp->b_iodone = xlog_recover_iodone;
2564 xfs_bdwrite(mp, bp); 2561 xfs_buf_delwri_queue(bp);
2562 xfs_buf_relse(bp);
2565 2563
2566 return (0); 2564 return (0);
2567} 2565}
@@ -3656,7 +3654,7 @@ xlog_do_recover(
3656 return error; 3654 return error;
3657 } 3655 }
3658 3656
3659 XFS_bflush(log->l_mp->m_ddev_targp); 3657 xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1);
3660 3658
3661 /* 3659 /*
3662 * If IO errors happened during recovery, bail out. 3660 * If IO errors happened during recovery, bail out.
@@ -3689,8 +3687,7 @@ xlog_do_recover(
3689 xfsbdstrat(log->l_mp, bp); 3687 xfsbdstrat(log->l_mp, bp);
3690 error = xfs_buf_iowait(bp); 3688 error = xfs_buf_iowait(bp);
3691 if (error) { 3689 if (error) {
3692 xfs_ioerror_alert("xlog_do_recover", 3690 xfs_buf_ioerror_alert(bp, __func__);
3693 log->l_mp, bp, XFS_BUF_ADDR(bp));
3694 ASSERT(0); 3691 ASSERT(0);
3695 xfs_buf_relse(bp); 3692 xfs_buf_relse(bp);
3696 return error; 3693 return error;
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index 7fb7ea00767..56dc0c17f16 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -3,31 +3,29 @@
3 3
4struct xfs_mount; 4struct xfs_mount;
5 5
6extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...) 6extern __printf(2, 3)
7 __attribute__ ((format (printf, 2, 3))); 7void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...);
8extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...) 8extern __printf(2, 3)
9 __attribute__ ((format (printf, 2, 3))); 9void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...);
10extern void xfs_alert_tag(const struct xfs_mount *mp, int tag, 10extern __printf(3, 4)
11 const char *fmt, ...) 11void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...);
12 __attribute__ ((format (printf, 3, 4))); 12extern __printf(2, 3)
13extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...) 13void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...);
14 __attribute__ ((format (printf, 2, 3))); 14extern __printf(2, 3)
15extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...) 15void xfs_err(const struct xfs_mount *mp, const char *fmt, ...);
16 __attribute__ ((format (printf, 2, 3))); 16extern __printf(2, 3)
17extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...) 17void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...);
18 __attribute__ ((format (printf, 2, 3))); 18extern __printf(2, 3)
19extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...) 19void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...);
20 __attribute__ ((format (printf, 2, 3))); 20extern __printf(2, 3)
21extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...) 21void xfs_info(const struct xfs_mount *mp, const char *fmt, ...);
22 __attribute__ ((format (printf, 2, 3)));
23 22
24#ifdef DEBUG 23#ifdef DEBUG
25extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) 24extern __printf(2, 3)
26 __attribute__ ((format (printf, 2, 3))); 25void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...);
27#else 26#else
28static inline void 27static inline __printf(2, 3)
29__attribute__ ((format (printf, 2, 3))) 28void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
30xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
31{ 29{
32} 30}
33#endif 31#endif
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 0081657ad98..d06afbc3540 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -44,9 +44,6 @@
44#include "xfs_trace.h" 44#include "xfs_trace.h"
45 45
46 46
47STATIC void xfs_unmountfs_wait(xfs_mount_t *);
48
49
50#ifdef HAVE_PERCPU_SB 47#ifdef HAVE_PERCPU_SB
51STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, 48STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
52 int); 49 int);
@@ -1484,7 +1481,7 @@ xfs_unmountfs(
1484 * state as much as possible. 1481 * state as much as possible.
1485 */ 1482 */
1486 xfs_reclaim_inodes(mp, 0); 1483 xfs_reclaim_inodes(mp, 0);
1487 XFS_bflush(mp->m_ddev_targp); 1484 xfs_flush_buftarg(mp->m_ddev_targp, 1);
1488 xfs_reclaim_inodes(mp, SYNC_WAIT); 1485 xfs_reclaim_inodes(mp, SYNC_WAIT);
1489 1486
1490 xfs_qm_unmount(mp); 1487 xfs_qm_unmount(mp);
@@ -1496,11 +1493,6 @@ xfs_unmountfs(
1496 */ 1493 */
1497 xfs_log_force(mp, XFS_LOG_SYNC); 1494 xfs_log_force(mp, XFS_LOG_SYNC);
1498 1495
1499 xfs_binval(mp->m_ddev_targp);
1500 if (mp->m_rtdev_targp) {
1501 xfs_binval(mp->m_rtdev_targp);
1502 }
1503
1504 /* 1496 /*
1505 * Unreserve any blocks we have so that when we unmount we don't account 1497 * Unreserve any blocks we have so that when we unmount we don't account
1506 * the reserved free space as used. This is really only necessary for 1498 * the reserved free space as used. This is really only necessary for
@@ -1526,7 +1518,16 @@ xfs_unmountfs(
1526 xfs_warn(mp, "Unable to update superblock counters. " 1518 xfs_warn(mp, "Unable to update superblock counters. "
1527 "Freespace may not be correct on next mount."); 1519 "Freespace may not be correct on next mount.");
1528 xfs_unmountfs_writesb(mp); 1520 xfs_unmountfs_writesb(mp);
1529 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1521
1522 /*
1523 * Make sure all buffers have been flushed and completed before
1524 * unmounting the log.
1525 */
1526 error = xfs_flush_buftarg(mp->m_ddev_targp, 1);
1527 if (error)
1528 xfs_warn(mp, "%d busy buffers during unmount.", error);
1529 xfs_wait_buftarg(mp->m_ddev_targp);
1530
1530 xfs_log_unmount_write(mp); 1531 xfs_log_unmount_write(mp);
1531 xfs_log_unmount(mp); 1532 xfs_log_unmount(mp);
1532 xfs_uuid_unmount(mp); 1533 xfs_uuid_unmount(mp);
@@ -1537,16 +1538,6 @@ xfs_unmountfs(
1537 xfs_free_perag(mp); 1538 xfs_free_perag(mp);
1538} 1539}
1539 1540
1540STATIC void
1541xfs_unmountfs_wait(xfs_mount_t *mp)
1542{
1543 if (mp->m_logdev_targp != mp->m_ddev_targp)
1544 xfs_wait_buftarg(mp->m_logdev_targp);
1545 if (mp->m_rtdev_targp)
1546 xfs_wait_buftarg(mp->m_rtdev_targp);
1547 xfs_wait_buftarg(mp->m_ddev_targp);
1548}
1549
1550int 1541int
1551xfs_fs_writable(xfs_mount_t *mp) 1542xfs_fs_writable(xfs_mount_t *mp)
1552{ 1543{
@@ -1612,15 +1603,14 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1612 1603
1613 XFS_BUF_UNDONE(sbp); 1604 XFS_BUF_UNDONE(sbp);
1614 XFS_BUF_UNREAD(sbp); 1605 XFS_BUF_UNREAD(sbp);
1615 XFS_BUF_UNDELAYWRITE(sbp); 1606 xfs_buf_delwri_dequeue(sbp);
1616 XFS_BUF_WRITE(sbp); 1607 XFS_BUF_WRITE(sbp);
1617 XFS_BUF_UNASYNC(sbp); 1608 XFS_BUF_UNASYNC(sbp);
1618 ASSERT(sbp->b_target == mp->m_ddev_targp); 1609 ASSERT(sbp->b_target == mp->m_ddev_targp);
1619 xfsbdstrat(mp, sbp); 1610 xfsbdstrat(mp, sbp);
1620 error = xfs_buf_iowait(sbp); 1611 error = xfs_buf_iowait(sbp);
1621 if (error) 1612 if (error)
1622 xfs_ioerror_alert("xfs_unmountfs_writesb", 1613 xfs_buf_ioerror_alert(sbp, __func__);
1623 mp, sbp, XFS_BUF_ADDR(sbp));
1624 xfs_buf_relse(sbp); 1614 xfs_buf_relse(sbp);
1625 } 1615 }
1626 return error; 1616 return error;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 9a0aa76facd..5cff443f6cd 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1296,7 +1296,8 @@ xfs_qm_dqiter_bufs(
1296 break; 1296 break;
1297 1297
1298 xfs_qm_reset_dqcounts(mp, bp, firstid, type); 1298 xfs_qm_reset_dqcounts(mp, bp, firstid, type);
1299 xfs_bdwrite(mp, bp); 1299 xfs_buf_delwri_queue(bp);
1300 xfs_buf_relse(bp);
1300 /* 1301 /*
1301 * goto the next block. 1302 * goto the next block.
1302 */ 1303 */
@@ -1346,11 +1347,8 @@ xfs_qm_dqiterate(
1346 * the inode is never added to the transaction. 1347 * the inode is never added to the transaction.
1347 */ 1348 */
1348 xfs_ilock(qip, XFS_ILOCK_SHARED); 1349 xfs_ilock(qip, XFS_ILOCK_SHARED);
1349 error = xfs_bmapi(NULL, qip, lblkno, 1350 error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno,
1350 maxlblkcnt - lblkno, 1351 map, &nmaps, 0);
1351 XFS_BMAPI_METADATA,
1352 NULL,
1353 0, map, &nmaps, NULL);
1354 xfs_iunlock(qip, XFS_ILOCK_SHARED); 1352 xfs_iunlock(qip, XFS_ILOCK_SHARED);
1355 if (error) 1353 if (error)
1356 break; 1354 break;
@@ -1683,7 +1681,7 @@ xfs_qm_quotacheck(
1683 * quotacheck'd stamp on the superblock. So, here we do a synchronous 1681 * quotacheck'd stamp on the superblock. So, here we do a synchronous
1684 * flush. 1682 * flush.
1685 */ 1683 */
1686 XFS_bflush(mp->m_ddev_targp); 1684 xfs_flush_buftarg(mp->m_ddev_targp, 1);
1687 1685
1688 /* 1686 /*
1689 * If one type of quotas is off, then it will lose its 1687 * If one type of quotas is off, then it will lose its
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 609246f42e6..5cc3dde1bc9 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -261,7 +261,7 @@ xfs_qm_scall_trunc_qfile(
261 } 261 }
262 262
263 xfs_ilock(ip, XFS_ILOCK_EXCL); 263 xfs_ilock(ip, XFS_ILOCK_EXCL);
264 xfs_trans_ijoin(tp, ip); 264 xfs_trans_ijoin(tp, ip, 0);
265 265
266 error = xfs_itruncate_data(&tp, ip, 0); 266 error = xfs_itruncate_data(&tp, ip, 0);
267 if (error) { 267 if (error) {
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index df78c297d1a..866de277079 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -170,12 +170,12 @@ xfs_rename(
170 * we can rely on either trans_commit or trans_cancel to unlock 170 * we can rely on either trans_commit or trans_cancel to unlock
171 * them. 171 * them.
172 */ 172 */
173 xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL); 173 xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
174 if (new_parent) 174 if (new_parent)
175 xfs_trans_ijoin_ref(tp, target_dp, XFS_ILOCK_EXCL); 175 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
176 xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL); 176 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
177 if (target_ip) 177 if (target_ip)
178 xfs_trans_ijoin_ref(tp, target_ip, XFS_ILOCK_EXCL); 178 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
179 179
180 /* 180 /*
181 * If we are using project inheritance, we only allow renames 181 * If we are using project inheritance, we only allow renames
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 35561a511b5..87323f1ded6 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -112,7 +112,7 @@ xfs_growfs_rt_alloc(
112 * Lock the inode. 112 * Lock the inode.
113 */ 113 */
114 xfs_ilock(ip, XFS_ILOCK_EXCL); 114 xfs_ilock(ip, XFS_ILOCK_EXCL);
115 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); 115 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
116 116
117 xfs_bmap_init(&flist, &firstblock); 117 xfs_bmap_init(&flist, &firstblock);
118 /* 118 /*
@@ -120,9 +120,9 @@ xfs_growfs_rt_alloc(
120 */ 120 */
121 nmap = 1; 121 nmap = 1;
122 cancelflags |= XFS_TRANS_ABORT; 122 cancelflags |= XFS_TRANS_ABORT;
123 error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks, 123 error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
124 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock, 124 XFS_BMAPI_METADATA, &firstblock,
125 resblks, &map, &nmap, &flist); 125 resblks, &map, &nmap, &flist);
126 if (!error && nmap < 1) 126 if (!error && nmap < 1)
127 error = XFS_ERROR(ENOSPC); 127 error = XFS_ERROR(ENOSPC);
128 if (error) 128 if (error)
@@ -155,7 +155,7 @@ xfs_growfs_rt_alloc(
155 * Lock the bitmap inode. 155 * Lock the bitmap inode.
156 */ 156 */
157 xfs_ilock(ip, XFS_ILOCK_EXCL); 157 xfs_ilock(ip, XFS_ILOCK_EXCL);
158 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); 158 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
159 /* 159 /*
160 * Get a buffer for the block. 160 * Get a buffer for the block.
161 */ 161 */
@@ -856,33 +856,23 @@ xfs_rtbuf_get(
856 xfs_buf_t **bpp) /* output: buffer for the block */ 856 xfs_buf_t **bpp) /* output: buffer for the block */
857{ 857{
858 xfs_buf_t *bp; /* block buffer, result */ 858 xfs_buf_t *bp; /* block buffer, result */
859 xfs_daddr_t d; /* disk addr of block */
860 int error; /* error value */
861 xfs_fsblock_t fsb; /* fs block number for block */
862 xfs_inode_t *ip; /* bitmap or summary inode */ 859 xfs_inode_t *ip; /* bitmap or summary inode */
860 xfs_bmbt_irec_t map;
861 int nmap;
862 int error; /* error value */
863 863
864 ip = issum ? mp->m_rsumip : mp->m_rbmip; 864 ip = issum ? mp->m_rsumip : mp->m_rbmip;
865 /* 865
866 * Map from the file offset (block) and inode number to the 866 error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK);
867 * file system block. 867 if (error)
868 */
869 error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, block);
870 if (error) {
871 return error; 868 return error;
872 } 869
873 ASSERT(fsb != NULLFSBLOCK); 870 ASSERT(map.br_startblock != NULLFSBLOCK);
874 /* 871 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
875 * Convert to disk address for buffer cache. 872 XFS_FSB_TO_DADDR(mp, map.br_startblock),
876 */
877 d = XFS_FSB_TO_DADDR(mp, fsb);
878 /*
879 * Read the buffer.
880 */
881 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
882 mp->m_bsize, 0, &bp); 873 mp->m_bsize, 0, &bp);
883 if (error) { 874 if (error)
884 return error; 875 return error;
885 }
886 ASSERT(!xfs_buf_geterror(bp)); 876 ASSERT(!xfs_buf_geterror(bp));
887 *bpp = bp; 877 *bpp = bp;
888 return 0; 878 return 0;
@@ -1970,7 +1960,7 @@ xfs_growfs_rt(
1970 * Lock out other callers by grabbing the bitmap inode lock. 1960 * Lock out other callers by grabbing the bitmap inode lock.
1971 */ 1961 */
1972 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); 1962 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
1973 xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); 1963 xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
1974 /* 1964 /*
1975 * Update the bitmap inode's size. 1965 * Update the bitmap inode's size.
1976 */ 1966 */
@@ -1982,7 +1972,7 @@ xfs_growfs_rt(
1982 * Get the summary inode into the transaction. 1972 * Get the summary inode into the transaction.
1983 */ 1973 */
1984 xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); 1974 xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
1985 xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL); 1975 xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
1986 /* 1976 /*
1987 * Update the summary inode's size. 1977 * Update the summary inode's size.
1988 */ 1978 */
@@ -2153,7 +2143,7 @@ xfs_rtfree_extent(
2153 * Synchronize by locking the bitmap inode. 2143 * Synchronize by locking the bitmap inode.
2154 */ 2144 */
2155 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); 2145 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
2156 xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); 2146 xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
2157 2147
2158#if defined(__KERNEL__) && defined(DEBUG) 2148#if defined(__KERNEL__) && defined(DEBUG)
2159 /* 2149 /*
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index c96a8a05ac0..597d044a09a 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -92,24 +92,6 @@ xfs_do_force_shutdown(
92} 92}
93 93
94/* 94/*
95 * Prints out an ALERT message about I/O error.
96 */
97void
98xfs_ioerror_alert(
99 char *func,
100 struct xfs_mount *mp,
101 xfs_buf_t *bp,
102 xfs_daddr_t blkno)
103{
104 xfs_alert(mp,
105 "I/O error occurred: meta-data dev %s block 0x%llx"
106 " (\"%s\") error %d buf count %zd",
107 xfs_buf_target_name(bp->b_target),
108 (__uint64_t)blkno, func,
109 bp->b_error, XFS_BUF_COUNT(bp));
110}
111
112/*
113 * This isn't an absolute requirement, but it is 95 * This isn't an absolute requirement, but it is
114 * just a good idea to call xfs_read_buf instead of 96 * just a good idea to call xfs_read_buf instead of
115 * directly doing a read_buf call. For one, we shouldn't 97 * directly doing a read_buf call. For one, we shouldn't
@@ -143,14 +125,13 @@ xfs_read_buf(
143 } else { 125 } else {
144 *bpp = NULL; 126 *bpp = NULL;
145 if (error) { 127 if (error) {
146 xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp)); 128 xfs_buf_ioerror_alert(bp, __func__);
147 } else { 129 } else {
148 error = XFS_ERROR(EIO); 130 error = XFS_ERROR(EIO);
149 } 131 }
150 if (bp) { 132 if (bp) {
151 XFS_BUF_UNDONE(bp); 133 XFS_BUF_UNDONE(bp);
152 XFS_BUF_UNDELAYWRITE(bp); 134 xfs_buf_stale(bp);
153 XFS_BUF_STALE(bp);
154 /* 135 /*
155 * brelse clears B_ERROR and b_error 136 * brelse clears B_ERROR and b_error
156 */ 137 */
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index 11c41ec6ed7..bbdb9ad6a4b 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -42,8 +42,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
42extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, 42extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
43 xfs_daddr_t blkno, int len, uint flags, 43 xfs_daddr_t blkno, int len, uint flags,
44 struct xfs_buf **bpp); 44 struct xfs_buf **bpp);
45extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
46 xfs_buf_t *bp, xfs_daddr_t blkno);
47extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); 45extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
48 46
49#endif /* __XFS_RW_H__ */ 47#endif /* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 5cf06b85fd9..3eca58f51ae 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -796,8 +796,6 @@ xfs_fs_destroy_inode(
796 if (is_bad_inode(inode)) 796 if (is_bad_inode(inode))
797 goto out_reclaim; 797 goto out_reclaim;
798 798
799 xfs_ioend_wait(ip);
800
801 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); 799 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
802 800
803 /* 801 /*
@@ -837,7 +835,6 @@ xfs_fs_inode_init_once(
837 inode_init_once(VFS_I(ip)); 835 inode_init_once(VFS_I(ip));
838 836
839 /* xfs inode */ 837 /* xfs inode */
840 atomic_set(&ip->i_iocount, 0);
841 atomic_set(&ip->i_pincount, 0); 838 atomic_set(&ip->i_pincount, 0);
842 spin_lock_init(&ip->i_flags_lock); 839 spin_lock_init(&ip->i_flags_lock);
843 init_waitqueue_head(&ip->i_ipin_wait); 840 init_waitqueue_head(&ip->i_ipin_wait);
@@ -887,7 +884,7 @@ xfs_log_inode(
887 } 884 }
888 885
889 xfs_ilock(ip, XFS_ILOCK_EXCL); 886 xfs_ilock(ip, XFS_ILOCK_EXCL);
890 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); 887 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
891 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 888 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
892 return xfs_trans_commit(tp, 0); 889 return xfs_trans_commit(tp, 0);
893} 890}
@@ -914,9 +911,8 @@ xfs_fs_write_inode(
914 * of forcing it all the way to stable storage using a 911 * of forcing it all the way to stable storage using a
915 * synchronous transaction we let the log force inside the 912 * synchronous transaction we let the log force inside the
916 * ->sync_fs call do that for thus, which reduces the number 913 * ->sync_fs call do that for thus, which reduces the number
917 * of synchronous log foces dramatically. 914 * of synchronous log forces dramatically.
918 */ 915 */
919 xfs_ioend_wait(ip);
920 error = xfs_log_inode(ip); 916 error = xfs_log_inode(ip);
921 if (error) 917 if (error)
922 goto out; 918 goto out;
@@ -1019,7 +1015,7 @@ xfs_fs_put_super(
1019 */ 1015 */
1020 xfs_filestream_unmount(mp); 1016 xfs_filestream_unmount(mp);
1021 1017
1022 XFS_bflush(mp->m_ddev_targp); 1018 xfs_flush_buftarg(mp->m_ddev_targp, 1);
1023 1019
1024 xfs_unmountfs(mp); 1020 xfs_unmountfs(mp);
1025 xfs_freesb(mp); 1021 xfs_freesb(mp);
@@ -1443,7 +1439,7 @@ xfs_fs_fill_super(
1443 */ 1439 */
1444 xfs_filestream_unmount(mp); 1440 xfs_filestream_unmount(mp);
1445 1441
1446 XFS_bflush(mp->m_ddev_targp); 1442 xfs_flush_buftarg(mp->m_ddev_targp, 1);
1447 1443
1448 xfs_unmountfs(mp); 1444 xfs_unmountfs(mp);
1449 goto out_free_sb; 1445 goto out_free_sb;
@@ -1670,7 +1666,6 @@ init_xfs_fs(void)
1670 printk(KERN_INFO XFS_VERSION_STRING " with " 1666 printk(KERN_INFO XFS_VERSION_STRING " with "
1671 XFS_BUILD_OPTIONS " enabled\n"); 1667 XFS_BUILD_OPTIONS " enabled\n");
1672 1668
1673 xfs_ioend_init();
1674 xfs_dir_startup(); 1669 xfs_dir_startup();
1675 1670
1676 error = xfs_init_zones(); 1671 error = xfs_init_zones();
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 4604f90f86a..aa3dc1a4d53 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -227,21 +227,17 @@ xfs_sync_inode_data(
227 int error = 0; 227 int error = 0;
228 228
229 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 229 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
230 goto out_wait; 230 return 0;
231 231
232 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { 232 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
233 if (flags & SYNC_TRYLOCK) 233 if (flags & SYNC_TRYLOCK)
234 goto out_wait; 234 return 0;
235 xfs_ilock(ip, XFS_IOLOCK_SHARED); 235 xfs_ilock(ip, XFS_IOLOCK_SHARED);
236 } 236 }
237 237
238 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? 238 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
239 0 : XBF_ASYNC, FI_NONE); 239 0 : XBF_ASYNC, FI_NONE);
240 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 240 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
241
242 out_wait:
243 if (flags & SYNC_WAIT)
244 xfs_ioend_wait(ip);
245 return error; 241 return error;
246} 242}
247 243
@@ -322,6 +318,7 @@ xfs_sync_fsdata(
322 struct xfs_mount *mp) 318 struct xfs_mount *mp)
323{ 319{
324 struct xfs_buf *bp; 320 struct xfs_buf *bp;
321 int error;
325 322
326 /* 323 /*
327 * If the buffer is pinned then push on the log so we won't get stuck 324 * If the buffer is pinned then push on the log so we won't get stuck
@@ -334,8 +331,9 @@ xfs_sync_fsdata(
334 bp = xfs_getsb(mp, 0); 331 bp = xfs_getsb(mp, 0);
335 if (xfs_buf_ispinned(bp)) 332 if (xfs_buf_ispinned(bp))
336 xfs_log_force(mp, 0); 333 xfs_log_force(mp, 0);
337 334 error = xfs_bwrite(bp);
338 return xfs_bwrite(mp, bp); 335 xfs_buf_relse(bp);
336 return error;
339} 337}
340 338
341/* 339/*
@@ -379,7 +377,7 @@ xfs_quiesce_data(
379 377
380 /* flush data-only devices */ 378 /* flush data-only devices */
381 if (mp->m_rtdev_targp) 379 if (mp->m_rtdev_targp)
382 XFS_bflush(mp->m_rtdev_targp); 380 xfs_flush_buftarg(mp->m_rtdev_targp, 1);
383 381
384 return error ? error : error2; 382 return error ? error : error2;
385} 383}
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 690fc7a7bd7..f1d2802b2f0 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -30,6 +30,7 @@ struct xfs_buf_log_item;
30struct xfs_da_args; 30struct xfs_da_args;
31struct xfs_da_node_entry; 31struct xfs_da_node_entry;
32struct xfs_dquot; 32struct xfs_dquot;
33struct xfs_log_item;
33struct xlog_ticket; 34struct xlog_ticket;
34struct log; 35struct log;
35struct xlog_recover; 36struct xlog_recover;
@@ -320,7 +321,6 @@ DEFINE_BUF_EVENT(xfs_buf_rele);
320DEFINE_BUF_EVENT(xfs_buf_iodone); 321DEFINE_BUF_EVENT(xfs_buf_iodone);
321DEFINE_BUF_EVENT(xfs_buf_iorequest); 322DEFINE_BUF_EVENT(xfs_buf_iorequest);
322DEFINE_BUF_EVENT(xfs_buf_bawrite); 323DEFINE_BUF_EVENT(xfs_buf_bawrite);
323DEFINE_BUF_EVENT(xfs_buf_bdwrite);
324DEFINE_BUF_EVENT(xfs_buf_lock); 324DEFINE_BUF_EVENT(xfs_buf_lock);
325DEFINE_BUF_EVENT(xfs_buf_lock_done); 325DEFINE_BUF_EVENT(xfs_buf_lock_done);
326DEFINE_BUF_EVENT(xfs_buf_trylock); 326DEFINE_BUF_EVENT(xfs_buf_trylock);
@@ -577,6 +577,7 @@ DEFINE_INODE_EVENT(xfs_vm_bmap);
577DEFINE_INODE_EVENT(xfs_file_ioctl); 577DEFINE_INODE_EVENT(xfs_file_ioctl);
578DEFINE_INODE_EVENT(xfs_file_compat_ioctl); 578DEFINE_INODE_EVENT(xfs_file_compat_ioctl);
579DEFINE_INODE_EVENT(xfs_ioctl_setattr); 579DEFINE_INODE_EVENT(xfs_ioctl_setattr);
580DEFINE_INODE_EVENT(xfs_dir_fsync);
580DEFINE_INODE_EVENT(xfs_file_fsync); 581DEFINE_INODE_EVENT(xfs_file_fsync);
581DEFINE_INODE_EVENT(xfs_destroy_inode); 582DEFINE_INODE_EVENT(xfs_destroy_inode);
582DEFINE_INODE_EVENT(xfs_write_inode); 583DEFINE_INODE_EVENT(xfs_write_inode);
@@ -853,6 +854,42 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
853DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); 854DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
854DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); 855DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
855 856
857DECLARE_EVENT_CLASS(xfs_log_item_class,
858 TP_PROTO(struct xfs_log_item *lip),
859 TP_ARGS(lip),
860 TP_STRUCT__entry(
861 __field(dev_t, dev)
862 __field(void *, lip)
863 __field(uint, type)
864 __field(uint, flags)
865 __field(xfs_lsn_t, lsn)
866 ),
867 TP_fast_assign(
868 __entry->dev = lip->li_mountp->m_super->s_dev;
869 __entry->lip = lip;
870 __entry->type = lip->li_type;
871 __entry->flags = lip->li_flags;
872 __entry->lsn = lip->li_lsn;
873 ),
874 TP_printk("dev %d:%d lip 0x%p lsn %d/%d type %s flags %s",
875 MAJOR(__entry->dev), MINOR(__entry->dev),
876 __entry->lip,
877 CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn),
878 __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
879 __print_flags(__entry->flags, "|", XFS_LI_FLAGS))
880)
881
882#define DEFINE_LOG_ITEM_EVENT(name) \
883DEFINE_EVENT(xfs_log_item_class, name, \
884 TP_PROTO(struct xfs_log_item *lip), \
885 TP_ARGS(lip))
886DEFINE_LOG_ITEM_EVENT(xfs_ail_push);
887DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf);
888DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf_pinned);
889DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
890DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
891
892
856DECLARE_EVENT_CLASS(xfs_file_class, 893DECLARE_EVENT_CLASS(xfs_file_class,
857 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), 894 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
858 TP_ARGS(ip, count, offset, flags), 895 TP_ARGS(ip, count, offset, flags),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index efc147f0e9b..1f35b2feca9 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1790,9 +1790,7 @@ xfs_trans_commit_cil(
1790} 1790}
1791 1791
1792/* 1792/*
1793 * xfs_trans_commit 1793 * Commit the given transaction to the log.
1794 *
1795 * Commit the given transaction to the log a/synchronously.
1796 * 1794 *
1797 * XFS disk error handling mechanism is not based on a typical 1795 * XFS disk error handling mechanism is not based on a typical
1798 * transaction abort mechanism. Logically after the filesystem 1796 * transaction abort mechanism. Logically after the filesystem
@@ -1804,10 +1802,9 @@ xfs_trans_commit_cil(
1804 * Do not reference the transaction structure after this call. 1802 * Do not reference the transaction structure after this call.
1805 */ 1803 */
1806int 1804int
1807_xfs_trans_commit( 1805xfs_trans_commit(
1808 struct xfs_trans *tp, 1806 struct xfs_trans *tp,
1809 uint flags, 1807 uint flags)
1810 int *log_flushed)
1811{ 1808{
1812 struct xfs_mount *mp = tp->t_mountp; 1809 struct xfs_mount *mp = tp->t_mountp;
1813 xfs_lsn_t commit_lsn = -1; 1810 xfs_lsn_t commit_lsn = -1;
@@ -1866,7 +1863,7 @@ _xfs_trans_commit(
1866 if (sync) { 1863 if (sync) {
1867 if (!error) { 1864 if (!error) {
1868 error = _xfs_log_force_lsn(mp, commit_lsn, 1865 error = _xfs_log_force_lsn(mp, commit_lsn,
1869 XFS_LOG_SYNC, log_flushed); 1866 XFS_LOG_SYNC, NULL);
1870 } 1867 }
1871 XFS_STATS_INC(xs_trans_sync); 1868 XFS_STATS_INC(xs_trans_sync);
1872 } else { 1869 } else {
@@ -2021,6 +2018,6 @@ xfs_trans_roll(
2021 if (error) 2018 if (error)
2022 return error; 2019 return error;
2023 2020
2024 xfs_trans_ijoin(trans, dp); 2021 xfs_trans_ijoin(trans, dp, 0);
2025 return 0; 2022 return 0;
2026} 2023}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 53597f4db9b..3ae713c0abd 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -326,7 +326,7 @@ typedef struct xfs_log_item {
326 struct xfs_log_item *); 326 struct xfs_log_item *);
327 /* buffer item iodone */ 327 /* buffer item iodone */
328 /* callback func */ 328 /* callback func */
329 struct xfs_item_ops *li_ops; /* function list */ 329 const struct xfs_item_ops *li_ops; /* function list */
330 330
331 /* delayed logging */ 331 /* delayed logging */
332 struct list_head li_cil; /* CIL pointers */ 332 struct list_head li_cil; /* CIL pointers */
@@ -341,7 +341,7 @@ typedef struct xfs_log_item {
341 { XFS_LI_IN_AIL, "IN_AIL" }, \ 341 { XFS_LI_IN_AIL, "IN_AIL" }, \
342 { XFS_LI_ABORTED, "ABORTED" } 342 { XFS_LI_ABORTED, "ABORTED" }
343 343
344typedef struct xfs_item_ops { 344struct xfs_item_ops {
345 uint (*iop_size)(xfs_log_item_t *); 345 uint (*iop_size)(xfs_log_item_t *);
346 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); 346 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
347 void (*iop_pin)(xfs_log_item_t *); 347 void (*iop_pin)(xfs_log_item_t *);
@@ -352,7 +352,7 @@ typedef struct xfs_item_ops {
352 void (*iop_push)(xfs_log_item_t *); 352 void (*iop_push)(xfs_log_item_t *);
353 bool (*iop_pushbuf)(xfs_log_item_t *); 353 bool (*iop_pushbuf)(xfs_log_item_t *);
354 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); 354 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
355} xfs_item_ops_t; 355};
356 356
357#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) 357#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
358#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) 358#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
@@ -470,8 +470,7 @@ void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
470void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); 470void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); 471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
472void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); 472void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
473void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); 473void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
474void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
475void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); 474void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
476void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); 475void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
477struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); 476struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint);
@@ -487,10 +486,7 @@ void xfs_trans_log_efd_extent(xfs_trans_t *,
487 struct xfs_efd_log_item *, 486 struct xfs_efd_log_item *,
488 xfs_fsblock_t, 487 xfs_fsblock_t,
489 xfs_extlen_t); 488 xfs_extlen_t);
490int _xfs_trans_commit(xfs_trans_t *, 489int xfs_trans_commit(xfs_trans_t *, uint flags);
491 uint flags,
492 int *);
493#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL)
494void xfs_trans_cancel(xfs_trans_t *, int); 490void xfs_trans_cancel(xfs_trans_t *, int);
495int xfs_trans_ail_init(struct xfs_mount *); 491int xfs_trans_ail_init(struct xfs_mount *);
496void xfs_trans_ail_destroy(struct xfs_mount *); 492void xfs_trans_ail_destroy(struct xfs_mount *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 3a1e7ca54c2..ed9252bcdac 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -26,6 +26,7 @@
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_mount.h" 27#include "xfs_mount.h"
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_trace.h"
29#include "xfs_error.h" 30#include "xfs_error.h"
30 31
31#ifdef DEBUG 32#ifdef DEBUG
@@ -364,12 +365,24 @@ xfsaild_push(
364 xfs_lsn_t lsn; 365 xfs_lsn_t lsn;
365 xfs_lsn_t target; 366 xfs_lsn_t target;
366 long tout = 10; 367 long tout = 10;
367 int flush_log = 0;
368 int stuck = 0; 368 int stuck = 0;
369 int count = 0; 369 int count = 0;
370 int push_xfsbufd = 0; 370 int push_xfsbufd = 0;
371 371
372 /*
373 * If last time we ran we encountered pinned items, force the log first
374 * and wait for it before pushing again.
375 */
372 spin_lock(&ailp->xa_lock); 376 spin_lock(&ailp->xa_lock);
377 if (ailp->xa_last_pushed_lsn == 0 && ailp->xa_log_flush &&
378 !list_empty(&ailp->xa_ail)) {
379 ailp->xa_log_flush = 0;
380 spin_unlock(&ailp->xa_lock);
381 XFS_STATS_INC(xs_push_ail_flush);
382 xfs_log_force(mp, XFS_LOG_SYNC);
383 spin_lock(&ailp->xa_lock);
384 }
385
373 target = ailp->xa_target; 386 target = ailp->xa_target;
374 lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); 387 lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn);
375 if (!lip || XFS_FORCED_SHUTDOWN(mp)) { 388 if (!lip || XFS_FORCED_SHUTDOWN(mp)) {
@@ -413,16 +426,20 @@ xfsaild_push(
413 switch (lock_result) { 426 switch (lock_result) {
414 case XFS_ITEM_SUCCESS: 427 case XFS_ITEM_SUCCESS:
415 XFS_STATS_INC(xs_push_ail_success); 428 XFS_STATS_INC(xs_push_ail_success);
429 trace_xfs_ail_push(lip);
430
416 IOP_PUSH(lip); 431 IOP_PUSH(lip);
417 ailp->xa_last_pushed_lsn = lsn; 432 ailp->xa_last_pushed_lsn = lsn;
418 break; 433 break;
419 434
420 case XFS_ITEM_PUSHBUF: 435 case XFS_ITEM_PUSHBUF:
421 XFS_STATS_INC(xs_push_ail_pushbuf); 436 XFS_STATS_INC(xs_push_ail_pushbuf);
437 trace_xfs_ail_pushbuf(lip);
422 438
423 if (!IOP_PUSHBUF(lip)) { 439 if (!IOP_PUSHBUF(lip)) {
440 trace_xfs_ail_pushbuf_pinned(lip);
424 stuck++; 441 stuck++;
425 flush_log = 1; 442 ailp->xa_log_flush++;
426 } else { 443 } else {
427 ailp->xa_last_pushed_lsn = lsn; 444 ailp->xa_last_pushed_lsn = lsn;
428 } 445 }
@@ -431,12 +448,15 @@ xfsaild_push(
431 448
432 case XFS_ITEM_PINNED: 449 case XFS_ITEM_PINNED:
433 XFS_STATS_INC(xs_push_ail_pinned); 450 XFS_STATS_INC(xs_push_ail_pinned);
451 trace_xfs_ail_pinned(lip);
452
434 stuck++; 453 stuck++;
435 flush_log = 1; 454 ailp->xa_log_flush++;
436 break; 455 break;
437 456
438 case XFS_ITEM_LOCKED: 457 case XFS_ITEM_LOCKED:
439 XFS_STATS_INC(xs_push_ail_locked); 458 XFS_STATS_INC(xs_push_ail_locked);
459 trace_xfs_ail_locked(lip);
440 stuck++; 460 stuck++;
441 break; 461 break;
442 462
@@ -476,16 +496,6 @@ xfsaild_push(
476 xfs_trans_ail_cursor_done(ailp, &cur); 496 xfs_trans_ail_cursor_done(ailp, &cur);
477 spin_unlock(&ailp->xa_lock); 497 spin_unlock(&ailp->xa_lock);
478 498
479 if (flush_log) {
480 /*
481 * If something we need to push out was pinned, then
482 * push out the log so it will become unpinned and
483 * move forward in the AIL.
484 */
485 XFS_STATS_INC(xs_push_ail_flush);
486 xfs_log_force(mp, 0);
487 }
488
489 if (push_xfsbufd) { 499 if (push_xfsbufd) {
490 /* we've got delayed write buffers to flush */ 500 /* we've got delayed write buffers to flush */
491 wake_up_process(mp->m_ddev_targp->bt_task); 501 wake_up_process(mp->m_ddev_targp->bt_task);
@@ -496,6 +506,7 @@ out_done:
496 if (!count) { 506 if (!count) {
497 /* We're past our target or empty, so idle */ 507 /* We're past our target or empty, so idle */
498 ailp->xa_last_pushed_lsn = 0; 508 ailp->xa_last_pushed_lsn = 0;
509 ailp->xa_log_flush = 0;
499 510
500 tout = 50; 511 tout = 50;
501 } else if (XFS_LSN_CMP(lsn, target) >= 0) { 512 } else if (XFS_LSN_CMP(lsn, target) >= 0) {
@@ -514,9 +525,13 @@ out_done:
514 * were stuck. 525 * were stuck.
515 * 526 *
516 * Backoff a bit more to allow some I/O to complete before 527 * Backoff a bit more to allow some I/O to complete before
517 * continuing from where we were. 528 * restarting from the start of the AIL. This prevents us
529 * from spinning on the same items, and if they are pinned will
530 * all the restart to issue a log force to unpin the stuck
531 * items.
518 */ 532 */
519 tout = 20; 533 tout = 20;
534 ailp->xa_last_pushed_lsn = 0;
520 } 535 }
521 536
522 return tout; 537 return tout;
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 137e2b9e294..475a4ded4f4 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -160,8 +160,10 @@ xfs_trans_get_buf(xfs_trans_t *tp,
160 bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); 160 bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
161 if (bp != NULL) { 161 if (bp != NULL) {
162 ASSERT(xfs_buf_islocked(bp)); 162 ASSERT(xfs_buf_islocked(bp));
163 if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) 163 if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) {
164 XFS_BUF_SUPER_STALE(bp); 164 xfs_buf_stale(bp);
165 XFS_BUF_DONE(bp);
166 }
165 167
166 /* 168 /*
167 * If the buffer is stale then it was binval'ed 169 * If the buffer is stale then it was binval'ed
@@ -294,8 +296,7 @@ xfs_trans_read_buf(
294 296
295 if (bp->b_error) { 297 if (bp->b_error) {
296 error = bp->b_error; 298 error = bp->b_error;
297 xfs_ioerror_alert("xfs_trans_read_buf", mp, 299 xfs_buf_ioerror_alert(bp, __func__);
298 bp, blkno);
299 xfs_buf_relse(bp); 300 xfs_buf_relse(bp);
300 return error; 301 return error;
301 } 302 }
@@ -337,8 +338,7 @@ xfs_trans_read_buf(
337 xfsbdstrat(tp->t_mountp, bp); 338 xfsbdstrat(tp->t_mountp, bp);
338 error = xfs_buf_iowait(bp); 339 error = xfs_buf_iowait(bp);
339 if (error) { 340 if (error) {
340 xfs_ioerror_alert("xfs_trans_read_buf", mp, 341 xfs_buf_ioerror_alert(bp, __func__);
341 bp, blkno);
342 xfs_buf_relse(bp); 342 xfs_buf_relse(bp);
343 /* 343 /*
344 * We can gracefully recover from most read 344 * We can gracefully recover from most read
@@ -387,9 +387,9 @@ xfs_trans_read_buf(
387 } 387 }
388 if (bp->b_error) { 388 if (bp->b_error) {
389 error = bp->b_error; 389 error = bp->b_error;
390 XFS_BUF_SUPER_STALE(bp); 390 xfs_buf_stale(bp);
391 xfs_ioerror_alert("xfs_trans_read_buf", mp, 391 XFS_BUF_DONE(bp);
392 bp, blkno); 392 xfs_buf_ioerror_alert(bp, __func__);
393 if (tp->t_flags & XFS_TRANS_DIRTY) 393 if (tp->t_flags & XFS_TRANS_DIRTY)
394 xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); 394 xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
395 xfs_buf_relse(bp); 395 xfs_buf_relse(bp);
@@ -643,13 +643,14 @@ xfs_trans_log_buf(xfs_trans_t *tp,
643 * inside the b_bdstrat callback so that this won't get written to 643 * inside the b_bdstrat callback so that this won't get written to
644 * disk. 644 * disk.
645 */ 645 */
646 XFS_BUF_DELAYWRITE(bp);
647 XFS_BUF_DONE(bp); 646 XFS_BUF_DONE(bp);
648 647
649 ASSERT(atomic_read(&bip->bli_refcount) > 0); 648 ASSERT(atomic_read(&bip->bli_refcount) > 0);
650 bp->b_iodone = xfs_buf_iodone_callbacks; 649 bp->b_iodone = xfs_buf_iodone_callbacks;
651 bip->bli_item.li_cb = xfs_buf_iodone; 650 bip->bli_item.li_cb = xfs_buf_iodone;
652 651
652 xfs_buf_delwri_queue(bp);
653
653 trace_xfs_trans_log_buf(bip); 654 trace_xfs_trans_log_buf(bip);
654 655
655 /* 656 /*
@@ -738,8 +739,7 @@ xfs_trans_binval(
738 * We set the stale bit in the buffer as well since we're getting 739 * We set the stale bit in the buffer as well since we're getting
739 * rid of it. 740 * rid of it.
740 */ 741 */
741 XFS_BUF_UNDELAYWRITE(bp); 742 xfs_buf_stale(bp);
742 XFS_BUF_STALE(bp);
743 bip->bli_flags |= XFS_BLI_STALE; 743 bip->bli_flags |= XFS_BLI_STALE;
744 bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); 744 bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
745 bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; 745 bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index c8dea2fd7e6..32f0288ae10 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -47,11 +47,13 @@ xfs_trans_inode_broot_debug(
47 * Add a locked inode to the transaction. 47 * Add a locked inode to the transaction.
48 * 48 *
49 * The inode must be locked, and it cannot be associated with any transaction. 49 * The inode must be locked, and it cannot be associated with any transaction.
50 * If lock_flags is non-zero the inode will be unlocked on transaction commit.
50 */ 51 */
51void 52void
52xfs_trans_ijoin( 53xfs_trans_ijoin(
53 struct xfs_trans *tp, 54 struct xfs_trans *tp,
54 struct xfs_inode *ip) 55 struct xfs_inode *ip,
56 uint lock_flags)
55{ 57{
56 xfs_inode_log_item_t *iip; 58 xfs_inode_log_item_t *iip;
57 59
@@ -59,7 +61,9 @@ xfs_trans_ijoin(
59 if (ip->i_itemp == NULL) 61 if (ip->i_itemp == NULL)
60 xfs_inode_item_init(ip, ip->i_mount); 62 xfs_inode_item_init(ip, ip->i_mount);
61 iip = ip->i_itemp; 63 iip = ip->i_itemp;
64
62 ASSERT(iip->ili_lock_flags == 0); 65 ASSERT(iip->ili_lock_flags == 0);
66 iip->ili_lock_flags = lock_flags;
63 67
64 /* 68 /*
65 * Get a log_item_desc to point at the new item. 69 * Get a log_item_desc to point at the new item.
@@ -70,25 +74,6 @@ xfs_trans_ijoin(
70} 74}
71 75
72/* 76/*
73 * Add a locked inode to the transaction.
74 *
75 *
76 * Grabs a reference to the inode which will be dropped when the transaction
77 * is committed. The inode will also be unlocked at that point. The inode
78 * must be locked, and it cannot be associated with any transaction.
79 */
80void
81xfs_trans_ijoin_ref(
82 struct xfs_trans *tp,
83 struct xfs_inode *ip,
84 uint lock_flags)
85{
86 xfs_trans_ijoin(tp, ip);
87 IHOLD(ip);
88 ip->i_itemp->ili_lock_flags = lock_flags;
89}
90
91/*
92 * Transactional inode timestamp update. Requires the inode to be locked and 77 * Transactional inode timestamp update. Requires the inode to be locked and
93 * joined to the transaction supplied. Relies on the transaction subsystem to 78 * joined to the transaction supplied. Relies on the transaction subsystem to
94 * track dirty state and update/writeback the inode accordingly. 79 * track dirty state and update/writeback the inode accordingly.
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 22750b5e4a8..44820b9fcb4 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -70,6 +70,7 @@ struct xfs_ail {
70 struct list_head xa_cursors; 70 struct list_head xa_cursors;
71 spinlock_t xa_lock; 71 spinlock_t xa_lock;
72 xfs_lsn_t xa_last_pushed_lsn; 72 xfs_lsn_t xa_last_pushed_lsn;
73 int xa_log_flush;
73}; 74};
74 75
75/* 76/*
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 51fc429527b..ce9268a2f56 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -72,8 +72,8 @@ xfs_readlink_bmap(
72 xfs_buf_t *bp; 72 xfs_buf_t *bp;
73 int error = 0; 73 int error = 0;
74 74
75 error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0, 75 error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, pathlen), mval, &nmaps,
76 mval, &nmaps, NULL); 76 0);
77 if (error) 77 if (error)
78 goto out; 78 goto out;
79 79
@@ -87,8 +87,7 @@ xfs_readlink_bmap(
87 return XFS_ERROR(ENOMEM); 87 return XFS_ERROR(ENOMEM);
88 error = bp->b_error; 88 error = bp->b_error;
89 if (error) { 89 if (error) {
90 xfs_ioerror_alert("xfs_readlink", 90 xfs_buf_ioerror_alert(bp, __func__);
91 ip->i_mount, bp, XFS_BUF_ADDR(bp));
92 xfs_buf_relse(bp); 91 xfs_buf_relse(bp);
93 goto out; 92 goto out;
94 } 93 }
@@ -113,7 +112,7 @@ xfs_readlink(
113 char *link) 112 char *link)
114{ 113{
115 xfs_mount_t *mp = ip->i_mount; 114 xfs_mount_t *mp = ip->i_mount;
116 int pathlen; 115 xfs_fsize_t pathlen;
117 int error = 0; 116 int error = 0;
118 117
119 trace_xfs_readlink(ip); 118 trace_xfs_readlink(ip);
@@ -123,13 +122,19 @@ xfs_readlink(
123 122
124 xfs_ilock(ip, XFS_ILOCK_SHARED); 123 xfs_ilock(ip, XFS_ILOCK_SHARED);
125 124
126 ASSERT(S_ISLNK(ip->i_d.di_mode));
127 ASSERT(ip->i_d.di_size <= MAXPATHLEN);
128
129 pathlen = ip->i_d.di_size; 125 pathlen = ip->i_d.di_size;
130 if (!pathlen) 126 if (!pathlen)
131 goto out; 127 goto out;
132 128
129 if (pathlen < 0 || pathlen > MAXPATHLEN) {
130 xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
131 __func__, (unsigned long long) ip->i_ino,
132 (long long) pathlen);
133 ASSERT(0);
134 return XFS_ERROR(EFSCORRUPTED);
135 }
136
137
133 if (ip->i_df.if_flags & XFS_IFINLINE) { 138 if (ip->i_df.if_flags & XFS_IFINLINE) {
134 memcpy(link, ip->i_df.if_u1.if_data, pathlen); 139 memcpy(link, ip->i_df.if_u1.if_data, pathlen);
135 link[pathlen] = '\0'; 140 link[pathlen] = '\0';
@@ -178,8 +183,7 @@ xfs_free_eofblocks(
178 183
179 nimaps = 1; 184 nimaps = 1;
180 xfs_ilock(ip, XFS_ILOCK_SHARED); 185 xfs_ilock(ip, XFS_ILOCK_SHARED);
181 error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0, 186 error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
182 NULL, 0, &imap, &nimaps, NULL);
183 xfs_iunlock(ip, XFS_ILOCK_SHARED); 187 xfs_iunlock(ip, XFS_ILOCK_SHARED);
184 188
185 if (!error && (nimaps != 0) && 189 if (!error && (nimaps != 0) &&
@@ -220,7 +224,7 @@ xfs_free_eofblocks(
220 } 224 }
221 225
222 xfs_ilock(ip, XFS_ILOCK_EXCL); 226 xfs_ilock(ip, XFS_ILOCK_EXCL);
223 xfs_trans_ijoin(tp, ip); 227 xfs_trans_ijoin(tp, ip, 0);
224 228
225 error = xfs_itruncate_data(&tp, ip, ip->i_size); 229 error = xfs_itruncate_data(&tp, ip, ip->i_size);
226 if (error) { 230 if (error) {
@@ -289,7 +293,7 @@ xfs_inactive_symlink_rmt(
289 xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); 293 xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
290 size = (int)ip->i_d.di_size; 294 size = (int)ip->i_d.di_size;
291 ip->i_d.di_size = 0; 295 ip->i_d.di_size = 0;
292 xfs_trans_ijoin(tp, ip); 296 xfs_trans_ijoin(tp, ip, 0);
293 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 297 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
294 /* 298 /*
295 * Find the block(s) so we can inval and unmap them. 299 * Find the block(s) so we can inval and unmap them.
@@ -297,9 +301,9 @@ xfs_inactive_symlink_rmt(
297 done = 0; 301 done = 0;
298 xfs_bmap_init(&free_list, &first_block); 302 xfs_bmap_init(&free_list, &first_block);
299 nmaps = ARRAY_SIZE(mval); 303 nmaps = ARRAY_SIZE(mval);
300 if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size), 304 error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, size),
301 XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps, 305 mval, &nmaps, 0);
302 &free_list))) 306 if (error)
303 goto error0; 307 goto error0;
304 /* 308 /*
305 * Invalidate the block(s). 309 * Invalidate the block(s).
@@ -308,6 +312,10 @@ xfs_inactive_symlink_rmt(
308 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, 312 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
309 XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), 313 XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
310 XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); 314 XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
315 if (!bp) {
316 error = ENOMEM;
317 goto error1;
318 }
311 xfs_trans_binval(tp, bp); 319 xfs_trans_binval(tp, bp);
312 } 320 }
313 /* 321 /*
@@ -333,7 +341,7 @@ xfs_inactive_symlink_rmt(
333 * Mark it dirty so it will be logged and moved forward in the log as 341 * Mark it dirty so it will be logged and moved forward in the log as
334 * part of every commit. 342 * part of every commit.
335 */ 343 */
336 xfs_trans_ijoin(tp, ip); 344 xfs_trans_ijoin(tp, ip, 0);
337 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 345 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
338 /* 346 /*
339 * Get a new, empty transaction to return to our caller. 347 * Get a new, empty transaction to return to our caller.
@@ -466,7 +474,7 @@ xfs_inactive_attrs(
466 goto error_cancel; 474 goto error_cancel;
467 475
468 xfs_ilock(ip, XFS_ILOCK_EXCL); 476 xfs_ilock(ip, XFS_ILOCK_EXCL);
469 xfs_trans_ijoin(tp, ip); 477 xfs_trans_ijoin(tp, ip, 0);
470 xfs_idestroy_fork(ip, XFS_ATTR_FORK); 478 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
471 479
472 ASSERT(ip->i_d.di_anextents == 0); 480 ASSERT(ip->i_d.di_anextents == 0);
@@ -647,8 +655,6 @@ xfs_inactive(
647 if (truncate) { 655 if (truncate) {
648 xfs_ilock(ip, XFS_IOLOCK_EXCL); 656 xfs_ilock(ip, XFS_IOLOCK_EXCL);
649 657
650 xfs_ioend_wait(ip);
651
652 error = xfs_trans_reserve(tp, 0, 658 error = xfs_trans_reserve(tp, 0,
653 XFS_ITRUNCATE_LOG_RES(mp), 659 XFS_ITRUNCATE_LOG_RES(mp),
654 0, XFS_TRANS_PERM_LOG_RES, 660 0, XFS_TRANS_PERM_LOG_RES,
@@ -662,7 +668,7 @@ xfs_inactive(
662 } 668 }
663 669
664 xfs_ilock(ip, XFS_ILOCK_EXCL); 670 xfs_ilock(ip, XFS_ILOCK_EXCL);
665 xfs_trans_ijoin(tp, ip); 671 xfs_trans_ijoin(tp, ip, 0);
666 672
667 error = xfs_itruncate_data(&tp, ip, 0); 673 error = xfs_itruncate_data(&tp, ip, 0);
668 if (error) { 674 if (error) {
@@ -686,7 +692,7 @@ xfs_inactive(
686 return VN_INACTIVE_CACHE; 692 return VN_INACTIVE_CACHE;
687 } 693 }
688 694
689 xfs_trans_ijoin(tp, ip); 695 xfs_trans_ijoin(tp, ip, 0);
690 } else { 696 } else {
691 error = xfs_trans_reserve(tp, 0, 697 error = xfs_trans_reserve(tp, 0,
692 XFS_IFREE_LOG_RES(mp), 698 XFS_IFREE_LOG_RES(mp),
@@ -699,7 +705,7 @@ xfs_inactive(
699 } 705 }
700 706
701 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 707 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
702 xfs_trans_ijoin(tp, ip); 708 xfs_trans_ijoin(tp, ip, 0);
703 } 709 }
704 710
705 /* 711 /*
@@ -939,7 +945,7 @@ xfs_create(
939 * the transaction cancel unlocking dp so don't do it explicitly in the 945 * the transaction cancel unlocking dp so don't do it explicitly in the
940 * error path. 946 * error path.
941 */ 947 */
942 xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); 948 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
943 unlock_dp_on_error = B_FALSE; 949 unlock_dp_on_error = B_FALSE;
944 950
945 error = xfs_dir_createname(tp, dp, name, ip->i_ino, 951 error = xfs_dir_createname(tp, dp, name, ip->i_ino,
@@ -1260,8 +1266,8 @@ xfs_remove(
1260 1266
1261 xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); 1267 xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
1262 1268
1263 xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); 1269 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1264 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); 1270 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1265 1271
1266 /* 1272 /*
1267 * If we're removing a directory perform some additional validation. 1273 * If we're removing a directory perform some additional validation.
@@ -1406,8 +1412,8 @@ xfs_link(
1406 1412
1407 xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); 1413 xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
1408 1414
1409 xfs_trans_ijoin_ref(tp, sip, XFS_ILOCK_EXCL); 1415 xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
1410 xfs_trans_ijoin_ref(tp, tdp, XFS_ILOCK_EXCL); 1416 xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
1411 1417
1412 /* 1418 /*
1413 * If the source has too many links, we can't make any more to it. 1419 * If the source has too many links, we can't make any more to it.
@@ -1601,7 +1607,7 @@ xfs_symlink(
1601 * transaction cancel unlocking dp so don't do it explicitly in the 1607 * transaction cancel unlocking dp so don't do it explicitly in the
1602 * error path. 1608 * error path.
1603 */ 1609 */
1604 xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); 1610 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1605 unlock_dp_on_error = B_FALSE; 1611 unlock_dp_on_error = B_FALSE;
1606 1612
1607 /* 1613 /*
@@ -1632,10 +1638,9 @@ xfs_symlink(
1632 first_fsb = 0; 1638 first_fsb = 0;
1633 nmaps = SYMLINK_MAPS; 1639 nmaps = SYMLINK_MAPS;
1634 1640
1635 error = xfs_bmapi(tp, ip, first_fsb, fs_blocks, 1641 error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
1636 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, 1642 XFS_BMAPI_METADATA, &first_block, resblks,
1637 &first_block, resblks, mval, &nmaps, 1643 mval, &nmaps, &free_list);
1638 &free_list);
1639 if (error) 1644 if (error)
1640 goto error2; 1645 goto error2;
1641 1646
@@ -1650,7 +1655,10 @@ xfs_symlink(
1650 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); 1655 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
1651 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, 1656 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
1652 BTOBB(byte_cnt), 0); 1657 BTOBB(byte_cnt), 0);
1653 ASSERT(!xfs_buf_geterror(bp)); 1658 if (!bp) {
1659 error = ENOMEM;
1660 goto error2;
1661 }
1654 if (pathlen < byte_cnt) { 1662 if (pathlen < byte_cnt) {
1655 byte_cnt = pathlen; 1663 byte_cnt = pathlen;
1656 } 1664 }
@@ -1732,7 +1740,7 @@ xfs_set_dmattrs(
1732 return error; 1740 return error;
1733 } 1741 }
1734 xfs_ilock(ip, XFS_ILOCK_EXCL); 1742 xfs_ilock(ip, XFS_ILOCK_EXCL);
1735 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); 1743 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1736 1744
1737 ip->i_d.di_dmevmask = evmask; 1745 ip->i_d.di_dmevmask = evmask;
1738 ip->i_d.di_dmstate = state; 1746 ip->i_d.di_dmstate = state;
@@ -1778,7 +1786,6 @@ xfs_alloc_file_space(
1778 xfs_fileoff_t startoffset_fsb; 1786 xfs_fileoff_t startoffset_fsb;
1779 xfs_fsblock_t firstfsb; 1787 xfs_fsblock_t firstfsb;
1780 int nimaps; 1788 int nimaps;
1781 int bmapi_flag;
1782 int quota_flag; 1789 int quota_flag;
1783 int rt; 1790 int rt;
1784 xfs_trans_t *tp; 1791 xfs_trans_t *tp;
@@ -1806,7 +1813,6 @@ xfs_alloc_file_space(
1806 count = len; 1813 count = len;
1807 imapp = &imaps[0]; 1814 imapp = &imaps[0];
1808 nimaps = 1; 1815 nimaps = 1;
1809 bmapi_flag = XFS_BMAPI_WRITE | alloc_type;
1810 startoffset_fsb = XFS_B_TO_FSBT(mp, offset); 1816 startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
1811 allocatesize_fsb = XFS_B_TO_FSB(mp, count); 1817 allocatesize_fsb = XFS_B_TO_FSB(mp, count);
1812 1818
@@ -1877,16 +1883,12 @@ xfs_alloc_file_space(
1877 if (error) 1883 if (error)
1878 goto error1; 1884 goto error1;
1879 1885
1880 xfs_trans_ijoin(tp, ip); 1886 xfs_trans_ijoin(tp, ip, 0);
1881 1887
1882 /*
1883 * Issue the xfs_bmapi() call to allocate the blocks
1884 */
1885 xfs_bmap_init(&free_list, &firstfsb); 1888 xfs_bmap_init(&free_list, &firstfsb);
1886 error = xfs_bmapi(tp, ip, startoffset_fsb, 1889 error = xfs_bmapi_write(tp, ip, startoffset_fsb,
1887 allocatesize_fsb, bmapi_flag, 1890 allocatesize_fsb, alloc_type, &firstfsb,
1888 &firstfsb, 0, imapp, &nimaps, 1891 0, imapp, &nimaps, &free_list);
1889 &free_list);
1890 if (error) { 1892 if (error) {
1891 goto error0; 1893 goto error0;
1892 } 1894 }
@@ -1976,8 +1978,7 @@ xfs_zero_remaining_bytes(
1976 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { 1978 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
1977 offset_fsb = XFS_B_TO_FSBT(mp, offset); 1979 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1978 nimap = 1; 1980 nimap = 1;
1979 error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, 1981 error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
1980 NULL, 0, &imap, &nimap, NULL);
1981 if (error || nimap < 1) 1982 if (error || nimap < 1)
1982 break; 1983 break;
1983 ASSERT(imap.br_blockcount >= 1); 1984 ASSERT(imap.br_blockcount >= 1);
@@ -1997,8 +1998,8 @@ xfs_zero_remaining_bytes(
1997 xfsbdstrat(mp, bp); 1998 xfsbdstrat(mp, bp);
1998 error = xfs_buf_iowait(bp); 1999 error = xfs_buf_iowait(bp);
1999 if (error) { 2000 if (error) {
2000 xfs_ioerror_alert("xfs_zero_remaining_bytes(read)", 2001 xfs_buf_ioerror_alert(bp,
2001 mp, bp, XFS_BUF_ADDR(bp)); 2002 "xfs_zero_remaining_bytes(read)");
2002 break; 2003 break;
2003 } 2004 }
2004 memset(bp->b_addr + 2005 memset(bp->b_addr +
@@ -2010,8 +2011,8 @@ xfs_zero_remaining_bytes(
2010 xfsbdstrat(mp, bp); 2011 xfsbdstrat(mp, bp);
2011 error = xfs_buf_iowait(bp); 2012 error = xfs_buf_iowait(bp);
2012 if (error) { 2013 if (error) {
2013 xfs_ioerror_alert("xfs_zero_remaining_bytes(write)", 2014 xfs_buf_ioerror_alert(bp,
2014 mp, bp, XFS_BUF_ADDR(bp)); 2015 "xfs_zero_remaining_bytes(write)");
2015 break; 2016 break;
2016 } 2017 }
2017 } 2018 }
@@ -2076,7 +2077,7 @@ xfs_free_file_space(
2076 if (need_iolock) { 2077 if (need_iolock) {
2077 xfs_ilock(ip, XFS_IOLOCK_EXCL); 2078 xfs_ilock(ip, XFS_IOLOCK_EXCL);
2078 /* wait for the completion of any pending DIOs */ 2079 /* wait for the completion of any pending DIOs */
2079 xfs_ioend_wait(ip); 2080 inode_dio_wait(VFS_I(ip));
2080 } 2081 }
2081 2082
2082 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); 2083 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
@@ -2096,8 +2097,8 @@ xfs_free_file_space(
2096 */ 2097 */
2097 if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { 2098 if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
2098 nimap = 1; 2099 nimap = 1;
2099 error = xfs_bmapi(NULL, ip, startoffset_fsb, 2100 error = xfs_bmapi_read(ip, startoffset_fsb, 1,
2100 1, 0, NULL, 0, &imap, &nimap, NULL); 2101 &imap, &nimap, 0);
2101 if (error) 2102 if (error)
2102 goto out_unlock_iolock; 2103 goto out_unlock_iolock;
2103 ASSERT(nimap == 0 || nimap == 1); 2104 ASSERT(nimap == 0 || nimap == 1);
@@ -2111,8 +2112,8 @@ xfs_free_file_space(
2111 startoffset_fsb += mp->m_sb.sb_rextsize - mod; 2112 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
2112 } 2113 }
2113 nimap = 1; 2114 nimap = 1;
2114 error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, 2115 error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
2115 1, 0, NULL, 0, &imap, &nimap, NULL); 2116 &imap, &nimap, 0);
2116 if (error) 2117 if (error)
2117 goto out_unlock_iolock; 2118 goto out_unlock_iolock;
2118 ASSERT(nimap == 0 || nimap == 1); 2119 ASSERT(nimap == 0 || nimap == 1);
@@ -2180,7 +2181,7 @@ xfs_free_file_space(
2180 if (error) 2181 if (error)
2181 goto error1; 2182 goto error1;
2182 2183
2183 xfs_trans_ijoin(tp, ip); 2184 xfs_trans_ijoin(tp, ip, 0);
2184 2185
2185 /* 2186 /*
2186 * issue the bunmapi() call to free the blocks 2187 * issue the bunmapi() call to free the blocks
@@ -2353,8 +2354,7 @@ xfs_change_file_space(
2353 } 2354 }
2354 2355
2355 xfs_ilock(ip, XFS_ILOCK_EXCL); 2356 xfs_ilock(ip, XFS_ILOCK_EXCL);
2356 2357 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2357 xfs_trans_ijoin(tp, ip);
2358 2358
2359 if ((attr_flags & XFS_ATTR_DMI) == 0) { 2359 if ((attr_flags & XFS_ATTR_DMI) == 0) {
2360 ip->i_d.di_mode &= ~S_ISUID; 2360 ip->i_d.di_mode &= ~S_ISUID;
@@ -2379,10 +2379,5 @@ xfs_change_file_space(
2379 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2379 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2380 if (attr_flags & XFS_ATTR_SYNC) 2380 if (attr_flags & XFS_ATTR_SYNC)
2381 xfs_trans_set_sync(tp); 2381 xfs_trans_set_sync(tp);
2382 2382 return xfs_trans_commit(tp, 0);
2383 error = xfs_trans_commit(tp, 0);
2384
2385 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2386
2387 return error;
2388} 2383}