aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorH. Peter Anvin <hpa@linux.intel.com>2012-10-19 10:54:24 -0400
committerH. Peter Anvin <hpa@linux.intel.com>2012-10-19 10:55:09 -0400
commit4533d86270d7986e00594495dde9a109d6be27ae (patch)
treec2473cac653f7b98e5bd5e6475e63734be4b7644 /fs
parent21c5e50e15b1abd797e62f18fd7f90b9cc004cbd (diff)
parent5bc66170dc486556a1e36fd384463536573f4b82 (diff)
Merge commit '5bc66170dc486556a1e36fd384463536573f4b82' into x86/urgent
From Borislav Petkov <bp@amd64.org>: Below is a RAS fix which reverts the addition of a sysfs attribute which we agreed is not needed, post-factum. And this should go in now because that sysfs attribute is going to end up in 3.7 otherwise and thus exposed to userspace; removing it then would be a lot harder. This is done as a merge rather than a simple patch/cherry-pick since the baseline for this patch was not in the previous x86/urgent. Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/acl.c8
-rw-r--r--fs/9p/v9fs.c35
-rw-r--r--fs/9p/vfs_file.c1
-rw-r--r--fs/9p/vfs_inode.c8
-rw-r--r--fs/Kconfig.binfmt8
-rw-r--r--fs/Makefile1
-rw-r--r--fs/adfs/adfs.h4
-rw-r--r--fs/adfs/inode.c4
-rw-r--r--fs/adfs/super.c26
-rw-r--r--fs/affs/affs.h4
-rw-r--r--fs/affs/inode.c20
-rw-r--r--fs/affs/super.c25
-rw-r--r--fs/afs/callback.c4
-rw-r--r--fs/afs/server.c10
-rw-r--r--fs/afs/super.c5
-rw-r--r--fs/afs/vlocation.c14
-rw-r--r--fs/attr.c2
-rw-r--r--fs/autofs4/dev-ioctl.c18
-rw-r--r--fs/autofs4/root.c6
-rw-r--r--fs/autofs4/waitq.c3
-rw-r--r--fs/befs/befs.h4
-rw-r--r--fs/befs/linuxvfs.c32
-rw-r--r--fs/bfs/inode.c13
-rw-r--r--fs/binfmt_aout.c54
-rw-r--r--fs/binfmt_elf.c173
-rw-r--r--fs/binfmt_elf_fdpic.c9
-rw-r--r--fs/binfmt_flat.c2
-rw-r--r--fs/bio-integrity.c44
-rw-r--r--fs/bio.c231
-rw-r--r--fs/block_dev.c68
-rw-r--r--fs/btrfs/acl.c8
-rw-r--r--fs/btrfs/backref.c299
-rw-r--r--fs/btrfs/backref.h10
-rw-r--r--fs/btrfs/btrfs_inode.h15
-rw-r--r--fs/btrfs/check-integrity.c16
-rw-r--r--fs/btrfs/compression.c13
-rw-r--r--fs/btrfs/ctree.c148
-rw-r--r--fs/btrfs/ctree.h111
-rw-r--r--fs/btrfs/delayed-inode.c14
-rw-r--r--fs/btrfs/delayed-ref.h2
-rw-r--r--fs/btrfs/disk-io.c230
-rw-r--r--fs/btrfs/disk-io.h2
-rw-r--r--fs/btrfs/extent-tree.c376
-rw-r--r--fs/btrfs/extent_io.c134
-rw-r--r--fs/btrfs/extent_io.h23
-rw-r--r--fs/btrfs/extent_map.c55
-rw-r--r--fs/btrfs/extent_map.h8
-rw-r--r--fs/btrfs/file-item.c5
-rw-r--r--fs/btrfs/file.c449
-rw-r--r--fs/btrfs/free-space-cache.c10
-rw-r--r--fs/btrfs/hash.h10
-rw-r--r--fs/btrfs/inode-item.c285
-rw-r--r--fs/btrfs/inode.c403
-rw-r--r--fs/btrfs/ioctl.c140
-rw-r--r--fs/btrfs/ordered-data.c97
-rw-r--r--fs/btrfs/ordered-data.h12
-rw-r--r--fs/btrfs/qgroup.c48
-rw-r--r--fs/btrfs/reada.c18
-rw-r--r--fs/btrfs/relocation.c11
-rw-r--r--fs/btrfs/root-tree.c29
-rw-r--r--fs/btrfs/scrub.c30
-rw-r--r--fs/btrfs/send.c915
-rw-r--r--fs/btrfs/send.h1
-rw-r--r--fs/btrfs/super.c74
-rw-r--r--fs/btrfs/transaction.c283
-rw-r--r--fs/btrfs/transaction.h20
-rw-r--r--fs/btrfs/tree-log.c889
-rw-r--r--fs/btrfs/ulist.c7
-rw-r--r--fs/btrfs/ulist.h9
-rw-r--r--fs/btrfs/volumes.c73
-rw-r--r--fs/btrfs/zlib.c8
-rw-r--r--fs/buffer.c13
-rw-r--r--fs/ceph/addr.c21
-rw-r--r--fs/ceph/caps.c2
-rw-r--r--fs/ceph/export.c18
-rw-r--r--fs/ceph/file.c4
-rw-r--r--fs/ceph/inode.c4
-rw-r--r--fs/ceph/ioctl.c8
-rw-r--r--fs/ceph/mds_client.c3
-rw-r--r--fs/ceph/super.c42
-rw-r--r--fs/cifs/Kconfig38
-rw-r--r--fs/cifs/Makefile2
-rw-r--r--fs/cifs/cifs_spnego.c6
-rw-r--r--fs/cifs/cifs_unicode.c24
-rw-r--r--fs/cifs/cifsacl.c10
-rw-r--r--fs/cifs/cifsencrypt.c67
-rw-r--r--fs/cifs/cifsfs.c68
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h268
-rw-r--r--fs/cifs/cifspdu.h2
-rw-r--r--fs/cifs/cifsproto.h136
-rw-r--r--fs/cifs/cifssmb.c289
-rw-r--r--fs/cifs/connect.c207
-rw-r--r--fs/cifs/dir.c124
-rw-r--r--fs/cifs/file.c1058
-rw-r--r--fs/cifs/inode.c441
-rw-r--r--fs/cifs/ioctl.c32
-rw-r--r--fs/cifs/link.c74
-rw-r--r--fs/cifs/misc.c32
-rw-r--r--fs/cifs/netmisc.c9
-rw-r--r--fs/cifs/readdir.c167
-rw-r--r--fs/cifs/sess.c3
-rw-r--r--fs/cifs/smb1ops.c328
-rw-r--r--fs/cifs/smb2file.c302
-rw-r--r--fs/cifs/smb2glob.h14
-rw-r--r--fs/cifs/smb2inode.c98
-rw-r--r--fs/cifs/smb2maperror.c6
-rw-r--r--fs/cifs/smb2misc.c240
-rw-r--r--fs/cifs/smb2ops.c346
-rw-r--r--fs/cifs/smb2pdu.c1235
-rw-r--r--fs/cifs/smb2pdu.h288
-rw-r--r--fs/cifs/smb2proto.h87
-rw-r--r--fs/cifs/smb2transport.c204
-rw-r--r--fs/cifs/transport.c287
-rw-r--r--fs/coda/inode.c37
-rw-r--r--fs/compat.c124
-rw-r--r--fs/compat_binfmt_elf.c7
-rw-r--r--fs/compat_ioctl.c35
-rw-r--r--fs/configfs/inode.c4
-rw-r--r--fs/coredump.c693
-rw-r--r--fs/coredump.h6
-rw-r--r--fs/cramfs/inode.c4
-rw-r--r--fs/dcache.c14
-rw-r--r--fs/debugfs/file.c76
-rw-r--r--fs/debugfs/inode.c34
-rw-r--r--fs/dlm/ast.c4
-rw-r--r--fs/dlm/config.c79
-rw-r--r--fs/dlm/config.h2
-rw-r--r--fs/dlm/dlm_internal.h46
-rw-r--r--fs/dlm/lockspace.c15
-rw-r--r--fs/dlm/lowcomms.c215
-rw-r--r--fs/dlm/lowcomms.h2
-rw-r--r--fs/dlm/main.c2
-rw-r--r--fs/dlm/member.c17
-rw-r--r--fs/dlm/netlink.c8
-rw-r--r--fs/dlm/rcom.c2
-rw-r--r--fs/dlm/recoverd.c27
-rw-r--r--fs/dlm/recoverd.h1
-rw-r--r--fs/dlm/user.c7
-rw-r--r--fs/ecryptfs/file.c10
-rw-r--r--fs/ecryptfs/inode.c5
-rw-r--r--fs/ecryptfs/main.c12
-rw-r--r--fs/ecryptfs/messaging.c5
-rw-r--r--fs/efs/inode.c4
-rw-r--r--fs/efs/super.c5
-rw-r--r--fs/eventpoll.c61
-rw-r--r--fs/exec.c750
-rw-r--r--fs/exofs/inode.c8
-rw-r--r--fs/exofs/ore.c5
-rw-r--r--fs/exofs/ore_raid.c2
-rw-r--r--fs/exofs/super.c9
-rw-r--r--fs/exofs/sys.c7
-rw-r--r--fs/ext2/acl.c32
-rw-r--r--fs/ext2/balloc.c2
-rw-r--r--fs/ext2/super.c9
-rw-r--r--fs/ext3/acl.c32
-rw-r--r--fs/ext3/balloc.c2
-rw-r--r--fs/ext3/inode.c19
-rw-r--r--fs/ext3/namei.c40
-rw-r--r--fs/ext3/namei.h19
-rw-r--r--fs/ext3/super.c25
-rw-r--r--fs/ext4/acl.c31
-rw-r--r--fs/ext4/ext4.h49
-rw-r--r--fs/ext4/extents.c258
-rw-r--r--fs/ext4/file.c8
-rw-r--r--fs/ext4/fsync.c92
-rw-r--r--fs/ext4/ialloc.c9
-rw-r--r--fs/ext4/indirect.c18
-rw-r--r--fs/ext4/inode.c97
-rw-r--r--fs/ext4/ioctl.c37
-rw-r--r--fs/ext4/mballoc.c131
-rw-r--r--fs/ext4/mballoc.h5
-rw-r--r--fs/ext4/move_extent.c520
-rw-r--r--fs/ext4/namei.c105
-rw-r--r--fs/ext4/page-io.c176
-rw-r--r--fs/ext4/resize.c432
-rw-r--r--fs/ext4/super.c99
-rw-r--r--fs/fat/Makefile2
-rw-r--r--fs/fat/cache.c10
-rw-r--r--fs/fat/dir.c60
-rw-r--r--fs/fat/fat.h98
-rw-r--r--fs/fat/fatent.c13
-rw-r--r--fs/fat/file.c6
-rw-r--r--fs/fat/inode.c215
-rw-r--r--fs/fat/namei_msdos.c33
-rw-r--r--fs/fat/namei_vfat.c35
-rw-r--r--fs/fat/nfs.c101
-rw-r--r--fs/fcntl.c166
-rw-r--r--fs/fhandle.c17
-rw-r--r--fs/file.c576
-rw-r--r--fs/file_table.c110
-rw-r--r--fs/filesystems.c4
-rw-r--r--fs/freevxfs/vxfs_inode.c4
-rw-r--r--fs/freevxfs/vxfs_super.c5
-rw-r--r--fs/fs-writeback.c14
-rw-r--r--fs/fuse/control.c4
-rw-r--r--fs/fuse/cuse.c4
-rw-r--r--fs/fuse/dev.c4
-rw-r--r--fs/fuse/file.c1
-rw-r--r--fs/fuse/inode.c18
-rw-r--r--fs/generic_acl.c4
-rw-r--r--fs/gfs2/acl.c14
-rw-r--r--fs/gfs2/aops.c11
-rw-r--r--fs/gfs2/bmap.c2
-rw-r--r--fs/gfs2/export.c4
-rw-r--r--fs/gfs2/file.c37
-rw-r--r--fs/gfs2/glock.c60
-rw-r--r--fs/gfs2/glops.c1
-rw-r--r--fs/gfs2/incore.h30
-rw-r--r--fs/gfs2/inode.c28
-rw-r--r--fs/gfs2/lock_dlm.c2
-rw-r--r--fs/gfs2/ops_fstype.c8
-rw-r--r--fs/gfs2/quota.c43
-rw-r--r--fs/gfs2/rgrp.c1221
-rw-r--r--fs/gfs2/rgrp.h28
-rw-r--r--fs/gfs2/super.c11
-rw-r--r--fs/gfs2/trace_gfs2.h20
-rw-r--r--fs/gfs2/trans.h7
-rw-r--r--fs/gfs2/xattr.c96
-rw-r--r--fs/hfs/hfs_fs.h4
-rw-r--r--fs/hfs/inode.c6
-rw-r--r--fs/hfs/super.c22
-rw-r--r--fs/hfsplus/catalog.c4
-rw-r--r--fs/hfsplus/hfsplus_fs.h4
-rw-r--r--fs/hfsplus/inode.c8
-rw-r--r--fs/hfsplus/options.c15
-rw-r--r--fs/hfsplus/super.c6
-rw-r--r--fs/hostfs/hostfs.h2
-rw-r--r--fs/hostfs/hostfs_kern.c20
-rw-r--r--fs/hostfs/hostfs_user.c1
-rw-r--r--fs/hpfs/anode.c6
-rw-r--r--fs/hpfs/dnode.c28
-rw-r--r--fs/hpfs/hpfs_fn.h4
-rw-r--r--fs/hpfs/inode.c19
-rw-r--r--fs/hpfs/namei.c8
-rw-r--r--fs/hpfs/super.c26
-rw-r--r--fs/hppfs/hppfs.c4
-rw-r--r--fs/hugetlbfs/inode.c32
-rw-r--r--fs/inode.c2
-rw-r--r--fs/internal.h4
-rw-r--r--fs/ioctl.c25
-rw-r--r--fs/isofs/export.c2
-rw-r--r--fs/isofs/inode.c22
-rw-r--r--fs/isofs/isofs.h4
-rw-r--r--fs/isofs/rock.c4
-rw-r--r--fs/jbd/commit.c45
-rw-r--r--fs/jbd/transaction.c64
-rw-r--r--fs/jbd2/commit.c40
-rw-r--r--fs/jbd2/journal.c5
-rw-r--r--fs/jbd2/recovery.c7
-rw-r--r--fs/jbd2/transaction.c65
-rw-r--r--fs/jffs2/acl.c30
-rw-r--r--fs/jffs2/file.c8
-rw-r--r--fs/jffs2/fs.c24
-rw-r--r--fs/jffs2/os-linux.h4
-rw-r--r--fs/jffs2/readinode.c13
-rw-r--r--fs/jffs2/super.c10
-rw-r--r--fs/jffs2/wbuf.c8
-rw-r--r--fs/jfs/Makefile2
-rw-r--r--fs/jfs/acl.c4
-rw-r--r--fs/jfs/file.c4
-rw-r--r--fs/jfs/ioctl.c43
-rw-r--r--fs/jfs/jfs_discard.c117
-rw-r--r--fs/jfs/jfs_discard.h26
-rw-r--r--fs/jfs/jfs_dmap.c126
-rw-r--r--fs/jfs/jfs_dmap.h2
-rw-r--r--fs/jfs/jfs_filsys.h3
-rw-r--r--fs/jfs/jfs_imap.c22
-rw-r--r--fs/jfs/jfs_incore.h9
-rw-r--r--fs/jfs/jfs_txnmgr.c9
-rw-r--r--fs/jfs/super.c99
-rw-r--r--fs/jfs/xattr.c4
-rw-r--r--fs/libfs.c2
-rw-r--r--fs/lockd/mon.c86
-rw-r--r--fs/lockd/netns.h4
-rw-r--r--fs/lockd/svc.c18
-rw-r--r--fs/lockd/svclock.c3
-rw-r--r--fs/locks.c26
-rw-r--r--fs/logfs/inode.c9
-rw-r--r--fs/logfs/readwrite.c8
-rw-r--r--fs/minix/inode.c21
-rw-r--r--fs/namei.c263
-rw-r--r--fs/namespace.c26
-rw-r--r--fs/ncpfs/inode.c11
-rw-r--r--fs/nfs/Kconfig4
-rw-r--r--fs/nfs/blocklayout/blocklayout.c306
-rw-r--r--fs/nfs/blocklayout/blocklayout.h2
-rw-r--r--fs/nfs/blocklayout/blocklayoutdev.c25
-rw-r--r--fs/nfs/blocklayout/extents.c3
-rw-r--r--fs/nfs/callback.c337
-rw-r--r--fs/nfs/callback.h3
-rw-r--r--fs/nfs/callback_proc.c31
-rw-r--r--fs/nfs/client.c23
-rw-r--r--fs/nfs/dir.c16
-rw-r--r--fs/nfs/direct.c32
-rw-r--r--fs/nfs/file.c38
-rw-r--r--fs/nfs/getroot.c2
-rw-r--r--fs/nfs/idmap.c114
-rw-r--r--fs/nfs/inode.c17
-rw-r--r--fs/nfs/internal.h15
-rw-r--r--fs/nfs/netns.h4
-rw-r--r--fs/nfs/nfs3acl.c4
-rw-r--r--fs/nfs/nfs3proc.c2
-rw-r--r--fs/nfs/nfs4_fs.h19
-rw-r--r--fs/nfs/nfs4client.c256
-rw-r--r--fs/nfs/nfs4file.c25
-rw-r--r--fs/nfs/nfs4filelayout.c41
-rw-r--r--fs/nfs/nfs4filelayout.h16
-rw-r--r--fs/nfs/nfs4filelayoutdev.c17
-rw-r--r--fs/nfs/nfs4namespace.c16
-rw-r--r--fs/nfs/nfs4proc.c397
-rw-r--r--fs/nfs/nfs4renewd.c3
-rw-r--r--fs/nfs/nfs4state.c228
-rw-r--r--fs/nfs/nfs4sysctl.c1
-rw-r--r--fs/nfs/nfs4xdr.c48
-rw-r--r--fs/nfs/objlayout/objio_osd.c9
-rw-r--r--fs/nfs/pagelist.c12
-rw-r--r--fs/nfs/pnfs.c417
-rw-r--r--fs/nfs/pnfs.h57
-rw-r--r--fs/nfs/pnfs_dev.c27
-rw-r--r--fs/nfs/super.c35
-rw-r--r--fs/nfs/write.c11
-rw-r--r--fs/nfsd/nfs2acl.c3
-rw-r--r--fs/nfsd/nfs3proc.c2
-rw-r--r--fs/nfsd/nfs4callback.c1
-rw-r--r--fs/nfsd/nfs4idmap.c4
-rw-r--r--fs/nfsd/nfs4proc.c6
-rw-r--r--fs/nfsd/nfs4state.c354
-rw-r--r--fs/nfsd/nfs4xdr.c2
-rw-r--r--fs/nfsd/nfsctl.c84
-rw-r--r--fs/nfsd/nfsd.h4
-rw-r--r--fs/nfsd/nfssvc.c26
-rw-r--r--fs/nfsd/state.h8
-rw-r--r--fs/nfsd/vfs.c10
-rw-r--r--fs/nilfs2/file.c3
-rw-r--r--fs/nilfs2/inode.c8
-rw-r--r--fs/nilfs2/super.c6
-rw-r--r--fs/notify/fanotify/fanotify_user.c87
-rw-r--r--fs/notify/inotify/inotify_user.c28
-rw-r--r--fs/ntfs/inode.c7
-rw-r--r--fs/ntfs/super.c45
-rw-r--r--fs/ntfs/volume.h5
-rw-r--r--fs/ocfs2/acl.c4
-rw-r--r--fs/ocfs2/cluster/heartbeat.c38
-rw-r--r--fs/ocfs2/cluster/quorum.c2
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c5
-rw-r--r--fs/ocfs2/file.c6
-rw-r--r--fs/ocfs2/mmap.c2
-rw-r--r--fs/ocfs2/quota_global.c43
-rw-r--r--fs/ocfs2/quota_local.c15
-rw-r--r--fs/ocfs2/super.c5
-rw-r--r--fs/omfs/file.c5
-rw-r--r--fs/omfs/inode.c8
-rw-r--r--fs/omfs/omfs.h4
-rw-r--r--fs/open.c159
-rw-r--r--fs/openpromfs/inode.c5
-rw-r--r--fs/pipe.c31
-rw-r--r--fs/posix_acl.c30
-rw-r--r--fs/proc/Makefile2
-rw-r--r--fs/proc/base.c566
-rw-r--r--fs/proc/fd.c367
-rw-r--r--fs/proc/fd.h14
-rw-r--r--fs/proc/generic.c15
-rw-r--r--fs/proc/inode.c1
-rw-r--r--fs/proc/internal.h48
-rw-r--r--fs/proc/page.c8
-rw-r--r--fs/proc/proc_sysctl.c13
-rw-r--r--fs/proc/root.c2
-rw-r--r--fs/proc/task_mmu.c9
-rw-r--r--fs/pstore/Kconfig1
-rw-r--r--fs/pstore/ftrace.c96
-rw-r--r--fs/pstore/internal.h6
-rw-r--r--fs/pstore/platform.c9
-rw-r--r--fs/pstore/ram.c28
-rw-r--r--fs/qnx4/inode.c9
-rw-r--r--fs/qnx6/inode.c9
-rw-r--r--fs/quota/Makefile2
-rw-r--r--fs/quota/dquot.c116
-rw-r--r--fs/quota/kqid.c132
-rw-r--r--fs/quota/netlink.c10
-rw-r--r--fs/quota/quota.c32
-rw-r--r--fs/quota/quota_tree.c22
-rw-r--r--fs/quota/quota_v1.c12
-rw-r--r--fs/quota/quota_v2.c26
-rw-r--r--fs/read_write.c180
-rw-r--r--fs/read_write.h2
-rw-r--r--fs/readdir.c36
-rw-r--r--fs/reiserfs/inode.c32
-rw-r--r--fs/reiserfs/super.c5
-rw-r--r--fs/reiserfs/xattr.c2
-rw-r--r--fs/reiserfs/xattr_acl.c24
-rw-r--r--fs/romfs/super.c5
-rw-r--r--fs/select.c31
-rw-r--r--fs/seq_file.c4
-rw-r--r--fs/signalfd.c13
-rw-r--r--fs/splice.c69
-rw-r--r--fs/squashfs/inode.c8
-rw-r--r--fs/squashfs/super.c5
-rw-r--r--fs/stat.c14
-rw-r--r--fs/statfs.c9
-rw-r--r--fs/super.c31
-rw-r--r--fs/sync.c33
-rw-r--r--fs/sysfs/symlink.c2
-rw-r--r--fs/sysv/balloc.c18
-rw-r--r--fs/sysv/ialloc.c14
-rw-r--r--fs/sysv/inode.c17
-rw-r--r--fs/sysv/super.c1
-rw-r--r--fs/sysv/sysv.h1
-rw-r--r--fs/timerfd.c45
-rw-r--r--fs/ubifs/budget.c9
-rw-r--r--fs/ubifs/commit.c8
-rw-r--r--fs/ubifs/compress.c7
-rw-r--r--fs/ubifs/debug.c633
-rw-r--r--fs/ubifs/debug.h15
-rw-r--r--fs/ubifs/dir.c4
-rw-r--r--fs/ubifs/file.c5
-rw-r--r--fs/ubifs/gc.c6
-rw-r--r--fs/ubifs/journal.c4
-rw-r--r--fs/ubifs/log.c14
-rw-r--r--fs/ubifs/lprops.c66
-rw-r--r--fs/ubifs/lpt.c5
-rw-r--r--fs/ubifs/lpt_commit.c58
-rw-r--r--fs/ubifs/orphan.c7
-rw-r--r--fs/ubifs/recovery.c11
-rw-r--r--fs/ubifs/replay.c16
-rw-r--r--fs/ubifs/sb.c23
-rw-r--r--fs/ubifs/scan.c15
-rw-r--r--fs/ubifs/super.c129
-rw-r--r--fs/ubifs/tnc_misc.c4
-rw-r--r--fs/ubifs/ubifs.h17
-rw-r--r--fs/udf/file.c44
-rw-r--r--fs/udf/inode.c71
-rw-r--r--fs/udf/super.c25
-rw-r--r--fs/udf/udf_sb.h4
-rw-r--r--fs/ufs/balloc.c30
-rw-r--r--fs/ufs/ialloc.c16
-rw-r--r--fs/ufs/inode.c16
-rw-r--r--fs/ufs/super.c26
-rw-r--r--fs/ufs/ufs.h1
-rw-r--r--fs/utimes.c11
-rw-r--r--fs/xattr.c249
-rw-r--r--fs/xattr_acl.c96
-rw-r--r--fs/xfs/xfs_acl.c4
-rw-r--r--fs/xfs/xfs_buf.c5
-rw-r--r--fs/xfs/xfs_buf.h41
-rw-r--r--fs/xfs/xfs_dfrag.c34
-rw-r--r--fs/xfs/xfs_export.c3
-rw-r--r--fs/xfs/xfs_file.c381
-rw-r--r--fs/xfs/xfs_ialloc.c2
-rw-r--r--fs/xfs/xfs_ioctl.c10
-rw-r--r--fs/xfs/xfs_mount.c43
-rw-r--r--fs/xfs/xfs_mount.h5
-rw-r--r--fs/xfs/xfs_quotaops.c12
-rw-r--r--fs/xfs/xfs_super.c103
-rw-r--r--fs/xfs/xfs_super.h2
-rw-r--r--fs/xfs/xfs_sync.c2
-rw-r--r--fs/xfs/xfs_trace.h1
-rw-r--r--fs/xfs/xfs_trans_dquot.c8
458 files changed, 19754 insertions, 10376 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 9a1d42630751..15b679166201 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -37,7 +37,7 @@ static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
37 return ERR_PTR(-ENOMEM); 37 return ERR_PTR(-ENOMEM);
38 size = v9fs_fid_xattr_get(fid, name, value, size); 38 size = v9fs_fid_xattr_get(fid, name, value, size);
39 if (size > 0) { 39 if (size > 0) {
40 acl = posix_acl_from_xattr(value, size); 40 acl = posix_acl_from_xattr(&init_user_ns, value, size);
41 if (IS_ERR(acl)) 41 if (IS_ERR(acl))
42 goto err_out; 42 goto err_out;
43 } 43 }
@@ -131,7 +131,7 @@ static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
131 buffer = kmalloc(size, GFP_KERNEL); 131 buffer = kmalloc(size, GFP_KERNEL);
132 if (!buffer) 132 if (!buffer)
133 return -ENOMEM; 133 return -ENOMEM;
134 retval = posix_acl_to_xattr(acl, buffer, size); 134 retval = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
135 if (retval < 0) 135 if (retval < 0)
136 goto err_free_out; 136 goto err_free_out;
137 switch (type) { 137 switch (type) {
@@ -251,7 +251,7 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
251 return PTR_ERR(acl); 251 return PTR_ERR(acl);
252 if (acl == NULL) 252 if (acl == NULL)
253 return -ENODATA; 253 return -ENODATA;
254 error = posix_acl_to_xattr(acl, buffer, size); 254 error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
255 posix_acl_release(acl); 255 posix_acl_release(acl);
256 256
257 return error; 257 return error;
@@ -304,7 +304,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
304 return -EPERM; 304 return -EPERM;
305 if (value) { 305 if (value) {
306 /* update the cached acl value */ 306 /* update the cached acl value */
307 acl = posix_acl_from_xattr(value, size); 307 acl = posix_acl_from_xattr(&init_user_ns, value, size);
308 if (IS_ERR(acl)) 308 if (IS_ERR(acl))
309 return PTR_ERR(acl); 309 return PTR_ERR(acl);
310 else if (acl) { 310 else if (acl) {
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index b85efa773949..d934f04e7736 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -184,10 +184,20 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
184 v9ses->afid = option; 184 v9ses->afid = option;
185 break; 185 break;
186 case Opt_uname: 186 case Opt_uname:
187 match_strlcpy(v9ses->uname, &args[0], PATH_MAX); 187 kfree(v9ses->uname);
188 v9ses->uname = match_strdup(&args[0]);
189 if (!v9ses->uname) {
190 ret = -ENOMEM;
191 goto free_and_return;
192 }
188 break; 193 break;
189 case Opt_remotename: 194 case Opt_remotename:
190 match_strlcpy(v9ses->aname, &args[0], PATH_MAX); 195 kfree(v9ses->aname);
196 v9ses->aname = match_strdup(&args[0]);
197 if (!v9ses->aname) {
198 ret = -ENOMEM;
199 goto free_and_return;
200 }
191 break; 201 break;
192 case Opt_nodevmap: 202 case Opt_nodevmap:
193 v9ses->nodev = 1; 203 v9ses->nodev = 1;
@@ -287,21 +297,21 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
287 struct p9_fid *fid; 297 struct p9_fid *fid;
288 int rc; 298 int rc;
289 299
290 v9ses->uname = __getname(); 300 v9ses->uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL);
291 if (!v9ses->uname) 301 if (!v9ses->uname)
292 return ERR_PTR(-ENOMEM); 302 return ERR_PTR(-ENOMEM);
293 303
294 v9ses->aname = __getname(); 304 v9ses->aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL);
295 if (!v9ses->aname) { 305 if (!v9ses->aname) {
296 __putname(v9ses->uname); 306 kfree(v9ses->uname);
297 return ERR_PTR(-ENOMEM); 307 return ERR_PTR(-ENOMEM);
298 } 308 }
299 init_rwsem(&v9ses->rename_sem); 309 init_rwsem(&v9ses->rename_sem);
300 310
301 rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY); 311 rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
302 if (rc) { 312 if (rc) {
303 __putname(v9ses->aname); 313 kfree(v9ses->aname);
304 __putname(v9ses->uname); 314 kfree(v9ses->uname);
305 return ERR_PTR(rc); 315 return ERR_PTR(rc);
306 } 316 }
307 317
@@ -309,8 +319,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
309 list_add(&v9ses->slist, &v9fs_sessionlist); 319 list_add(&v9ses->slist, &v9fs_sessionlist);
310 spin_unlock(&v9fs_sessionlist_lock); 320 spin_unlock(&v9fs_sessionlist_lock);
311 321
312 strcpy(v9ses->uname, V9FS_DEFUSER);
313 strcpy(v9ses->aname, V9FS_DEFANAME);
314 v9ses->uid = ~0; 322 v9ses->uid = ~0;
315 v9ses->dfltuid = V9FS_DEFUID; 323 v9ses->dfltuid = V9FS_DEFUID;
316 v9ses->dfltgid = V9FS_DEFGID; 324 v9ses->dfltgid = V9FS_DEFGID;
@@ -412,8 +420,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
412 kfree(v9ses->cachetag); 420 kfree(v9ses->cachetag);
413 } 421 }
414#endif 422#endif
415 __putname(v9ses->uname); 423 kfree(v9ses->uname);
416 __putname(v9ses->aname); 424 kfree(v9ses->aname);
417 425
418 bdi_destroy(&v9ses->bdi); 426 bdi_destroy(&v9ses->bdi);
419 427
@@ -560,6 +568,11 @@ static int v9fs_init_inode_cache(void)
560 */ 568 */
561static void v9fs_destroy_inode_cache(void) 569static void v9fs_destroy_inode_cache(void)
562{ 570{
571 /*
572 * Make sure all delayed rcu free inodes are flushed before we
573 * destroy cache.
574 */
575 rcu_barrier();
563 kmem_cache_destroy(v9fs_inode_cache); 576 kmem_cache_destroy(v9fs_inode_cache);
564} 577}
565 578
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index dd6f7ee1e312..c2483e97beee 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -738,6 +738,7 @@ v9fs_cached_file_write(struct file *filp, const char __user * data,
738static const struct vm_operations_struct v9fs_file_vm_ops = { 738static const struct vm_operations_struct v9fs_file_vm_ops = {
739 .fault = filemap_fault, 739 .fault = filemap_fault,
740 .page_mkwrite = v9fs_vm_page_mkwrite, 740 .page_mkwrite = v9fs_vm_page_mkwrite,
741 .remap_pages = generic_file_remap_pages,
741}; 742};
742 743
743 744
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index cbf9dbb1b2a2..890bed538f9b 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1276,12 +1276,12 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
1276 } 1276 }
1277 1277
1278 /* copy extension buffer into buffer */ 1278 /* copy extension buffer into buffer */
1279 strncpy(buffer, st->extension, buflen); 1279 retval = min(strlen(st->extension)+1, (size_t)buflen);
1280 memcpy(buffer, st->extension, retval);
1280 1281
1281 p9_debug(P9_DEBUG_VFS, "%s -> %s (%s)\n", 1282 p9_debug(P9_DEBUG_VFS, "%s -> %s (%.*s)\n",
1282 dentry->d_name.name, st->extension, buffer); 1283 dentry->d_name.name, st->extension, buflen, buffer);
1283 1284
1284 retval = strnlen(buffer, buflen);
1285done: 1285done:
1286 p9stat_free(st); 1286 p9stat_free(st);
1287 kfree(st); 1287 kfree(st);
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 022574202749..0efd1524b977 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -164,3 +164,11 @@ config BINFMT_MISC
164 You may say M here for module support and later load the module when 164 You may say M here for module support and later load the module when
165 you have use for it; the module is called binfmt_misc. If you 165 you have use for it; the module is called binfmt_misc. If you
166 don't know what to answer at this point, say Y. 166 don't know what to answer at this point, say Y.
167
168config COREDUMP
169 bool "Enable core dump support" if EXPERT
170 default y
171 help
172 This option enables support for performing core dumps. You almost
173 certainly want to say Y here. Not necessary on systems that never
174 need debugging or only ever run flawless code.
diff --git a/fs/Makefile b/fs/Makefile
index 2fb977934673..1d7af79288a0 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_FS_MBCACHE) += mbcache.o
48obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o 48obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o
49obj-$(CONFIG_NFS_COMMON) += nfs_common/ 49obj-$(CONFIG_NFS_COMMON) += nfs_common/
50obj-$(CONFIG_GENERIC_ACL) += generic_acl.o 50obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
51obj-$(CONFIG_COREDUMP) += coredump.o
51 52
52obj-$(CONFIG_FHANDLE) += fhandle.o 53obj-$(CONFIG_FHANDLE) += fhandle.o
53 54
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 718ac1f440c6..585adafb0cc2 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -46,8 +46,8 @@ struct adfs_sb_info {
46 struct adfs_discmap *s_map; /* bh list containing map */ 46 struct adfs_discmap *s_map; /* bh list containing map */
47 struct adfs_dir_ops *s_dir; /* directory operations */ 47 struct adfs_dir_ops *s_dir; /* directory operations */
48 48
49 uid_t s_uid; /* owner uid */ 49 kuid_t s_uid; /* owner uid */
50 gid_t s_gid; /* owner gid */ 50 kgid_t s_gid; /* owner gid */
51 umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ 51 umode_t s_owner_mask; /* ADFS owner perm -> unix perm */
52 umode_t s_other_mask; /* ADFS other perm -> unix perm */ 52 umode_t s_other_mask; /* ADFS other perm -> unix perm */
53 int s_ftsuffix; /* ,xyz hex filetype suffix option */ 53 int s_ftsuffix; /* ,xyz hex filetype suffix option */
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 1dab6a174d6a..e9bad5093a3f 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -304,8 +304,8 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
304 * we can't change the UID or GID of any file - 304 * we can't change the UID or GID of any file -
305 * we have a global UID/GID in the superblock 305 * we have a global UID/GID in the superblock
306 */ 306 */
307 if ((ia_valid & ATTR_UID && attr->ia_uid != ADFS_SB(sb)->s_uid) || 307 if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, ADFS_SB(sb)->s_uid)) ||
308 (ia_valid & ATTR_GID && attr->ia_gid != ADFS_SB(sb)->s_gid)) 308 (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, ADFS_SB(sb)->s_gid)))
309 error = -EPERM; 309 error = -EPERM;
310 310
311 if (error) 311 if (error)
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index bdaec92353c2..d57122935793 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -15,6 +15,7 @@
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/statfs.h> 17#include <linux/statfs.h>
18#include <linux/user_namespace.h>
18#include "adfs.h" 19#include "adfs.h"
19#include "dir_f.h" 20#include "dir_f.h"
20#include "dir_fplus.h" 21#include "dir_fplus.h"
@@ -130,10 +131,10 @@ static int adfs_show_options(struct seq_file *seq, struct dentry *root)
130{ 131{
131 struct adfs_sb_info *asb = ADFS_SB(root->d_sb); 132 struct adfs_sb_info *asb = ADFS_SB(root->d_sb);
132 133
133 if (asb->s_uid != 0) 134 if (!uid_eq(asb->s_uid, GLOBAL_ROOT_UID))
134 seq_printf(seq, ",uid=%u", asb->s_uid); 135 seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, asb->s_uid));
135 if (asb->s_gid != 0) 136 if (!gid_eq(asb->s_gid, GLOBAL_ROOT_GID))
136 seq_printf(seq, ",gid=%u", asb->s_gid); 137 seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, asb->s_gid));
137 if (asb->s_owner_mask != ADFS_DEFAULT_OWNER_MASK) 138 if (asb->s_owner_mask != ADFS_DEFAULT_OWNER_MASK)
138 seq_printf(seq, ",ownmask=%o", asb->s_owner_mask); 139 seq_printf(seq, ",ownmask=%o", asb->s_owner_mask);
139 if (asb->s_other_mask != ADFS_DEFAULT_OTHER_MASK) 140 if (asb->s_other_mask != ADFS_DEFAULT_OTHER_MASK)
@@ -175,12 +176,16 @@ static int parse_options(struct super_block *sb, char *options)
175 case Opt_uid: 176 case Opt_uid:
176 if (match_int(args, &option)) 177 if (match_int(args, &option))
177 return -EINVAL; 178 return -EINVAL;
178 asb->s_uid = option; 179 asb->s_uid = make_kuid(current_user_ns(), option);
180 if (!uid_valid(asb->s_uid))
181 return -EINVAL;
179 break; 182 break;
180 case Opt_gid: 183 case Opt_gid:
181 if (match_int(args, &option)) 184 if (match_int(args, &option))
182 return -EINVAL; 185 return -EINVAL;
183 asb->s_gid = option; 186 asb->s_gid = make_kgid(current_user_ns(), option);
187 if (!gid_valid(asb->s_gid))
188 return -EINVAL;
184 break; 189 break;
185 case Opt_ownmask: 190 case Opt_ownmask:
186 if (match_octal(args, &option)) 191 if (match_octal(args, &option))
@@ -275,6 +280,11 @@ static int init_inodecache(void)
275 280
276static void destroy_inodecache(void) 281static void destroy_inodecache(void)
277{ 282{
283 /*
284 * Make sure all delayed rcu free inodes are flushed before we
285 * destroy cache.
286 */
287 rcu_barrier();
278 kmem_cache_destroy(adfs_inode_cachep); 288 kmem_cache_destroy(adfs_inode_cachep);
279} 289}
280 290
@@ -369,8 +379,8 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
369 sb->s_fs_info = asb; 379 sb->s_fs_info = asb;
370 380
371 /* set default options */ 381 /* set default options */
372 asb->s_uid = 0; 382 asb->s_uid = GLOBAL_ROOT_UID;
373 asb->s_gid = 0; 383 asb->s_gid = GLOBAL_ROOT_GID;
374 asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK; 384 asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK;
375 asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK; 385 asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK;
376 asb->s_ftsuffix = 0; 386 asb->s_ftsuffix = 0;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 6e216419f340..3952121f2f28 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -88,8 +88,8 @@ struct affs_sb_info {
88 u32 s_root_block; /* FFS root block number. */ 88 u32 s_root_block; /* FFS root block number. */
89 int s_hashsize; /* Size of hash table. */ 89 int s_hashsize; /* Size of hash table. */
90 unsigned long s_flags; /* See below. */ 90 unsigned long s_flags; /* See below. */
91 uid_t s_uid; /* uid to override */ 91 kuid_t s_uid; /* uid to override */
92 gid_t s_gid; /* gid to override */ 92 kgid_t s_gid; /* gid to override */
93 umode_t s_mode; /* mode to override */ 93 umode_t s_mode; /* mode to override */
94 struct buffer_head *s_root_bh; /* Cached root block. */ 94 struct buffer_head *s_root_bh; /* Cached root block. */
95 struct mutex s_bmlock; /* Protects bitmap access. */ 95 struct mutex s_bmlock; /* Protects bitmap access. */
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 8bc4a59f4e7e..15c484268229 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -80,17 +80,17 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
80 if (id == 0 || sbi->s_flags & SF_SETUID) 80 if (id == 0 || sbi->s_flags & SF_SETUID)
81 inode->i_uid = sbi->s_uid; 81 inode->i_uid = sbi->s_uid;
82 else if (id == 0xFFFF && sbi->s_flags & SF_MUFS) 82 else if (id == 0xFFFF && sbi->s_flags & SF_MUFS)
83 inode->i_uid = 0; 83 i_uid_write(inode, 0);
84 else 84 else
85 inode->i_uid = id; 85 i_uid_write(inode, id);
86 86
87 id = be16_to_cpu(tail->gid); 87 id = be16_to_cpu(tail->gid);
88 if (id == 0 || sbi->s_flags & SF_SETGID) 88 if (id == 0 || sbi->s_flags & SF_SETGID)
89 inode->i_gid = sbi->s_gid; 89 inode->i_gid = sbi->s_gid;
90 else if (id == 0xFFFF && sbi->s_flags & SF_MUFS) 90 else if (id == 0xFFFF && sbi->s_flags & SF_MUFS)
91 inode->i_gid = 0; 91 i_gid_write(inode, 0);
92 else 92 else
93 inode->i_gid = id; 93 i_gid_write(inode, id);
94 94
95 switch (be32_to_cpu(tail->stype)) { 95 switch (be32_to_cpu(tail->stype)) {
96 case ST_ROOT: 96 case ST_ROOT:
@@ -193,13 +193,13 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc)
193 tail->size = cpu_to_be32(inode->i_size); 193 tail->size = cpu_to_be32(inode->i_size);
194 secs_to_datestamp(inode->i_mtime.tv_sec,&tail->change); 194 secs_to_datestamp(inode->i_mtime.tv_sec,&tail->change);
195 if (!(inode->i_ino == AFFS_SB(sb)->s_root_block)) { 195 if (!(inode->i_ino == AFFS_SB(sb)->s_root_block)) {
196 uid = inode->i_uid; 196 uid = i_uid_read(inode);
197 gid = inode->i_gid; 197 gid = i_gid_read(inode);
198 if (AFFS_SB(sb)->s_flags & SF_MUFS) { 198 if (AFFS_SB(sb)->s_flags & SF_MUFS) {
199 if (inode->i_uid == 0 || inode->i_uid == 0xFFFF) 199 if (uid == 0 || uid == 0xFFFF)
200 uid = inode->i_uid ^ ~0; 200 uid = uid ^ ~0;
201 if (inode->i_gid == 0 || inode->i_gid == 0xFFFF) 201 if (gid == 0 || gid == 0xFFFF)
202 gid = inode->i_gid ^ ~0; 202 gid = gid ^ ~0;
203 } 203 }
204 if (!(AFFS_SB(sb)->s_flags & SF_SETUID)) 204 if (!(AFFS_SB(sb)->s_flags & SF_SETUID))
205 tail->uid = cpu_to_be16(uid); 205 tail->uid = cpu_to_be16(uid);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index c70f1e5fc024..b84dc7352502 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -147,6 +147,11 @@ static int init_inodecache(void)
147 147
148static void destroy_inodecache(void) 148static void destroy_inodecache(void)
149{ 149{
150 /*
151 * Make sure all delayed rcu free inodes are flushed before we
152 * destroy cache.
153 */
154 rcu_barrier();
150 kmem_cache_destroy(affs_inode_cachep); 155 kmem_cache_destroy(affs_inode_cachep);
151} 156}
152 157
@@ -188,7 +193,7 @@ static const match_table_t tokens = {
188}; 193};
189 194
190static int 195static int
191parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s32 *root, 196parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, s32 *root,
192 int *blocksize, char **prefix, char *volume, unsigned long *mount_opts) 197 int *blocksize, char **prefix, char *volume, unsigned long *mount_opts)
193{ 198{
194 char *p; 199 char *p;
@@ -253,13 +258,17 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
253 case Opt_setgid: 258 case Opt_setgid:
254 if (match_int(&args[0], &option)) 259 if (match_int(&args[0], &option))
255 return 0; 260 return 0;
256 *gid = option; 261 *gid = make_kgid(current_user_ns(), option);
262 if (!gid_valid(*gid))
263 return 0;
257 *mount_opts |= SF_SETGID; 264 *mount_opts |= SF_SETGID;
258 break; 265 break;
259 case Opt_setuid: 266 case Opt_setuid:
260 if (match_int(&args[0], &option)) 267 if (match_int(&args[0], &option))
261 return 0; 268 return 0;
262 *uid = option; 269 *uid = make_kuid(current_user_ns(), option);
270 if (!uid_valid(*uid))
271 return 0;
263 *mount_opts |= SF_SETUID; 272 *mount_opts |= SF_SETUID;
264 break; 273 break;
265 case Opt_verbose: 274 case Opt_verbose:
@@ -301,8 +310,8 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
301 int num_bm; 310 int num_bm;
302 int i, j; 311 int i, j;
303 s32 key; 312 s32 key;
304 uid_t uid; 313 kuid_t uid;
305 gid_t gid; 314 kgid_t gid;
306 int reserved; 315 int reserved;
307 unsigned long mount_flags; 316 unsigned long mount_flags;
308 int tmp_flags; /* fix remount prototype... */ 317 int tmp_flags; /* fix remount prototype... */
@@ -527,8 +536,8 @@ affs_remount(struct super_block *sb, int *flags, char *data)
527{ 536{
528 struct affs_sb_info *sbi = AFFS_SB(sb); 537 struct affs_sb_info *sbi = AFFS_SB(sb);
529 int blocksize; 538 int blocksize;
530 uid_t uid; 539 kuid_t uid;
531 gid_t gid; 540 kgid_t gid;
532 int mode; 541 int mode;
533 int reserved; 542 int reserved;
534 int root_block; 543 int root_block;
@@ -551,7 +560,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
551 return -EINVAL; 560 return -EINVAL;
552 } 561 }
553 562
554 flush_delayed_work_sync(&sbi->sb_work); 563 flush_delayed_work(&sbi->sb_work);
555 replace_mount_options(sb, new_opts); 564 replace_mount_options(sb, new_opts);
556 565
557 sbi->s_flags = mount_flags; 566 sbi->s_flags = mount_flags;
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 587ef5123cd8..7ef637d7f3a5 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -351,9 +351,7 @@ void afs_dispatch_give_up_callbacks(struct work_struct *work)
351 */ 351 */
352void afs_flush_callback_breaks(struct afs_server *server) 352void afs_flush_callback_breaks(struct afs_server *server)
353{ 353{
354 cancel_delayed_work(&server->cb_break_work); 354 mod_delayed_work(afs_callback_update_worker, &server->cb_break_work, 0);
355 queue_delayed_work(afs_callback_update_worker,
356 &server->cb_break_work, 0);
357} 355}
358 356
359#if 0 357#if 0
diff --git a/fs/afs/server.c b/fs/afs/server.c
index d59b7516e943..f342acf3547d 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -285,12 +285,7 @@ static void afs_reap_server(struct work_struct *work)
285 expiry = server->time_of_death + afs_server_timeout; 285 expiry = server->time_of_death + afs_server_timeout;
286 if (expiry > now) { 286 if (expiry > now) {
287 delay = (expiry - now) * HZ; 287 delay = (expiry - now) * HZ;
288 if (!queue_delayed_work(afs_wq, &afs_server_reaper, 288 mod_delayed_work(afs_wq, &afs_server_reaper, delay);
289 delay)) {
290 cancel_delayed_work(&afs_server_reaper);
291 queue_delayed_work(afs_wq, &afs_server_reaper,
292 delay);
293 }
294 break; 289 break;
295 } 290 }
296 291
@@ -323,6 +318,5 @@ static void afs_reap_server(struct work_struct *work)
323void __exit afs_purge_servers(void) 318void __exit afs_purge_servers(void)
324{ 319{
325 afs_server_timeout = 0; 320 afs_server_timeout = 0;
326 cancel_delayed_work(&afs_server_reaper); 321 mod_delayed_work(afs_wq, &afs_server_reaper, 0);
327 queue_delayed_work(afs_wq, &afs_server_reaper, 0);
328} 322}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index df8c6047c2a1..43165009428d 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -123,6 +123,11 @@ void __exit afs_fs_exit(void)
123 BUG(); 123 BUG();
124 } 124 }
125 125
126 /*
127 * Make sure all delayed rcu free inodes are flushed before we
128 * destroy cache.
129 */
130 rcu_barrier();
126 kmem_cache_destroy(afs_inode_cachep); 131 kmem_cache_destroy(afs_inode_cachep);
127 _leave(""); 132 _leave("");
128} 133}
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 431984d2e372..57bcb1596530 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -561,12 +561,7 @@ static void afs_vlocation_reaper(struct work_struct *work)
561 if (expiry > now) { 561 if (expiry > now) {
562 delay = (expiry - now) * HZ; 562 delay = (expiry - now) * HZ;
563 _debug("delay %lu", delay); 563 _debug("delay %lu", delay);
564 if (!queue_delayed_work(afs_wq, &afs_vlocation_reap, 564 mod_delayed_work(afs_wq, &afs_vlocation_reap, delay);
565 delay)) {
566 cancel_delayed_work(&afs_vlocation_reap);
567 queue_delayed_work(afs_wq, &afs_vlocation_reap,
568 delay);
569 }
570 break; 565 break;
571 } 566 }
572 567
@@ -614,13 +609,10 @@ void afs_vlocation_purge(void)
614 spin_lock(&afs_vlocation_updates_lock); 609 spin_lock(&afs_vlocation_updates_lock);
615 list_del_init(&afs_vlocation_updates); 610 list_del_init(&afs_vlocation_updates);
616 spin_unlock(&afs_vlocation_updates_lock); 611 spin_unlock(&afs_vlocation_updates_lock);
617 cancel_delayed_work(&afs_vlocation_update); 612 mod_delayed_work(afs_vlocation_update_worker, &afs_vlocation_update, 0);
618 queue_delayed_work(afs_vlocation_update_worker,
619 &afs_vlocation_update, 0);
620 destroy_workqueue(afs_vlocation_update_worker); 613 destroy_workqueue(afs_vlocation_update_worker);
621 614
622 cancel_delayed_work(&afs_vlocation_reap); 615 mod_delayed_work(afs_wq, &afs_vlocation_reap, 0);
623 queue_delayed_work(afs_wq, &afs_vlocation_reap, 0);
624} 616}
625 617
626/* 618/*
diff --git a/fs/attr.c b/fs/attr.c
index 29e38a1f7f77..cce7df53b694 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -14,6 +14,7 @@
14#include <linux/fcntl.h> 14#include <linux/fcntl.h>
15#include <linux/security.h> 15#include <linux/security.h>
16#include <linux/evm.h> 16#include <linux/evm.h>
17#include <linux/ima.h>
17 18
18/** 19/**
19 * inode_change_ok - check if attribute changes to an inode are allowed 20 * inode_change_ok - check if attribute changes to an inode are allowed
@@ -247,6 +248,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
247 248
248 if (!error) { 249 if (!error) {
249 fsnotify_change(dentry, ia_valid); 250 fsnotify_change(dentry, ia_valid);
251 ima_inode_post_setattr(dentry);
250 evm_inode_post_setattr(dentry, ia_valid); 252 evm_inode_post_setattr(dentry, ia_valid);
251 } 253 }
252 254
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index abf645c1703b..a16214109d31 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -221,20 +221,6 @@ static int test_by_type(struct path *path, void *p)
221 return ino && ino->sbi->type & *(unsigned *)p; 221 return ino && ino->sbi->type & *(unsigned *)p;
222} 222}
223 223
224static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file)
225{
226 struct files_struct *files = current->files;
227 struct fdtable *fdt;
228
229 spin_lock(&files->file_lock);
230 fdt = files_fdtable(files);
231 BUG_ON(fdt->fd[fd] != NULL);
232 rcu_assign_pointer(fdt->fd[fd], file);
233 __set_close_on_exec(fd, fdt);
234 spin_unlock(&files->file_lock);
235}
236
237
238/* 224/*
239 * Open a file descriptor on the autofs mount point corresponding 225 * Open a file descriptor on the autofs mount point corresponding
240 * to the given path and device number (aka. new_encode_dev(sb->s_dev)). 226 * to the given path and device number (aka. new_encode_dev(sb->s_dev)).
@@ -243,7 +229,7 @@ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)
243{ 229{
244 int err, fd; 230 int err, fd;
245 231
246 fd = get_unused_fd(); 232 fd = get_unused_fd_flags(O_CLOEXEC);
247 if (likely(fd >= 0)) { 233 if (likely(fd >= 0)) {
248 struct file *filp; 234 struct file *filp;
249 struct path path; 235 struct path path;
@@ -264,7 +250,7 @@ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)
264 goto out; 250 goto out;
265 } 251 }
266 252
267 autofs_dev_ioctl_fd_install(fd, filp); 253 fd_install(fd, filp);
268 } 254 }
269 255
270 return fd; 256 return fd;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e7396cfdb109..91b11650722e 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -392,10 +392,12 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
392 ino->flags |= AUTOFS_INF_PENDING; 392 ino->flags |= AUTOFS_INF_PENDING;
393 spin_unlock(&sbi->fs_lock); 393 spin_unlock(&sbi->fs_lock);
394 status = autofs4_mount_wait(dentry); 394 status = autofs4_mount_wait(dentry);
395 if (status)
396 return ERR_PTR(status);
397 spin_lock(&sbi->fs_lock); 395 spin_lock(&sbi->fs_lock);
398 ino->flags &= ~AUTOFS_INF_PENDING; 396 ino->flags &= ~AUTOFS_INF_PENDING;
397 if (status) {
398 spin_unlock(&sbi->fs_lock);
399 return ERR_PTR(status);
400 }
399 } 401 }
400done: 402done:
401 if (!(ino->flags & AUTOFS_INF_EXPIRING)) { 403 if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index da8876d38a7b..dce436e595c1 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -175,8 +175,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
175 return; 175 return;
176 } 176 }
177 177
178 pipe = sbi->pipe; 178 pipe = get_file(sbi->pipe);
179 get_file(pipe);
180 179
181 mutex_unlock(&sbi->wq_mutex); 180 mutex_unlock(&sbi->wq_mutex);
182 181
diff --git a/fs/befs/befs.h b/fs/befs/befs.h
index d9a40abda6b7..b26642839156 100644
--- a/fs/befs/befs.h
+++ b/fs/befs/befs.h
@@ -20,8 +20,8 @@ typedef u64 befs_blocknr_t;
20 */ 20 */
21 21
22typedef struct befs_mount_options { 22typedef struct befs_mount_options {
23 gid_t gid; 23 kgid_t gid;
24 uid_t uid; 24 kuid_t uid;
25 int use_gid; 25 int use_gid;
26 int use_uid; 26 int use_uid;
27 int debug; 27 int debug;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index cf7f3c67c8b7..2b3bda8d5e68 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -15,6 +15,7 @@
15#include <linux/vfs.h> 15#include <linux/vfs.h>
16#include <linux/parser.h> 16#include <linux/parser.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/sched.h>
18 19
19#include "befs.h" 20#include "befs.h"
20#include "btree.h" 21#include "btree.h"
@@ -352,9 +353,11 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
352 */ 353 */
353 354
354 inode->i_uid = befs_sb->mount_opts.use_uid ? 355 inode->i_uid = befs_sb->mount_opts.use_uid ?
355 befs_sb->mount_opts.uid : (uid_t) fs32_to_cpu(sb, raw_inode->uid); 356 befs_sb->mount_opts.uid :
357 make_kuid(&init_user_ns, fs32_to_cpu(sb, raw_inode->uid));
356 inode->i_gid = befs_sb->mount_opts.use_gid ? 358 inode->i_gid = befs_sb->mount_opts.use_gid ?
357 befs_sb->mount_opts.gid : (gid_t) fs32_to_cpu(sb, raw_inode->gid); 359 befs_sb->mount_opts.gid :
360 make_kgid(&init_user_ns, fs32_to_cpu(sb, raw_inode->gid));
358 361
359 set_nlink(inode, 1); 362 set_nlink(inode, 1);
360 363
@@ -454,6 +457,11 @@ befs_init_inodecache(void)
454static void 457static void
455befs_destroy_inodecache(void) 458befs_destroy_inodecache(void)
456{ 459{
460 /*
461 * Make sure all delayed rcu free inodes are flushed before we
462 * destroy cache.
463 */
464 rcu_barrier();
457 kmem_cache_destroy(befs_inode_cachep); 465 kmem_cache_destroy(befs_inode_cachep);
458} 466}
459 467
@@ -674,10 +682,12 @@ parse_options(char *options, befs_mount_options * opts)
674 char *p; 682 char *p;
675 substring_t args[MAX_OPT_ARGS]; 683 substring_t args[MAX_OPT_ARGS];
676 int option; 684 int option;
685 kuid_t uid;
686 kgid_t gid;
677 687
678 /* Initialize options */ 688 /* Initialize options */
679 opts->uid = 0; 689 opts->uid = GLOBAL_ROOT_UID;
680 opts->gid = 0; 690 opts->gid = GLOBAL_ROOT_GID;
681 opts->use_uid = 0; 691 opts->use_uid = 0;
682 opts->use_gid = 0; 692 opts->use_gid = 0;
683 opts->iocharset = NULL; 693 opts->iocharset = NULL;
@@ -696,23 +706,29 @@ parse_options(char *options, befs_mount_options * opts)
696 case Opt_uid: 706 case Opt_uid:
697 if (match_int(&args[0], &option)) 707 if (match_int(&args[0], &option))
698 return 0; 708 return 0;
699 if (option < 0) { 709 uid = INVALID_UID;
710 if (option >= 0)
711 uid = make_kuid(current_user_ns(), option);
712 if (!uid_valid(uid)) {
700 printk(KERN_ERR "BeFS: Invalid uid %d, " 713 printk(KERN_ERR "BeFS: Invalid uid %d, "
701 "using default\n", option); 714 "using default\n", option);
702 break; 715 break;
703 } 716 }
704 opts->uid = option; 717 opts->uid = uid;
705 opts->use_uid = 1; 718 opts->use_uid = 1;
706 break; 719 break;
707 case Opt_gid: 720 case Opt_gid:
708 if (match_int(&args[0], &option)) 721 if (match_int(&args[0], &option))
709 return 0; 722 return 0;
710 if (option < 0) { 723 gid = INVALID_GID;
724 if (option >= 0)
725 gid = make_kgid(current_user_ns(), option);
726 if (!gid_valid(gid)) {
711 printk(KERN_ERR "BeFS: Invalid gid %d, " 727 printk(KERN_ERR "BeFS: Invalid gid %d, "
712 "using default\n", option); 728 "using default\n", option);
713 break; 729 break;
714 } 730 }
715 opts->gid = option; 731 opts->gid = gid;
716 opts->use_gid = 1; 732 opts->use_gid = 1;
717 break; 733 break;
718 case Opt_charset: 734 case Opt_charset:
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 9870417c26e7..737aaa3f7090 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -76,8 +76,8 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
76 BFS_I(inode)->i_sblock = le32_to_cpu(di->i_sblock); 76 BFS_I(inode)->i_sblock = le32_to_cpu(di->i_sblock);
77 BFS_I(inode)->i_eblock = le32_to_cpu(di->i_eblock); 77 BFS_I(inode)->i_eblock = le32_to_cpu(di->i_eblock);
78 BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino); 78 BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino);
79 inode->i_uid = le32_to_cpu(di->i_uid); 79 i_uid_write(inode, le32_to_cpu(di->i_uid));
80 inode->i_gid = le32_to_cpu(di->i_gid); 80 i_gid_write(inode, le32_to_cpu(di->i_gid));
81 set_nlink(inode, le32_to_cpu(di->i_nlink)); 81 set_nlink(inode, le32_to_cpu(di->i_nlink));
82 inode->i_size = BFS_FILESIZE(di); 82 inode->i_size = BFS_FILESIZE(di);
83 inode->i_blocks = BFS_FILEBLOCKS(di); 83 inode->i_blocks = BFS_FILEBLOCKS(di);
@@ -139,8 +139,8 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
139 139
140 di->i_ino = cpu_to_le16(ino); 140 di->i_ino = cpu_to_le16(ino);
141 di->i_mode = cpu_to_le32(inode->i_mode); 141 di->i_mode = cpu_to_le32(inode->i_mode);
142 di->i_uid = cpu_to_le32(inode->i_uid); 142 di->i_uid = cpu_to_le32(i_uid_read(inode));
143 di->i_gid = cpu_to_le32(inode->i_gid); 143 di->i_gid = cpu_to_le32(i_gid_read(inode));
144 di->i_nlink = cpu_to_le32(inode->i_nlink); 144 di->i_nlink = cpu_to_le32(inode->i_nlink);
145 di->i_atime = cpu_to_le32(inode->i_atime.tv_sec); 145 di->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
146 di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); 146 di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
@@ -280,6 +280,11 @@ static int init_inodecache(void)
280 280
281static void destroy_inodecache(void) 281static void destroy_inodecache(void)
282{ 282{
283 /*
284 * Make sure all delayed rcu free inodes are flushed before we
285 * destroy cache.
286 */
287 rcu_barrier();
283 kmem_cache_destroy(bfs_inode_cachep); 288 kmem_cache_destroy(bfs_inode_cachep);
284} 289}
285 290
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index d146e181d10d..0e7a6f81ae36 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -32,31 +32,8 @@
32 32
33static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); 33static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
34static int load_aout_library(struct file*); 34static int load_aout_library(struct file*);
35static int aout_core_dump(struct coredump_params *cprm);
36
37static struct linux_binfmt aout_format = {
38 .module = THIS_MODULE,
39 .load_binary = load_aout_binary,
40 .load_shlib = load_aout_library,
41 .core_dump = aout_core_dump,
42 .min_coredump = PAGE_SIZE
43};
44
45#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
46
47static int set_brk(unsigned long start, unsigned long end)
48{
49 start = PAGE_ALIGN(start);
50 end = PAGE_ALIGN(end);
51 if (end > start) {
52 unsigned long addr;
53 addr = vm_brk(start, end - start);
54 if (BAD_ADDR(addr))
55 return addr;
56 }
57 return 0;
58}
59 35
36#ifdef CONFIG_COREDUMP
60/* 37/*
61 * Routine writes a core dump image in the current directory. 38 * Routine writes a core dump image in the current directory.
62 * Currently only a stub-function. 39 * Currently only a stub-function.
@@ -66,7 +43,6 @@ static int set_brk(unsigned long start, unsigned long end)
66 * field, which also makes sure the core-dumps won't be recursive if the 43 * field, which also makes sure the core-dumps won't be recursive if the
67 * dumping of the process results in another error.. 44 * dumping of the process results in another error..
68 */ 45 */
69
70static int aout_core_dump(struct coredump_params *cprm) 46static int aout_core_dump(struct coredump_params *cprm)
71{ 47{
72 struct file *file = cprm->file; 48 struct file *file = cprm->file;
@@ -89,7 +65,7 @@ static int aout_core_dump(struct coredump_params *cprm)
89 current->flags |= PF_DUMPCORE; 65 current->flags |= PF_DUMPCORE;
90 strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); 66 strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
91 dump.u_ar0 = offsetof(struct user, regs); 67 dump.u_ar0 = offsetof(struct user, regs);
92 dump.signal = cprm->signr; 68 dump.signal = cprm->siginfo->si_signo;
93 aout_dump_thread(cprm->regs, &dump); 69 aout_dump_thread(cprm->regs, &dump);
94 70
95/* If the size of the dump file exceeds the rlimit, then see what would happen 71/* If the size of the dump file exceeds the rlimit, then see what would happen
@@ -135,6 +111,32 @@ end_coredump:
135 set_fs(fs); 111 set_fs(fs);
136 return has_dumped; 112 return has_dumped;
137} 113}
114#else
115#define aout_core_dump NULL
116#endif
117
118static struct linux_binfmt aout_format = {
119 .module = THIS_MODULE,
120 .load_binary = load_aout_binary,
121 .load_shlib = load_aout_library,
122 .core_dump = aout_core_dump,
123 .min_coredump = PAGE_SIZE
124};
125
126#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
127
128static int set_brk(unsigned long start, unsigned long end)
129{
130 start = PAGE_ALIGN(start);
131 end = PAGE_ALIGN(end);
132 if (end > start) {
133 unsigned long addr;
134 addr = vm_brk(start, end - start);
135 if (BAD_ADDR(addr))
136 return addr;
137 }
138 return 0;
139}
138 140
139/* 141/*
140 * create_aout_tables() parses the env- and arg-strings in new user 142 * create_aout_tables() parses the env- and arg-strings in new user
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 1b52956afe33..fbd9f60bd763 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -27,6 +27,7 @@
27#include <linux/compiler.h> 27#include <linux/compiler.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/pagemap.h> 29#include <linux/pagemap.h>
30#include <linux/vmalloc.h>
30#include <linux/security.h> 31#include <linux/security.h>
31#include <linux/random.h> 32#include <linux/random.h>
32#include <linux/elf.h> 33#include <linux/elf.h>
@@ -35,7 +36,13 @@
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36#include <asm/param.h> 37#include <asm/param.h>
37#include <asm/page.h> 38#include <asm/page.h>
38#include <asm/exec.h> 39
40#ifndef user_long_t
41#define user_long_t long
42#endif
43#ifndef user_siginfo_t
44#define user_siginfo_t siginfo_t
45#endif
39 46
40static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs); 47static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
41static int load_elf_library(struct file *); 48static int load_elf_library(struct file *);
@@ -881,7 +888,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
881 } 888 }
882 889
883 if (elf_interpreter) { 890 if (elf_interpreter) {
884 unsigned long uninitialized_var(interp_map_addr); 891 unsigned long interp_map_addr = 0;
885 892
886 elf_entry = load_elf_interp(&loc->interp_elf_ex, 893 elf_entry = load_elf_interp(&loc->interp_elf_ex,
887 interpreter, 894 interpreter,
@@ -1115,7 +1122,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
1115 if (always_dump_vma(vma)) 1122 if (always_dump_vma(vma))
1116 goto whole; 1123 goto whole;
1117 1124
1118 if (vma->vm_flags & VM_NODUMP) 1125 if (vma->vm_flags & VM_DONTDUMP)
1119 return 0; 1126 return 0;
1120 1127
1121 /* Hugetlb memory check */ 1128 /* Hugetlb memory check */
@@ -1127,7 +1134,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
1127 } 1134 }
1128 1135
1129 /* Do not dump I/O mapped devices or special mappings */ 1136 /* Do not dump I/O mapped devices or special mappings */
1130 if (vma->vm_flags & (VM_IO | VM_RESERVED)) 1137 if (vma->vm_flags & VM_IO)
1131 return 0; 1138 return 0;
1132 1139
1133 /* By default, dump shared memory if mapped from an anonymous file. */ 1140 /* By default, dump shared memory if mapped from an anonymous file. */
@@ -1372,6 +1379,103 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
1372 fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv); 1379 fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv);
1373} 1380}
1374 1381
1382static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
1383 siginfo_t *siginfo)
1384{
1385 mm_segment_t old_fs = get_fs();
1386 set_fs(KERNEL_DS);
1387 copy_siginfo_to_user((user_siginfo_t __user *) csigdata, siginfo);
1388 set_fs(old_fs);
1389 fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata);
1390}
1391
1392#define MAX_FILE_NOTE_SIZE (4*1024*1024)
1393/*
1394 * Format of NT_FILE note:
1395 *
1396 * long count -- how many files are mapped
1397 * long page_size -- units for file_ofs
1398 * array of [COUNT] elements of
1399 * long start
1400 * long end
1401 * long file_ofs
1402 * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL...
1403 */
1404static void fill_files_note(struct memelfnote *note)
1405{
1406 struct vm_area_struct *vma;
1407 unsigned count, size, names_ofs, remaining, n;
1408 user_long_t *data;
1409 user_long_t *start_end_ofs;
1410 char *name_base, *name_curpos;
1411
1412 /* *Estimated* file count and total data size needed */
1413 count = current->mm->map_count;
1414 size = count * 64;
1415
1416 names_ofs = (2 + 3 * count) * sizeof(data[0]);
1417 alloc:
1418 if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */
1419 goto err;
1420 size = round_up(size, PAGE_SIZE);
1421 data = vmalloc(size);
1422 if (!data)
1423 goto err;
1424
1425 start_end_ofs = data + 2;
1426 name_base = name_curpos = ((char *)data) + names_ofs;
1427 remaining = size - names_ofs;
1428 count = 0;
1429 for (vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) {
1430 struct file *file;
1431 const char *filename;
1432
1433 file = vma->vm_file;
1434 if (!file)
1435 continue;
1436 filename = d_path(&file->f_path, name_curpos, remaining);
1437 if (IS_ERR(filename)) {
1438 if (PTR_ERR(filename) == -ENAMETOOLONG) {
1439 vfree(data);
1440 size = size * 5 / 4;
1441 goto alloc;
1442 }
1443 continue;
1444 }
1445
1446 /* d_path() fills at the end, move name down */
1447 /* n = strlen(filename) + 1: */
1448 n = (name_curpos + remaining) - filename;
1449 remaining = filename - name_curpos;
1450 memmove(name_curpos, filename, n);
1451 name_curpos += n;
1452
1453 *start_end_ofs++ = vma->vm_start;
1454 *start_end_ofs++ = vma->vm_end;
1455 *start_end_ofs++ = vma->vm_pgoff;
1456 count++;
1457 }
1458
1459 /* Now we know exact count of files, can store it */
1460 data[0] = count;
1461 data[1] = PAGE_SIZE;
1462 /*
1463 * Count usually is less than current->mm->map_count,
1464 * we need to move filenames down.
1465 */
1466 n = current->mm->map_count - count;
1467 if (n != 0) {
1468 unsigned shift_bytes = n * 3 * sizeof(data[0]);
1469 memmove(name_base - shift_bytes, name_base,
1470 name_curpos - name_base);
1471 name_curpos -= shift_bytes;
1472 }
1473
1474 size = name_curpos - (char *)data;
1475 fill_note(note, "CORE", NT_FILE, size, data);
1476 err: ;
1477}
1478
1375#ifdef CORE_DUMP_USE_REGSET 1479#ifdef CORE_DUMP_USE_REGSET
1376#include <linux/regset.h> 1480#include <linux/regset.h>
1377 1481
@@ -1385,7 +1489,10 @@ struct elf_thread_core_info {
1385struct elf_note_info { 1489struct elf_note_info {
1386 struct elf_thread_core_info *thread; 1490 struct elf_thread_core_info *thread;
1387 struct memelfnote psinfo; 1491 struct memelfnote psinfo;
1492 struct memelfnote signote;
1388 struct memelfnote auxv; 1493 struct memelfnote auxv;
1494 struct memelfnote files;
1495 user_siginfo_t csigdata;
1389 size_t size; 1496 size_t size;
1390 int thread_notes; 1497 int thread_notes;
1391}; 1498};
@@ -1480,7 +1587,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
1480 1587
1481static int fill_note_info(struct elfhdr *elf, int phdrs, 1588static int fill_note_info(struct elfhdr *elf, int phdrs,
1482 struct elf_note_info *info, 1589 struct elf_note_info *info,
1483 long signr, struct pt_regs *regs) 1590 siginfo_t *siginfo, struct pt_regs *regs)
1484{ 1591{
1485 struct task_struct *dump_task = current; 1592 struct task_struct *dump_task = current;
1486 const struct user_regset_view *view = task_user_regset_view(dump_task); 1593 const struct user_regset_view *view = task_user_regset_view(dump_task);
@@ -1550,7 +1657,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
1550 * Now fill in each thread's information. 1657 * Now fill in each thread's information.
1551 */ 1658 */
1552 for (t = info->thread; t != NULL; t = t->next) 1659 for (t = info->thread; t != NULL; t = t->next)
1553 if (!fill_thread_core_info(t, view, signr, &info->size)) 1660 if (!fill_thread_core_info(t, view, siginfo->si_signo, &info->size))
1554 return 0; 1661 return 0;
1555 1662
1556 /* 1663 /*
@@ -1559,9 +1666,15 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
1559 fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm); 1666 fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm);
1560 info->size += notesize(&info->psinfo); 1667 info->size += notesize(&info->psinfo);
1561 1668
1669 fill_siginfo_note(&info->signote, &info->csigdata, siginfo);
1670 info->size += notesize(&info->signote);
1671
1562 fill_auxv_note(&info->auxv, current->mm); 1672 fill_auxv_note(&info->auxv, current->mm);
1563 info->size += notesize(&info->auxv); 1673 info->size += notesize(&info->auxv);
1564 1674
1675 fill_files_note(&info->files);
1676 info->size += notesize(&info->files);
1677
1565 return 1; 1678 return 1;
1566} 1679}
1567 1680
@@ -1588,8 +1701,12 @@ static int write_note_info(struct elf_note_info *info,
1588 1701
1589 if (first && !writenote(&info->psinfo, file, foffset)) 1702 if (first && !writenote(&info->psinfo, file, foffset))
1590 return 0; 1703 return 0;
1704 if (first && !writenote(&info->signote, file, foffset))
1705 return 0;
1591 if (first && !writenote(&info->auxv, file, foffset)) 1706 if (first && !writenote(&info->auxv, file, foffset))
1592 return 0; 1707 return 0;
1708 if (first && !writenote(&info->files, file, foffset))
1709 return 0;
1593 1710
1594 for (i = 1; i < info->thread_notes; ++i) 1711 for (i = 1; i < info->thread_notes; ++i)
1595 if (t->notes[i].data && 1712 if (t->notes[i].data &&
@@ -1616,6 +1733,7 @@ static void free_note_info(struct elf_note_info *info)
1616 kfree(t); 1733 kfree(t);
1617 } 1734 }
1618 kfree(info->psinfo.data); 1735 kfree(info->psinfo.data);
1736 vfree(info->files.data);
1619} 1737}
1620 1738
1621#else 1739#else
@@ -1681,6 +1799,7 @@ struct elf_note_info {
1681#ifdef ELF_CORE_COPY_XFPREGS 1799#ifdef ELF_CORE_COPY_XFPREGS
1682 elf_fpxregset_t *xfpu; 1800 elf_fpxregset_t *xfpu;
1683#endif 1801#endif
1802 user_siginfo_t csigdata;
1684 int thread_status_size; 1803 int thread_status_size;
1685 int numnote; 1804 int numnote;
1686}; 1805};
@@ -1690,48 +1809,37 @@ static int elf_note_info_init(struct elf_note_info *info)
1690 memset(info, 0, sizeof(*info)); 1809 memset(info, 0, sizeof(*info));
1691 INIT_LIST_HEAD(&info->thread_list); 1810 INIT_LIST_HEAD(&info->thread_list);
1692 1811
1693 /* Allocate space for six ELF notes */ 1812 /* Allocate space for ELF notes */
1694 info->notes = kmalloc(6 * sizeof(struct memelfnote), GFP_KERNEL); 1813 info->notes = kmalloc(8 * sizeof(struct memelfnote), GFP_KERNEL);
1695 if (!info->notes) 1814 if (!info->notes)
1696 return 0; 1815 return 0;
1697 info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL); 1816 info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
1698 if (!info->psinfo) 1817 if (!info->psinfo)
1699 goto notes_free; 1818 return 0;
1700 info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL); 1819 info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
1701 if (!info->prstatus) 1820 if (!info->prstatus)
1702 goto psinfo_free; 1821 return 0;
1703 info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL); 1822 info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
1704 if (!info->fpu) 1823 if (!info->fpu)
1705 goto prstatus_free; 1824 return 0;
1706#ifdef ELF_CORE_COPY_XFPREGS 1825#ifdef ELF_CORE_COPY_XFPREGS
1707 info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL); 1826 info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);
1708 if (!info->xfpu) 1827 if (!info->xfpu)
1709 goto fpu_free; 1828 return 0;
1710#endif 1829#endif
1711 return 1; 1830 return 1;
1712#ifdef ELF_CORE_COPY_XFPREGS
1713 fpu_free:
1714 kfree(info->fpu);
1715#endif
1716 prstatus_free:
1717 kfree(info->prstatus);
1718 psinfo_free:
1719 kfree(info->psinfo);
1720 notes_free:
1721 kfree(info->notes);
1722 return 0;
1723} 1831}
1724 1832
1725static int fill_note_info(struct elfhdr *elf, int phdrs, 1833static int fill_note_info(struct elfhdr *elf, int phdrs,
1726 struct elf_note_info *info, 1834 struct elf_note_info *info,
1727 long signr, struct pt_regs *regs) 1835 siginfo_t *siginfo, struct pt_regs *regs)
1728{ 1836{
1729 struct list_head *t; 1837 struct list_head *t;
1730 1838
1731 if (!elf_note_info_init(info)) 1839 if (!elf_note_info_init(info))
1732 return 0; 1840 return 0;
1733 1841
1734 if (signr) { 1842 if (siginfo->si_signo) {
1735 struct core_thread *ct; 1843 struct core_thread *ct;
1736 struct elf_thread_status *ets; 1844 struct elf_thread_status *ets;
1737 1845
@@ -1749,13 +1857,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
1749 int sz; 1857 int sz;
1750 1858
1751 ets = list_entry(t, struct elf_thread_status, list); 1859 ets = list_entry(t, struct elf_thread_status, list);
1752 sz = elf_dump_thread_status(signr, ets); 1860 sz = elf_dump_thread_status(siginfo->si_signo, ets);
1753 info->thread_status_size += sz; 1861 info->thread_status_size += sz;
1754 } 1862 }
1755 } 1863 }
1756 /* now collect the dump for the current */ 1864 /* now collect the dump for the current */
1757 memset(info->prstatus, 0, sizeof(*info->prstatus)); 1865 memset(info->prstatus, 0, sizeof(*info->prstatus));
1758 fill_prstatus(info->prstatus, current, signr); 1866 fill_prstatus(info->prstatus, current, siginfo->si_signo);
1759 elf_core_copy_regs(&info->prstatus->pr_reg, regs); 1867 elf_core_copy_regs(&info->prstatus->pr_reg, regs);
1760 1868
1761 /* Set up header */ 1869 /* Set up header */
@@ -1772,9 +1880,11 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
1772 fill_note(info->notes + 1, "CORE", NT_PRPSINFO, 1880 fill_note(info->notes + 1, "CORE", NT_PRPSINFO,
1773 sizeof(*info->psinfo), info->psinfo); 1881 sizeof(*info->psinfo), info->psinfo);
1774 1882
1775 info->numnote = 2; 1883 fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo);
1884 fill_auxv_note(info->notes + 3, current->mm);
1885 fill_files_note(info->notes + 4);
1776 1886
1777 fill_auxv_note(&info->notes[info->numnote++], current->mm); 1887 info->numnote = 5;
1778 1888
1779 /* Try to dump the FPU. */ 1889 /* Try to dump the FPU. */
1780 info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs, 1890 info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs,
@@ -1836,6 +1946,9 @@ static void free_note_info(struct elf_note_info *info)
1836 kfree(list_entry(tmp, struct elf_thread_status, list)); 1946 kfree(list_entry(tmp, struct elf_thread_status, list));
1837 } 1947 }
1838 1948
1949 /* Free data allocated by fill_files_note(): */
1950 vfree(info->notes[4].data);
1951
1839 kfree(info->prstatus); 1952 kfree(info->prstatus);
1840 kfree(info->psinfo); 1953 kfree(info->psinfo);
1841 kfree(info->notes); 1954 kfree(info->notes);
@@ -1962,7 +2075,7 @@ static int elf_core_dump(struct coredump_params *cprm)
1962 * Collect all the non-memory information about the process for the 2075 * Collect all the non-memory information about the process for the
1963 * notes. This also sets up the file header. 2076 * notes. This also sets up the file header.
1964 */ 2077 */
1965 if (!fill_note_info(elf, e_phnum, &info, cprm->signr, cprm->regs)) 2078 if (!fill_note_info(elf, e_phnum, &info, cprm->siginfo, cprm->regs))
1966 goto cleanup; 2079 goto cleanup;
1967 2080
1968 has_dumped = 1; 2081 has_dumped = 1;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 3d77cf81ba3c..a46049154107 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -39,7 +39,6 @@
39#include <asm/uaccess.h> 39#include <asm/uaccess.h>
40#include <asm/param.h> 40#include <asm/param.h>
41#include <asm/pgalloc.h> 41#include <asm/pgalloc.h>
42#include <asm/exec.h>
43 42
44typedef char *elf_caddr_t; 43typedef char *elf_caddr_t;
45 44
@@ -1205,7 +1204,7 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags)
1205 int dump_ok; 1204 int dump_ok;
1206 1205
1207 /* Do not dump I/O mapped devices or special mappings */ 1206 /* Do not dump I/O mapped devices or special mappings */
1208 if (vma->vm_flags & (VM_IO | VM_RESERVED)) { 1207 if (vma->vm_flags & VM_IO) {
1209 kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); 1208 kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags);
1210 return 0; 1209 return 0;
1211 } 1210 }
@@ -1642,7 +1641,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
1642 goto cleanup; 1641 goto cleanup;
1643#endif 1642#endif
1644 1643
1645 if (cprm->signr) { 1644 if (cprm->siginfo->si_signo) {
1646 struct core_thread *ct; 1645 struct core_thread *ct;
1647 struct elf_thread_status *tmp; 1646 struct elf_thread_status *tmp;
1648 1647
@@ -1661,13 +1660,13 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
1661 int sz; 1660 int sz;
1662 1661
1663 tmp = list_entry(t, struct elf_thread_status, list); 1662 tmp = list_entry(t, struct elf_thread_status, list);
1664 sz = elf_dump_thread_status(cprm->signr, tmp); 1663 sz = elf_dump_thread_status(cprm->siginfo->si_signo, tmp);
1665 thread_status_size += sz; 1664 thread_status_size += sz;
1666 } 1665 }
1667 } 1666 }
1668 1667
1669 /* now collect the dump for the current */ 1668 /* now collect the dump for the current */
1670 fill_prstatus(prstatus, current, cprm->signr); 1669 fill_prstatus(prstatus, current, cprm->siginfo->si_signo);
1671 elf_core_copy_regs(&prstatus->pr_reg, cprm->regs); 1670 elf_core_copy_regs(&prstatus->pr_reg, cprm->regs);
1672 1671
1673 segs = current->mm->map_count; 1672 segs = current->mm->map_count;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 178cb70acc26..e280352b28f9 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -107,7 +107,7 @@ static struct linux_binfmt flat_format = {
107static int flat_core_dump(struct coredump_params *cprm) 107static int flat_core_dump(struct coredump_params *cprm)
108{ 108{
109 printk("Process %s:%d received signr %d and should have core dumped\n", 109 printk("Process %s:%d received signr %d and should have core dumped\n",
110 current->comm, current->pid, (int) cprm->signr); 110 current->comm, current->pid, (int) cprm->siginfo->si_signo);
111 return(1); 111 return(1);
112} 112}
113 113
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index e85c04b9f61c..a3f28f331b2b 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -70,23 +70,25 @@ static inline int use_bip_pool(unsigned int idx)
70} 70}
71 71
72/** 72/**
73 * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio 73 * bio_integrity_alloc - Allocate integrity payload and attach it to bio
74 * @bio: bio to attach integrity metadata to 74 * @bio: bio to attach integrity metadata to
75 * @gfp_mask: Memory allocation mask 75 * @gfp_mask: Memory allocation mask
76 * @nr_vecs: Number of integrity metadata scatter-gather elements 76 * @nr_vecs: Number of integrity metadata scatter-gather elements
77 * @bs: bio_set to allocate from
78 * 77 *
79 * Description: This function prepares a bio for attaching integrity 78 * Description: This function prepares a bio for attaching integrity
80 * metadata. nr_vecs specifies the maximum number of pages containing 79 * metadata. nr_vecs specifies the maximum number of pages containing
81 * integrity metadata that can be attached. 80 * integrity metadata that can be attached.
82 */ 81 */
83struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, 82struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
84 gfp_t gfp_mask, 83 gfp_t gfp_mask,
85 unsigned int nr_vecs, 84 unsigned int nr_vecs)
86 struct bio_set *bs)
87{ 85{
88 struct bio_integrity_payload *bip; 86 struct bio_integrity_payload *bip;
89 unsigned int idx = vecs_to_idx(nr_vecs); 87 unsigned int idx = vecs_to_idx(nr_vecs);
88 struct bio_set *bs = bio->bi_pool;
89
90 if (!bs)
91 bs = fs_bio_set;
90 92
91 BUG_ON(bio == NULL); 93 BUG_ON(bio == NULL);
92 bip = NULL; 94 bip = NULL;
@@ -114,37 +116,22 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
114 116
115 return bip; 117 return bip;
116} 118}
117EXPORT_SYMBOL(bio_integrity_alloc_bioset);
118
119/**
120 * bio_integrity_alloc - Allocate integrity payload and attach it to bio
121 * @bio: bio to attach integrity metadata to
122 * @gfp_mask: Memory allocation mask
123 * @nr_vecs: Number of integrity metadata scatter-gather elements
124 *
125 * Description: This function prepares a bio for attaching integrity
126 * metadata. nr_vecs specifies the maximum number of pages containing
127 * integrity metadata that can be attached.
128 */
129struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
130 gfp_t gfp_mask,
131 unsigned int nr_vecs)
132{
133 return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
134}
135EXPORT_SYMBOL(bio_integrity_alloc); 119EXPORT_SYMBOL(bio_integrity_alloc);
136 120
137/** 121/**
138 * bio_integrity_free - Free bio integrity payload 122 * bio_integrity_free - Free bio integrity payload
139 * @bio: bio containing bip to be freed 123 * @bio: bio containing bip to be freed
140 * @bs: bio_set this bio was allocated from
141 * 124 *
142 * Description: Used to free the integrity portion of a bio. Usually 125 * Description: Used to free the integrity portion of a bio. Usually
143 * called from bio_free(). 126 * called from bio_free().
144 */ 127 */
145void bio_integrity_free(struct bio *bio, struct bio_set *bs) 128void bio_integrity_free(struct bio *bio)
146{ 129{
147 struct bio_integrity_payload *bip = bio->bi_integrity; 130 struct bio_integrity_payload *bip = bio->bi_integrity;
131 struct bio_set *bs = bio->bi_pool;
132
133 if (!bs)
134 bs = fs_bio_set;
148 135
149 BUG_ON(bip == NULL); 136 BUG_ON(bip == NULL);
150 137
@@ -730,19 +717,18 @@ EXPORT_SYMBOL(bio_integrity_split);
730 * @bio: New bio 717 * @bio: New bio
731 * @bio_src: Original bio 718 * @bio_src: Original bio
732 * @gfp_mask: Memory allocation mask 719 * @gfp_mask: Memory allocation mask
733 * @bs: bio_set to allocate bip from
734 * 720 *
735 * Description: Called to allocate a bip when cloning a bio 721 * Description: Called to allocate a bip when cloning a bio
736 */ 722 */
737int bio_integrity_clone(struct bio *bio, struct bio *bio_src, 723int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
738 gfp_t gfp_mask, struct bio_set *bs) 724 gfp_t gfp_mask)
739{ 725{
740 struct bio_integrity_payload *bip_src = bio_src->bi_integrity; 726 struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
741 struct bio_integrity_payload *bip; 727 struct bio_integrity_payload *bip;
742 728
743 BUG_ON(bip_src == NULL); 729 BUG_ON(bip_src == NULL);
744 730
745 bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs); 731 bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
746 732
747 if (bip == NULL) 733 if (bip == NULL)
748 return -EIO; 734 return -EIO;
diff --git a/fs/bio.c b/fs/bio.c
index 71072ab99128..9298c65ad9c7 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -55,6 +55,7 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
55 * IO code that does not need private memory pools. 55 * IO code that does not need private memory pools.
56 */ 56 */
57struct bio_set *fs_bio_set; 57struct bio_set *fs_bio_set;
58EXPORT_SYMBOL(fs_bio_set);
58 59
59/* 60/*
60 * Our slab pool management 61 * Our slab pool management
@@ -233,26 +234,37 @@ fallback:
233 return bvl; 234 return bvl;
234} 235}
235 236
236void bio_free(struct bio *bio, struct bio_set *bs) 237static void __bio_free(struct bio *bio)
237{ 238{
239 bio_disassociate_task(bio);
240
241 if (bio_integrity(bio))
242 bio_integrity_free(bio);
243}
244
245static void bio_free(struct bio *bio)
246{
247 struct bio_set *bs = bio->bi_pool;
238 void *p; 248 void *p;
239 249
240 if (bio_has_allocated_vec(bio)) 250 __bio_free(bio);
241 bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
242 251
243 if (bio_integrity(bio)) 252 if (bs) {
244 bio_integrity_free(bio, bs); 253 if (bio_has_allocated_vec(bio))
254 bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
245 255
246 /* 256 /*
247 * If we have front padding, adjust the bio pointer before freeing 257 * If we have front padding, adjust the bio pointer before freeing
248 */ 258 */
249 p = bio; 259 p = bio;
250 if (bs->front_pad)
251 p -= bs->front_pad; 260 p -= bs->front_pad;
252 261
253 mempool_free(p, bs->bio_pool); 262 mempool_free(p, bs->bio_pool);
263 } else {
264 /* Bio was allocated by bio_kmalloc() */
265 kfree(bio);
266 }
254} 267}
255EXPORT_SYMBOL(bio_free);
256 268
257void bio_init(struct bio *bio) 269void bio_init(struct bio *bio)
258{ 270{
@@ -263,48 +275,85 @@ void bio_init(struct bio *bio)
263EXPORT_SYMBOL(bio_init); 275EXPORT_SYMBOL(bio_init);
264 276
265/** 277/**
278 * bio_reset - reinitialize a bio
279 * @bio: bio to reset
280 *
281 * Description:
282 * After calling bio_reset(), @bio will be in the same state as a freshly
283 * allocated bio returned bio bio_alloc_bioset() - the only fields that are
284 * preserved are the ones that are initialized by bio_alloc_bioset(). See
285 * comment in struct bio.
286 */
287void bio_reset(struct bio *bio)
288{
289 unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
290
291 __bio_free(bio);
292
293 memset(bio, 0, BIO_RESET_BYTES);
294 bio->bi_flags = flags|(1 << BIO_UPTODATE);
295}
296EXPORT_SYMBOL(bio_reset);
297
298/**
266 * bio_alloc_bioset - allocate a bio for I/O 299 * bio_alloc_bioset - allocate a bio for I/O
267 * @gfp_mask: the GFP_ mask given to the slab allocator 300 * @gfp_mask: the GFP_ mask given to the slab allocator
268 * @nr_iovecs: number of iovecs to pre-allocate 301 * @nr_iovecs: number of iovecs to pre-allocate
269 * @bs: the bio_set to allocate from. 302 * @bs: the bio_set to allocate from.
270 * 303 *
271 * Description: 304 * Description:
272 * bio_alloc_bioset will try its own mempool to satisfy the allocation. 305 * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
273 * If %__GFP_WAIT is set then we will block on the internal pool waiting 306 * backed by the @bs's mempool.
274 * for a &struct bio to become free.
275 * 307 *
276 * Note that the caller must set ->bi_destructor on successful return 308 * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be
277 * of a bio, to do the appropriate freeing of the bio once the reference 309 * able to allocate a bio. This is due to the mempool guarantees. To make this
278 * count drops to zero. 310 * work, callers must never allocate more than 1 bio at a time from this pool.
279 **/ 311 * Callers that need to allocate more than 1 bio must always submit the
312 * previously allocated bio for IO before attempting to allocate a new one.
313 * Failure to do so can cause deadlocks under memory pressure.
314 *
315 * RETURNS:
316 * Pointer to new bio on success, NULL on failure.
317 */
280struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) 318struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
281{ 319{
320 unsigned front_pad;
321 unsigned inline_vecs;
282 unsigned long idx = BIO_POOL_NONE; 322 unsigned long idx = BIO_POOL_NONE;
283 struct bio_vec *bvl = NULL; 323 struct bio_vec *bvl = NULL;
284 struct bio *bio; 324 struct bio *bio;
285 void *p; 325 void *p;
286 326
287 p = mempool_alloc(bs->bio_pool, gfp_mask); 327 if (!bs) {
328 if (nr_iovecs > UIO_MAXIOV)
329 return NULL;
330
331 p = kmalloc(sizeof(struct bio) +
332 nr_iovecs * sizeof(struct bio_vec),
333 gfp_mask);
334 front_pad = 0;
335 inline_vecs = nr_iovecs;
336 } else {
337 p = mempool_alloc(bs->bio_pool, gfp_mask);
338 front_pad = bs->front_pad;
339 inline_vecs = BIO_INLINE_VECS;
340 }
341
288 if (unlikely(!p)) 342 if (unlikely(!p))
289 return NULL; 343 return NULL;
290 bio = p + bs->front_pad;
291 344
345 bio = p + front_pad;
292 bio_init(bio); 346 bio_init(bio);
293 347
294 if (unlikely(!nr_iovecs)) 348 if (nr_iovecs > inline_vecs) {
295 goto out_set;
296
297 if (nr_iovecs <= BIO_INLINE_VECS) {
298 bvl = bio->bi_inline_vecs;
299 nr_iovecs = BIO_INLINE_VECS;
300 } else {
301 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); 349 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
302 if (unlikely(!bvl)) 350 if (unlikely(!bvl))
303 goto err_free; 351 goto err_free;
304 352 } else if (nr_iovecs) {
305 nr_iovecs = bvec_nr_vecs(idx); 353 bvl = bio->bi_inline_vecs;
306 } 354 }
307out_set: 355
356 bio->bi_pool = bs;
308 bio->bi_flags |= idx << BIO_POOL_OFFSET; 357 bio->bi_flags |= idx << BIO_POOL_OFFSET;
309 bio->bi_max_vecs = nr_iovecs; 358 bio->bi_max_vecs = nr_iovecs;
310 bio->bi_io_vec = bvl; 359 bio->bi_io_vec = bvl;
@@ -316,80 +365,6 @@ err_free:
316} 365}
317EXPORT_SYMBOL(bio_alloc_bioset); 366EXPORT_SYMBOL(bio_alloc_bioset);
318 367
319static void bio_fs_destructor(struct bio *bio)
320{
321 bio_free(bio, fs_bio_set);
322}
323
324/**
325 * bio_alloc - allocate a new bio, memory pool backed
326 * @gfp_mask: allocation mask to use
327 * @nr_iovecs: number of iovecs
328 *
329 * bio_alloc will allocate a bio and associated bio_vec array that can hold
330 * at least @nr_iovecs entries. Allocations will be done from the
331 * fs_bio_set. Also see @bio_alloc_bioset and @bio_kmalloc.
332 *
333 * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
334 * a bio. This is due to the mempool guarantees. To make this work, callers
335 * must never allocate more than 1 bio at a time from this pool. Callers
336 * that need to allocate more than 1 bio must always submit the previously
337 * allocated bio for IO before attempting to allocate a new one. Failure to
338 * do so can cause livelocks under memory pressure.
339 *
340 * RETURNS:
341 * Pointer to new bio on success, NULL on failure.
342 */
343struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
344{
345 struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
346
347 if (bio)
348 bio->bi_destructor = bio_fs_destructor;
349
350 return bio;
351}
352EXPORT_SYMBOL(bio_alloc);
353
354static void bio_kmalloc_destructor(struct bio *bio)
355{
356 if (bio_integrity(bio))
357 bio_integrity_free(bio, fs_bio_set);
358 kfree(bio);
359}
360
361/**
362 * bio_kmalloc - allocate a bio for I/O using kmalloc()
363 * @gfp_mask: the GFP_ mask given to the slab allocator
364 * @nr_iovecs: number of iovecs to pre-allocate
365 *
366 * Description:
367 * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask contains
368 * %__GFP_WAIT, the allocation is guaranteed to succeed.
369 *
370 **/
371struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
372{
373 struct bio *bio;
374
375 if (nr_iovecs > UIO_MAXIOV)
376 return NULL;
377
378 bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
379 gfp_mask);
380 if (unlikely(!bio))
381 return NULL;
382
383 bio_init(bio);
384 bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
385 bio->bi_max_vecs = nr_iovecs;
386 bio->bi_io_vec = bio->bi_inline_vecs;
387 bio->bi_destructor = bio_kmalloc_destructor;
388
389 return bio;
390}
391EXPORT_SYMBOL(bio_kmalloc);
392
393void zero_fill_bio(struct bio *bio) 368void zero_fill_bio(struct bio *bio)
394{ 369{
395 unsigned long flags; 370 unsigned long flags;
@@ -420,11 +395,8 @@ void bio_put(struct bio *bio)
420 /* 395 /*
421 * last put frees it 396 * last put frees it
422 */ 397 */
423 if (atomic_dec_and_test(&bio->bi_cnt)) { 398 if (atomic_dec_and_test(&bio->bi_cnt))
424 bio_disassociate_task(bio); 399 bio_free(bio);
425 bio->bi_next = NULL;
426 bio->bi_destructor(bio);
427 }
428} 400}
429EXPORT_SYMBOL(bio_put); 401EXPORT_SYMBOL(bio_put);
430 402
@@ -466,26 +438,28 @@ void __bio_clone(struct bio *bio, struct bio *bio_src)
466EXPORT_SYMBOL(__bio_clone); 438EXPORT_SYMBOL(__bio_clone);
467 439
468/** 440/**
469 * bio_clone - clone a bio 441 * bio_clone_bioset - clone a bio
470 * @bio: bio to clone 442 * @bio: bio to clone
471 * @gfp_mask: allocation priority 443 * @gfp_mask: allocation priority
444 * @bs: bio_set to allocate from
472 * 445 *
473 * Like __bio_clone, only also allocates the returned bio 446 * Like __bio_clone, only also allocates the returned bio
474 */ 447 */
475struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) 448struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask,
449 struct bio_set *bs)
476{ 450{
477 struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set); 451 struct bio *b;
478 452
453 b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, bs);
479 if (!b) 454 if (!b)
480 return NULL; 455 return NULL;
481 456
482 b->bi_destructor = bio_fs_destructor;
483 __bio_clone(b, bio); 457 __bio_clone(b, bio);
484 458
485 if (bio_integrity(bio)) { 459 if (bio_integrity(bio)) {
486 int ret; 460 int ret;
487 461
488 ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set); 462 ret = bio_integrity_clone(b, bio, gfp_mask);
489 463
490 if (ret < 0) { 464 if (ret < 0) {
491 bio_put(b); 465 bio_put(b);
@@ -495,7 +469,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
495 469
496 return b; 470 return b;
497} 471}
498EXPORT_SYMBOL(bio_clone); 472EXPORT_SYMBOL(bio_clone_bioset);
499 473
500/** 474/**
501 * bio_get_nr_vecs - return approx number of vecs 475 * bio_get_nr_vecs - return approx number of vecs
@@ -1501,7 +1475,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
1501 trace_block_split(bdev_get_queue(bi->bi_bdev), bi, 1475 trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
1502 bi->bi_sector + first_sectors); 1476 bi->bi_sector + first_sectors);
1503 1477
1504 BUG_ON(bi->bi_vcnt != 1); 1478 BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0);
1505 BUG_ON(bi->bi_idx != 0); 1479 BUG_ON(bi->bi_idx != 0);
1506 atomic_set(&bp->cnt, 3); 1480 atomic_set(&bp->cnt, 3);
1507 bp->error = 0; 1481 bp->error = 0;
@@ -1511,17 +1485,22 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
1511 bp->bio2.bi_size -= first_sectors << 9; 1485 bp->bio2.bi_size -= first_sectors << 9;
1512 bp->bio1.bi_size = first_sectors << 9; 1486 bp->bio1.bi_size = first_sectors << 9;
1513 1487
1514 bp->bv1 = bi->bi_io_vec[0]; 1488 if (bi->bi_vcnt != 0) {
1515 bp->bv2 = bi->bi_io_vec[0]; 1489 bp->bv1 = bi->bi_io_vec[0];
1516 bp->bv2.bv_offset += first_sectors << 9; 1490 bp->bv2 = bi->bi_io_vec[0];
1517 bp->bv2.bv_len -= first_sectors << 9; 1491
1518 bp->bv1.bv_len = first_sectors << 9; 1492 if (bio_is_rw(bi)) {
1493 bp->bv2.bv_offset += first_sectors << 9;
1494 bp->bv2.bv_len -= first_sectors << 9;
1495 bp->bv1.bv_len = first_sectors << 9;
1496 }
1519 1497
1520 bp->bio1.bi_io_vec = &bp->bv1; 1498 bp->bio1.bi_io_vec = &bp->bv1;
1521 bp->bio2.bi_io_vec = &bp->bv2; 1499 bp->bio2.bi_io_vec = &bp->bv2;
1522 1500
1523 bp->bio1.bi_max_vecs = 1; 1501 bp->bio1.bi_max_vecs = 1;
1524 bp->bio2.bi_max_vecs = 1; 1502 bp->bio2.bi_max_vecs = 1;
1503 }
1525 1504
1526 bp->bio1.bi_end_io = bio_pair_end_1; 1505 bp->bio1.bi_end_io = bio_pair_end_1;
1527 bp->bio2.bi_end_io = bio_pair_end_2; 1506 bp->bio2.bi_end_io = bio_pair_end_2;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 38e721b35d45..b3c1d3dae77d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -116,6 +116,8 @@ EXPORT_SYMBOL(invalidate_bdev);
116 116
117int set_blocksize(struct block_device *bdev, int size) 117int set_blocksize(struct block_device *bdev, int size)
118{ 118{
119 struct address_space *mapping;
120
119 /* Size must be a power of two, and between 512 and PAGE_SIZE */ 121 /* Size must be a power of two, and between 512 and PAGE_SIZE */
120 if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) 122 if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
121 return -EINVAL; 123 return -EINVAL;
@@ -124,6 +126,19 @@ int set_blocksize(struct block_device *bdev, int size)
124 if (size < bdev_logical_block_size(bdev)) 126 if (size < bdev_logical_block_size(bdev))
125 return -EINVAL; 127 return -EINVAL;
126 128
129 /* Prevent starting I/O or mapping the device */
130 percpu_down_write(&bdev->bd_block_size_semaphore);
131
132 /* Check that the block device is not memory mapped */
133 mapping = bdev->bd_inode->i_mapping;
134 mutex_lock(&mapping->i_mmap_mutex);
135 if (mapping_mapped(mapping)) {
136 mutex_unlock(&mapping->i_mmap_mutex);
137 percpu_up_write(&bdev->bd_block_size_semaphore);
138 return -EBUSY;
139 }
140 mutex_unlock(&mapping->i_mmap_mutex);
141
127 /* Don't change the size if it is same as current */ 142 /* Don't change the size if it is same as current */
128 if (bdev->bd_block_size != size) { 143 if (bdev->bd_block_size != size) {
129 sync_blockdev(bdev); 144 sync_blockdev(bdev);
@@ -131,6 +146,9 @@ int set_blocksize(struct block_device *bdev, int size)
131 bdev->bd_inode->i_blkbits = blksize_bits(size); 146 bdev->bd_inode->i_blkbits = blksize_bits(size);
132 kill_bdev(bdev); 147 kill_bdev(bdev);
133 } 148 }
149
150 percpu_up_write(&bdev->bd_block_size_semaphore);
151
134 return 0; 152 return 0;
135} 153}
136 154
@@ -441,6 +459,12 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
441 struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); 459 struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
442 if (!ei) 460 if (!ei)
443 return NULL; 461 return NULL;
462
463 if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
464 kmem_cache_free(bdev_cachep, ei);
465 return NULL;
466 }
467
444 return &ei->vfs_inode; 468 return &ei->vfs_inode;
445} 469}
446 470
@@ -449,6 +473,8 @@ static void bdev_i_callback(struct rcu_head *head)
449 struct inode *inode = container_of(head, struct inode, i_rcu); 473 struct inode *inode = container_of(head, struct inode, i_rcu);
450 struct bdev_inode *bdi = BDEV_I(inode); 474 struct bdev_inode *bdi = BDEV_I(inode);
451 475
476 percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
477
452 kmem_cache_free(bdev_cachep, bdi); 478 kmem_cache_free(bdev_cachep, bdi);
453} 479}
454 480
@@ -1567,6 +1593,22 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1567 return blkdev_ioctl(bdev, mode, cmd, arg); 1593 return blkdev_ioctl(bdev, mode, cmd, arg);
1568} 1594}
1569 1595
1596ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
1597 unsigned long nr_segs, loff_t pos)
1598{
1599 ssize_t ret;
1600 struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
1601
1602 percpu_down_read(&bdev->bd_block_size_semaphore);
1603
1604 ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
1605
1606 percpu_up_read(&bdev->bd_block_size_semaphore);
1607
1608 return ret;
1609}
1610EXPORT_SYMBOL_GPL(blkdev_aio_read);
1611
1570/* 1612/*
1571 * Write data to the block device. Only intended for the block device itself 1613 * Write data to the block device. Only intended for the block device itself
1572 * and the raw driver which basically is a fake block device. 1614 * and the raw driver which basically is a fake block device.
@@ -1578,12 +1620,16 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1578 unsigned long nr_segs, loff_t pos) 1620 unsigned long nr_segs, loff_t pos)
1579{ 1621{
1580 struct file *file = iocb->ki_filp; 1622 struct file *file = iocb->ki_filp;
1623 struct block_device *bdev = I_BDEV(file->f_mapping->host);
1581 struct blk_plug plug; 1624 struct blk_plug plug;
1582 ssize_t ret; 1625 ssize_t ret;
1583 1626
1584 BUG_ON(iocb->ki_pos != pos); 1627 BUG_ON(iocb->ki_pos != pos);
1585 1628
1586 blk_start_plug(&plug); 1629 blk_start_plug(&plug);
1630
1631 percpu_down_read(&bdev->bd_block_size_semaphore);
1632
1587 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 1633 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
1588 if (ret > 0 || ret == -EIOCBQUEUED) { 1634 if (ret > 0 || ret == -EIOCBQUEUED) {
1589 ssize_t err; 1635 ssize_t err;
@@ -1592,11 +1638,29 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1592 if (err < 0 && ret > 0) 1638 if (err < 0 && ret > 0)
1593 ret = err; 1639 ret = err;
1594 } 1640 }
1641
1642 percpu_up_read(&bdev->bd_block_size_semaphore);
1643
1595 blk_finish_plug(&plug); 1644 blk_finish_plug(&plug);
1645
1596 return ret; 1646 return ret;
1597} 1647}
1598EXPORT_SYMBOL_GPL(blkdev_aio_write); 1648EXPORT_SYMBOL_GPL(blkdev_aio_write);
1599 1649
1650static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
1651{
1652 int ret;
1653 struct block_device *bdev = I_BDEV(file->f_mapping->host);
1654
1655 percpu_down_read(&bdev->bd_block_size_semaphore);
1656
1657 ret = generic_file_mmap(file, vma);
1658
1659 percpu_up_read(&bdev->bd_block_size_semaphore);
1660
1661 return ret;
1662}
1663
1600/* 1664/*
1601 * Try to release a page associated with block device when the system 1665 * Try to release a page associated with block device when the system
1602 * is under memory pressure. 1666 * is under memory pressure.
@@ -1627,9 +1691,9 @@ const struct file_operations def_blk_fops = {
1627 .llseek = block_llseek, 1691 .llseek = block_llseek,
1628 .read = do_sync_read, 1692 .read = do_sync_read,
1629 .write = do_sync_write, 1693 .write = do_sync_write,
1630 .aio_read = generic_file_aio_read, 1694 .aio_read = blkdev_aio_read,
1631 .aio_write = blkdev_aio_write, 1695 .aio_write = blkdev_aio_write,
1632 .mmap = generic_file_mmap, 1696 .mmap = blkdev_mmap,
1633 .fsync = blkdev_fsync, 1697 .fsync = blkdev_fsync,
1634 .unlocked_ioctl = block_ioctl, 1698 .unlocked_ioctl = block_ioctl,
1635#ifdef CONFIG_COMPAT 1699#ifdef CONFIG_COMPAT
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 761e2cd8fed1..0c16e3dbfd56 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -61,7 +61,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
61 size = __btrfs_getxattr(inode, name, value, size); 61 size = __btrfs_getxattr(inode, name, value, size);
62 } 62 }
63 if (size > 0) { 63 if (size > 0) {
64 acl = posix_acl_from_xattr(value, size); 64 acl = posix_acl_from_xattr(&init_user_ns, value, size);
65 } else if (size == -ENOENT || size == -ENODATA || size == 0) { 65 } else if (size == -ENOENT || size == -ENODATA || size == 0) {
66 /* FIXME, who returns -ENOENT? I think nobody */ 66 /* FIXME, who returns -ENOENT? I think nobody */
67 acl = NULL; 67 acl = NULL;
@@ -91,7 +91,7 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
91 return PTR_ERR(acl); 91 return PTR_ERR(acl);
92 if (acl == NULL) 92 if (acl == NULL)
93 return -ENODATA; 93 return -ENODATA;
94 ret = posix_acl_to_xattr(acl, value, size); 94 ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
95 posix_acl_release(acl); 95 posix_acl_release(acl);
96 96
97 return ret; 97 return ret;
@@ -141,7 +141,7 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
141 goto out; 141 goto out;
142 } 142 }
143 143
144 ret = posix_acl_to_xattr(acl, value, size); 144 ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
145 if (ret < 0) 145 if (ret < 0)
146 goto out; 146 goto out;
147 } 147 }
@@ -169,7 +169,7 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
169 return -EOPNOTSUPP; 169 return -EOPNOTSUPP;
170 170
171 if (value) { 171 if (value) {
172 acl = posix_acl_from_xattr(value, size); 172 acl = posix_acl_from_xattr(&init_user_ns, value, size);
173 if (IS_ERR(acl)) 173 if (IS_ERR(acl))
174 return PTR_ERR(acl); 174 return PTR_ERR(acl);
175 175
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ff6475f409d6..f3187938e081 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -16,6 +16,7 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/vmalloc.h>
19#include "ctree.h" 20#include "ctree.h"
20#include "disk-io.h" 21#include "disk-io.h"
21#include "backref.h" 22#include "backref.h"
@@ -231,7 +232,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
231 } 232 }
232 if (!ret) { 233 if (!ret) {
233 ret = ulist_add(parents, eb->start, 234 ret = ulist_add(parents, eb->start,
234 (unsigned long)eie, GFP_NOFS); 235 (uintptr_t)eie, GFP_NOFS);
235 if (ret < 0) 236 if (ret < 0)
236 break; 237 break;
237 if (!extent_item_pos) { 238 if (!extent_item_pos) {
@@ -363,8 +364,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
363 ULIST_ITER_INIT(&uiter); 364 ULIST_ITER_INIT(&uiter);
364 node = ulist_next(parents, &uiter); 365 node = ulist_next(parents, &uiter);
365 ref->parent = node ? node->val : 0; 366 ref->parent = node ? node->val : 0;
366 ref->inode_list = 367 ref->inode_list = node ?
367 node ? (struct extent_inode_elem *)node->aux : 0; 368 (struct extent_inode_elem *)(uintptr_t)node->aux : 0;
368 369
369 /* additional parents require new refs being added here */ 370 /* additional parents require new refs being added here */
370 while ((node = ulist_next(parents, &uiter))) { 371 while ((node = ulist_next(parents, &uiter))) {
@@ -375,8 +376,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
375 } 376 }
376 memcpy(new_ref, ref, sizeof(*ref)); 377 memcpy(new_ref, ref, sizeof(*ref));
377 new_ref->parent = node->val; 378 new_ref->parent = node->val;
378 new_ref->inode_list = 379 new_ref->inode_list = (struct extent_inode_elem *)
379 (struct extent_inode_elem *)node->aux; 380 (uintptr_t)node->aux;
380 list_add(&new_ref->list, &ref->list); 381 list_add(&new_ref->list, &ref->list);
381 } 382 }
382 ulist_reinit(parents); 383 ulist_reinit(parents);
@@ -914,8 +915,8 @@ again:
914 free_extent_buffer(eb); 915 free_extent_buffer(eb);
915 } 916 }
916 ret = ulist_add_merge(refs, ref->parent, 917 ret = ulist_add_merge(refs, ref->parent,
917 (unsigned long)ref->inode_list, 918 (uintptr_t)ref->inode_list,
918 (unsigned long *)&eie, GFP_NOFS); 919 (u64 *)&eie, GFP_NOFS);
919 if (!ret && extent_item_pos) { 920 if (!ret && extent_item_pos) {
920 /* 921 /*
921 * we've recorded that parent, so we must extend 922 * we've recorded that parent, so we must extend
@@ -959,7 +960,7 @@ static void free_leaf_list(struct ulist *blocks)
959 while ((node = ulist_next(blocks, &uiter))) { 960 while ((node = ulist_next(blocks, &uiter))) {
960 if (!node->aux) 961 if (!node->aux)
961 continue; 962 continue;
962 eie = (struct extent_inode_elem *)node->aux; 963 eie = (struct extent_inode_elem *)(uintptr_t)node->aux;
963 for (; eie; eie = eie_next) { 964 for (; eie; eie = eie_next) {
964 eie_next = eie->next; 965 eie_next = eie->next;
965 kfree(eie); 966 kfree(eie);
@@ -1108,26 +1109,80 @@ static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
1108 found_key); 1109 found_key);
1109} 1110}
1110 1111
1111/* 1112int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
1112 * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements 1113 u64 start_off, struct btrfs_path *path,
1113 * of the path are separated by '/' and the path is guaranteed to be 1114 struct btrfs_inode_extref **ret_extref,
1114 * 0-terminated. the path is only given within the current file system. 1115 u64 *found_off)
1115 * Therefore, it never starts with a '/'. the caller is responsible to provide 1116{
1116 * "size" bytes in "dest". the dest buffer will be filled backwards. finally, 1117 int ret, slot;
1117 * the start point of the resulting string is returned. this pointer is within 1118 struct btrfs_key key;
1118 * dest, normally. 1119 struct btrfs_key found_key;
1119 * in case the path buffer would overflow, the pointer is decremented further 1120 struct btrfs_inode_extref *extref;
1120 * as if output was written to the buffer, though no more output is actually 1121 struct extent_buffer *leaf;
1121 * generated. that way, the caller can determine how much space would be 1122 unsigned long ptr;
1122 * required for the path to fit into the buffer. in that case, the returned 1123
1123 * value will be smaller than dest. callers must check this! 1124 key.objectid = inode_objectid;
1124 */ 1125 btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
1125char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, 1126 key.offset = start_off;
1126 struct btrfs_inode_ref *iref, 1127
1128 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1129 if (ret < 0)
1130 return ret;
1131
1132 while (1) {
1133 leaf = path->nodes[0];
1134 slot = path->slots[0];
1135 if (slot >= btrfs_header_nritems(leaf)) {
1136 /*
1137 * If the item at offset is not found,
1138 * btrfs_search_slot will point us to the slot
1139 * where it should be inserted. In our case
1140 * that will be the slot directly before the
1141 * next INODE_REF_KEY_V2 item. In the case
1142 * that we're pointing to the last slot in a
1143 * leaf, we must move one leaf over.
1144 */
1145 ret = btrfs_next_leaf(root, path);
1146 if (ret) {
1147 if (ret >= 1)
1148 ret = -ENOENT;
1149 break;
1150 }
1151 continue;
1152 }
1153
1154 btrfs_item_key_to_cpu(leaf, &found_key, slot);
1155
1156 /*
1157 * Check that we're still looking at an extended ref key for
1158 * this particular objectid. If we have different
1159 * objectid or type then there are no more to be found
1160 * in the tree and we can exit.
1161 */
1162 ret = -ENOENT;
1163 if (found_key.objectid != inode_objectid)
1164 break;
1165 if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY)
1166 break;
1167
1168 ret = 0;
1169 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1170 extref = (struct btrfs_inode_extref *)ptr;
1171 *ret_extref = extref;
1172 if (found_off)
1173 *found_off = found_key.offset;
1174 break;
1175 }
1176
1177 return ret;
1178}
1179
1180static char *ref_to_path(struct btrfs_root *fs_root,
1181 struct btrfs_path *path,
1182 u32 name_len, unsigned long name_off,
1127 struct extent_buffer *eb_in, u64 parent, 1183 struct extent_buffer *eb_in, u64 parent,
1128 char *dest, u32 size) 1184 char *dest, u32 size)
1129{ 1185{
1130 u32 len;
1131 int slot; 1186 int slot;
1132 u64 next_inum; 1187 u64 next_inum;
1133 int ret; 1188 int ret;
@@ -1135,17 +1190,17 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
1135 struct extent_buffer *eb = eb_in; 1190 struct extent_buffer *eb = eb_in;
1136 struct btrfs_key found_key; 1191 struct btrfs_key found_key;
1137 int leave_spinning = path->leave_spinning; 1192 int leave_spinning = path->leave_spinning;
1193 struct btrfs_inode_ref *iref;
1138 1194
1139 if (bytes_left >= 0) 1195 if (bytes_left >= 0)
1140 dest[bytes_left] = '\0'; 1196 dest[bytes_left] = '\0';
1141 1197
1142 path->leave_spinning = 1; 1198 path->leave_spinning = 1;
1143 while (1) { 1199 while (1) {
1144 len = btrfs_inode_ref_name_len(eb, iref); 1200 bytes_left -= name_len;
1145 bytes_left -= len;
1146 if (bytes_left >= 0) 1201 if (bytes_left >= 0)
1147 read_extent_buffer(eb, dest + bytes_left, 1202 read_extent_buffer(eb, dest + bytes_left,
1148 (unsigned long)(iref + 1), len); 1203 name_off, name_len);
1149 if (eb != eb_in) { 1204 if (eb != eb_in) {
1150 btrfs_tree_read_unlock_blocking(eb); 1205 btrfs_tree_read_unlock_blocking(eb);
1151 free_extent_buffer(eb); 1206 free_extent_buffer(eb);
@@ -1155,6 +1210,7 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
1155 ret = -ENOENT; 1210 ret = -ENOENT;
1156 if (ret) 1211 if (ret)
1157 break; 1212 break;
1213
1158 next_inum = found_key.offset; 1214 next_inum = found_key.offset;
1159 1215
1160 /* regular exit ahead */ 1216 /* regular exit ahead */
@@ -1170,8 +1226,11 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
1170 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 1226 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1171 } 1227 }
1172 btrfs_release_path(path); 1228 btrfs_release_path(path);
1173
1174 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); 1229 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
1230
1231 name_len = btrfs_inode_ref_name_len(eb, iref);
1232 name_off = (unsigned long)(iref + 1);
1233
1175 parent = next_inum; 1234 parent = next_inum;
1176 --bytes_left; 1235 --bytes_left;
1177 if (bytes_left >= 0) 1236 if (bytes_left >= 0)
@@ -1188,12 +1247,39 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
1188} 1247}
1189 1248
1190/* 1249/*
1250 * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
1251 * of the path are separated by '/' and the path is guaranteed to be
1252 * 0-terminated. the path is only given within the current file system.
1253 * Therefore, it never starts with a '/'. the caller is responsible to provide
1254 * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
1255 * the start point of the resulting string is returned. this pointer is within
1256 * dest, normally.
1257 * in case the path buffer would overflow, the pointer is decremented further
1258 * as if output was written to the buffer, though no more output is actually
1259 * generated. that way, the caller can determine how much space would be
1260 * required for the path to fit into the buffer. in that case, the returned
1261 * value will be smaller than dest. callers must check this!
1262 */
1263char *btrfs_iref_to_path(struct btrfs_root *fs_root,
1264 struct btrfs_path *path,
1265 struct btrfs_inode_ref *iref,
1266 struct extent_buffer *eb_in, u64 parent,
1267 char *dest, u32 size)
1268{
1269 return ref_to_path(fs_root, path,
1270 btrfs_inode_ref_name_len(eb_in, iref),
1271 (unsigned long)(iref + 1),
1272 eb_in, parent, dest, size);
1273}
1274
1275/*
1191 * this makes the path point to (logical EXTENT_ITEM *) 1276 * this makes the path point to (logical EXTENT_ITEM *)
1192 * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for 1277 * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
1193 * tree blocks and <0 on error. 1278 * tree blocks and <0 on error.
1194 */ 1279 */
1195int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, 1280int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
1196 struct btrfs_path *path, struct btrfs_key *found_key) 1281 struct btrfs_path *path, struct btrfs_key *found_key,
1282 u64 *flags_ret)
1197{ 1283{
1198 int ret; 1284 int ret;
1199 u64 flags; 1285 u64 flags;
@@ -1237,10 +1323,17 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
1237 (unsigned long long)found_key->objectid, 1323 (unsigned long long)found_key->objectid,
1238 (unsigned long long)found_key->offset, 1324 (unsigned long long)found_key->offset,
1239 (unsigned long long)flags, item_size); 1325 (unsigned long long)flags, item_size);
1240 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) 1326
1241 return BTRFS_EXTENT_FLAG_TREE_BLOCK; 1327 WARN_ON(!flags_ret);
1242 if (flags & BTRFS_EXTENT_FLAG_DATA) 1328 if (flags_ret) {
1243 return BTRFS_EXTENT_FLAG_DATA; 1329 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1330 *flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK;
1331 else if (flags & BTRFS_EXTENT_FLAG_DATA)
1332 *flags_ret = BTRFS_EXTENT_FLAG_DATA;
1333 else
1334 BUG_ON(1);
1335 return 0;
1336 }
1244 1337
1245 return -EIO; 1338 return -EIO;
1246} 1339}
@@ -1404,12 +1497,13 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1404 ULIST_ITER_INIT(&root_uiter); 1497 ULIST_ITER_INIT(&root_uiter);
1405 while (!ret && (root_node = ulist_next(roots, &root_uiter))) { 1498 while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
1406 pr_debug("root %llu references leaf %llu, data list " 1499 pr_debug("root %llu references leaf %llu, data list "
1407 "%#lx\n", root_node->val, ref_node->val, 1500 "%#llx\n", root_node->val, ref_node->val,
1408 ref_node->aux); 1501 (long long)ref_node->aux);
1409 ret = iterate_leaf_refs( 1502 ret = iterate_leaf_refs((struct extent_inode_elem *)
1410 (struct extent_inode_elem *)ref_node->aux, 1503 (uintptr_t)ref_node->aux,
1411 root_node->val, extent_item_objectid, 1504 root_node->val,
1412 iterate, ctx); 1505 extent_item_objectid,
1506 iterate, ctx);
1413 } 1507 }
1414 ulist_free(roots); 1508 ulist_free(roots);
1415 roots = NULL; 1509 roots = NULL;
@@ -1432,15 +1526,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
1432{ 1526{
1433 int ret; 1527 int ret;
1434 u64 extent_item_pos; 1528 u64 extent_item_pos;
1529 u64 flags = 0;
1435 struct btrfs_key found_key; 1530 struct btrfs_key found_key;
1436 int search_commit_root = path->search_commit_root; 1531 int search_commit_root = path->search_commit_root;
1437 1532
1438 ret = extent_from_logical(fs_info, logical, path, 1533 ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
1439 &found_key);
1440 btrfs_release_path(path); 1534 btrfs_release_path(path);
1441 if (ret < 0) 1535 if (ret < 0)
1442 return ret; 1536 return ret;
1443 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) 1537 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1444 return -EINVAL; 1538 return -EINVAL;
1445 1539
1446 extent_item_pos = logical - found_key.objectid; 1540 extent_item_pos = logical - found_key.objectid;
@@ -1451,9 +1545,12 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
1451 return ret; 1545 return ret;
1452} 1546}
1453 1547
1454static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, 1548typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
1455 struct btrfs_path *path, 1549 struct extent_buffer *eb, void *ctx);
1456 iterate_irefs_t *iterate, void *ctx) 1550
1551static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
1552 struct btrfs_path *path,
1553 iterate_irefs_t *iterate, void *ctx)
1457{ 1554{
1458 int ret = 0; 1555 int ret = 0;
1459 int slot; 1556 int slot;
@@ -1470,7 +1567,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
1470 while (!ret) { 1567 while (!ret) {
1471 path->leave_spinning = 1; 1568 path->leave_spinning = 1;
1472 ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, 1569 ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
1473 &found_key); 1570 &found_key);
1474 if (ret < 0) 1571 if (ret < 0)
1475 break; 1572 break;
1476 if (ret) { 1573 if (ret) {
@@ -1498,7 +1595,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
1498 "tree %llu\n", cur, 1595 "tree %llu\n", cur,
1499 (unsigned long long)found_key.objectid, 1596 (unsigned long long)found_key.objectid,
1500 (unsigned long long)fs_root->objectid); 1597 (unsigned long long)fs_root->objectid);
1501 ret = iterate(parent, iref, eb, ctx); 1598 ret = iterate(parent, name_len,
1599 (unsigned long)(iref + 1), eb, ctx);
1502 if (ret) 1600 if (ret)
1503 break; 1601 break;
1504 len = sizeof(*iref) + name_len; 1602 len = sizeof(*iref) + name_len;
@@ -1513,12 +1611,98 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
1513 return ret; 1611 return ret;
1514} 1612}
1515 1613
1614static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
1615 struct btrfs_path *path,
1616 iterate_irefs_t *iterate, void *ctx)
1617{
1618 int ret;
1619 int slot;
1620 u64 offset = 0;
1621 u64 parent;
1622 int found = 0;
1623 struct extent_buffer *eb;
1624 struct btrfs_inode_extref *extref;
1625 struct extent_buffer *leaf;
1626 u32 item_size;
1627 u32 cur_offset;
1628 unsigned long ptr;
1629
1630 while (1) {
1631 ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref,
1632 &offset);
1633 if (ret < 0)
1634 break;
1635 if (ret) {
1636 ret = found ? 0 : -ENOENT;
1637 break;
1638 }
1639 ++found;
1640
1641 slot = path->slots[0];
1642 eb = path->nodes[0];
1643 /* make sure we can use eb after releasing the path */
1644 atomic_inc(&eb->refs);
1645
1646 btrfs_tree_read_lock(eb);
1647 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1648 btrfs_release_path(path);
1649
1650 leaf = path->nodes[0];
1651 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1652 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1653 cur_offset = 0;
1654
1655 while (cur_offset < item_size) {
1656 u32 name_len;
1657
1658 extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
1659 parent = btrfs_inode_extref_parent(eb, extref);
1660 name_len = btrfs_inode_extref_name_len(eb, extref);
1661 ret = iterate(parent, name_len,
1662 (unsigned long)&extref->name, eb, ctx);
1663 if (ret)
1664 break;
1665
1666 cur_offset += btrfs_inode_extref_name_len(leaf, extref);
1667 cur_offset += sizeof(*extref);
1668 }
1669 btrfs_tree_read_unlock_blocking(eb);
1670 free_extent_buffer(eb);
1671
1672 offset++;
1673 }
1674
1675 btrfs_release_path(path);
1676
1677 return ret;
1678}
1679
1680static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
1681 struct btrfs_path *path, iterate_irefs_t *iterate,
1682 void *ctx)
1683{
1684 int ret;
1685 int found_refs = 0;
1686
1687 ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
1688 if (!ret)
1689 ++found_refs;
1690 else if (ret != -ENOENT)
1691 return ret;
1692
1693 ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
1694 if (ret == -ENOENT && found_refs)
1695 return 0;
1696
1697 return ret;
1698}
1699
1516/* 1700/*
1517 * returns 0 if the path could be dumped (probably truncated) 1701 * returns 0 if the path could be dumped (probably truncated)
1518 * returns <0 in case of an error 1702 * returns <0 in case of an error
1519 */ 1703 */
1520static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, 1704static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
1521 struct extent_buffer *eb, void *ctx) 1705 struct extent_buffer *eb, void *ctx)
1522{ 1706{
1523 struct inode_fs_paths *ipath = ctx; 1707 struct inode_fs_paths *ipath = ctx;
1524 char *fspath; 1708 char *fspath;
@@ -1531,20 +1715,17 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
1531 ipath->fspath->bytes_left - s_ptr : 0; 1715 ipath->fspath->bytes_left - s_ptr : 0;
1532 1716
1533 fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; 1717 fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
1534 fspath = btrfs_iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb, 1718 fspath = ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
1535 inum, fspath_min, bytes_left); 1719 name_off, eb, inum, fspath_min,
1720 bytes_left);
1536 if (IS_ERR(fspath)) 1721 if (IS_ERR(fspath))
1537 return PTR_ERR(fspath); 1722 return PTR_ERR(fspath);
1538 1723
1539 if (fspath > fspath_min) { 1724 if (fspath > fspath_min) {
1540 pr_debug("path resolved: %s\n", fspath);
1541 ipath->fspath->val[i] = (u64)(unsigned long)fspath; 1725 ipath->fspath->val[i] = (u64)(unsigned long)fspath;
1542 ++ipath->fspath->elem_cnt; 1726 ++ipath->fspath->elem_cnt;
1543 ipath->fspath->bytes_left = fspath - fspath_min; 1727 ipath->fspath->bytes_left = fspath - fspath_min;
1544 } else { 1728 } else {
1545 pr_debug("missed path, not enough space. missing bytes: %lu, "
1546 "constructed so far: %s\n",
1547 (unsigned long)(fspath_min - fspath), fspath_min);
1548 ++ipath->fspath->elem_missed; 1729 ++ipath->fspath->elem_missed;
1549 ipath->fspath->bytes_missing += fspath_min - fspath; 1730 ipath->fspath->bytes_missing += fspath_min - fspath;
1550 ipath->fspath->bytes_left = 0; 1731 ipath->fspath->bytes_left = 0;
@@ -1566,7 +1747,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
1566int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) 1747int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
1567{ 1748{
1568 return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path, 1749 return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
1569 inode_to_path, ipath); 1750 inode_to_path, ipath);
1570} 1751}
1571 1752
1572struct btrfs_data_container *init_data_container(u32 total_bytes) 1753struct btrfs_data_container *init_data_container(u32 total_bytes)
@@ -1575,7 +1756,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes)
1575 size_t alloc_bytes; 1756 size_t alloc_bytes;
1576 1757
1577 alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); 1758 alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
1578 data = kmalloc(alloc_bytes, GFP_NOFS); 1759 data = vmalloc(alloc_bytes);
1579 if (!data) 1760 if (!data)
1580 return ERR_PTR(-ENOMEM); 1761 return ERR_PTR(-ENOMEM);
1581 1762
@@ -1626,6 +1807,6 @@ void free_ipath(struct inode_fs_paths *ipath)
1626{ 1807{
1627 if (!ipath) 1808 if (!ipath)
1628 return; 1809 return;
1629 kfree(ipath->fspath); 1810 vfree(ipath->fspath);
1630 kfree(ipath); 1811 kfree(ipath);
1631} 1812}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 032f4dc7eab8..e75533043a5f 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -33,14 +33,13 @@ struct inode_fs_paths {
33 33
34typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, 34typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
35 void *ctx); 35 void *ctx);
36typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
37 struct extent_buffer *eb, void *ctx);
38 36
39int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, 37int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
40 struct btrfs_path *path); 38 struct btrfs_path *path);
41 39
42int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, 40int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
43 struct btrfs_path *path, struct btrfs_key *found_key); 41 struct btrfs_path *path, struct btrfs_key *found_key,
42 u64 *flags);
44 43
45int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, 44int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
46 struct btrfs_extent_item *ei, u32 item_size, 45 struct btrfs_extent_item *ei, u32 item_size,
@@ -69,4 +68,9 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
69 struct btrfs_path *path); 68 struct btrfs_path *path);
70void free_ipath(struct inode_fs_paths *ipath); 69void free_ipath(struct inode_fs_paths *ipath);
71 70
71int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
72 u64 start_off, struct btrfs_path *path,
73 struct btrfs_inode_extref **ret_extref,
74 u64 *found_off);
75
72#endif 76#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5b2ad6bc4fe7..ed8ca7ca5eff 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -38,6 +38,7 @@
38#define BTRFS_INODE_DELALLOC_META_RESERVED 4 38#define BTRFS_INODE_DELALLOC_META_RESERVED 4
39#define BTRFS_INODE_HAS_ORPHAN_ITEM 5 39#define BTRFS_INODE_HAS_ORPHAN_ITEM 5
40#define BTRFS_INODE_HAS_ASYNC_EXTENT 6 40#define BTRFS_INODE_HAS_ASYNC_EXTENT 6
41#define BTRFS_INODE_NEEDS_FULL_SYNC 7
41 42
42/* in memory btrfs inode */ 43/* in memory btrfs inode */
43struct btrfs_inode { 44struct btrfs_inode {
@@ -143,6 +144,9 @@ struct btrfs_inode {
143 /* flags field from the on disk inode */ 144 /* flags field from the on disk inode */
144 u32 flags; 145 u32 flags;
145 146
147 /* a local copy of root's last_log_commit */
148 unsigned long last_log_commit;
149
146 /* 150 /*
147 * Counters to keep track of the number of extent item's we may use due 151 * Counters to keep track of the number of extent item's we may use due
148 * to delalloc and such. outstanding_extents is the number of extent 152 * to delalloc and such. outstanding_extents is the number of extent
@@ -202,15 +206,10 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode)
202 206
203static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) 207static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
204{ 208{
205 struct btrfs_root *root = BTRFS_I(inode)->root;
206 int ret = 0;
207
208 mutex_lock(&root->log_mutex);
209 if (BTRFS_I(inode)->logged_trans == generation && 209 if (BTRFS_I(inode)->logged_trans == generation &&
210 BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) 210 BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit)
211 ret = 1; 211 return 1;
212 mutex_unlock(&root->log_mutex); 212 return 0;
213 return ret;
214} 213}
215 214
216#endif 215#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 9197e2e33407..5a3e45db642a 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -37,8 +37,9 @@
37 * the file system was mounted, (i.e., they have been 37 * the file system was mounted, (i.e., they have been
38 * referenced by the super block) or they have been 38 * referenced by the super block) or they have been
39 * written since then and the write completion callback 39 * written since then and the write completion callback
40 * was called and a FLUSH request to the device where 40 * was called and no write error was indicated and a
41 * these blocks are located was received and completed. 41 * FLUSH request to the device where these blocks are
42 * located was received and completed.
42 * 2b. All referenced blocks need to have a generation 43 * 2b. All referenced blocks need to have a generation
43 * number which is equal to the parent's number. 44 * number which is equal to the parent's number.
44 * 45 *
@@ -2601,6 +2602,17 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
2601 (unsigned long long)l->block_ref_to->dev_bytenr, 2602 (unsigned long long)l->block_ref_to->dev_bytenr,
2602 l->block_ref_to->mirror_num); 2603 l->block_ref_to->mirror_num);
2603 ret = -1; 2604 ret = -1;
2605 } else if (l->block_ref_to->iodone_w_error) {
2606 printk(KERN_INFO "btrfs: attempt to write superblock"
2607 " which references block %c @%llu (%s/%llu/%d)"
2608 " which has write error!\n",
2609 btrfsic_get_block_type(state, l->block_ref_to),
2610 (unsigned long long)
2611 l->block_ref_to->logical_bytenr,
2612 l->block_ref_to->dev_state->name,
2613 (unsigned long long)l->block_ref_to->dev_bytenr,
2614 l->block_ref_to->mirror_num);
2615 ret = -1;
2604 } else if (l->parent_generation != 2616 } else if (l->parent_generation !=
2605 l->block_ref_to->generation && 2617 l->block_ref_to->generation &&
2606 BTRFSIC_GENERATION_UNKNOWN != 2618 BTRFSIC_GENERATION_UNKNOWN !=
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 43d1c5a3a030..c6467aa88bee 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -577,6 +577,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
577 u64 em_start; 577 u64 em_start;
578 struct extent_map *em; 578 struct extent_map *em;
579 int ret = -ENOMEM; 579 int ret = -ENOMEM;
580 int faili = 0;
580 u32 *sums; 581 u32 *sums;
581 582
582 tree = &BTRFS_I(inode)->io_tree; 583 tree = &BTRFS_I(inode)->io_tree;
@@ -626,9 +627,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
626 for (pg_index = 0; pg_index < nr_pages; pg_index++) { 627 for (pg_index = 0; pg_index < nr_pages; pg_index++) {
627 cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS | 628 cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
628 __GFP_HIGHMEM); 629 __GFP_HIGHMEM);
629 if (!cb->compressed_pages[pg_index]) 630 if (!cb->compressed_pages[pg_index]) {
631 faili = pg_index - 1;
632 ret = -ENOMEM;
630 goto fail2; 633 goto fail2;
634 }
631 } 635 }
636 faili = nr_pages - 1;
632 cb->nr_pages = nr_pages; 637 cb->nr_pages = nr_pages;
633 638
634 add_ra_bio_pages(inode, em_start + em_len, cb); 639 add_ra_bio_pages(inode, em_start + em_len, cb);
@@ -713,8 +718,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
713 return 0; 718 return 0;
714 719
715fail2: 720fail2:
716 for (pg_index = 0; pg_index < nr_pages; pg_index++) 721 while (faili >= 0) {
717 free_page((unsigned long)cb->compressed_pages[pg_index]); 722 __free_page(cb->compressed_pages[faili]);
723 faili--;
724 }
718 725
719 kfree(cb->compressed_pages); 726 kfree(cb->compressed_pages);
720fail1: 727fail1:
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6d183f60d63a..b33436211000 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4402,149 +4402,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
4402} 4402}
4403 4403
4404/* 4404/*
4405 * Given a key and some data, insert items into the tree.
4406 * This does all the path init required, making room in the tree if needed.
4407 * Returns the number of keys that were inserted.
4408 */
4409int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
4410 struct btrfs_root *root,
4411 struct btrfs_path *path,
4412 struct btrfs_key *cpu_key, u32 *data_size,
4413 int nr)
4414{
4415 struct extent_buffer *leaf;
4416 struct btrfs_item *item;
4417 int ret = 0;
4418 int slot;
4419 int i;
4420 u32 nritems;
4421 u32 total_data = 0;
4422 u32 total_size = 0;
4423 unsigned int data_end;
4424 struct btrfs_disk_key disk_key;
4425 struct btrfs_key found_key;
4426 struct btrfs_map_token token;
4427
4428 btrfs_init_map_token(&token);
4429
4430 for (i = 0; i < nr; i++) {
4431 if (total_size + data_size[i] + sizeof(struct btrfs_item) >
4432 BTRFS_LEAF_DATA_SIZE(root)) {
4433 break;
4434 nr = i;
4435 }
4436 total_data += data_size[i];
4437 total_size += data_size[i] + sizeof(struct btrfs_item);
4438 }
4439 BUG_ON(nr == 0);
4440
4441 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4442 if (ret == 0)
4443 return -EEXIST;
4444 if (ret < 0)
4445 goto out;
4446
4447 leaf = path->nodes[0];
4448
4449 nritems = btrfs_header_nritems(leaf);
4450 data_end = leaf_data_end(root, leaf);
4451
4452 if (btrfs_leaf_free_space(root, leaf) < total_size) {
4453 for (i = nr; i >= 0; i--) {
4454 total_data -= data_size[i];
4455 total_size -= data_size[i] + sizeof(struct btrfs_item);
4456 if (total_size < btrfs_leaf_free_space(root, leaf))
4457 break;
4458 }
4459 nr = i;
4460 }
4461
4462 slot = path->slots[0];
4463 BUG_ON(slot < 0);
4464
4465 if (slot != nritems) {
4466 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4467
4468 item = btrfs_item_nr(leaf, slot);
4469 btrfs_item_key_to_cpu(leaf, &found_key, slot);
4470
4471 /* figure out how many keys we can insert in here */
4472 total_data = data_size[0];
4473 for (i = 1; i < nr; i++) {
4474 if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
4475 break;
4476 total_data += data_size[i];
4477 }
4478 nr = i;
4479
4480 if (old_data < data_end) {
4481 btrfs_print_leaf(root, leaf);
4482 printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
4483 slot, old_data, data_end);
4484 BUG_ON(1);
4485 }
4486 /*
4487 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4488 */
4489 /* first correct the data pointers */
4490 for (i = slot; i < nritems; i++) {
4491 u32 ioff;
4492
4493 item = btrfs_item_nr(leaf, i);
4494 ioff = btrfs_token_item_offset(leaf, item, &token);
4495 btrfs_set_token_item_offset(leaf, item,
4496 ioff - total_data, &token);
4497 }
4498 /* shift the items */
4499 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4500 btrfs_item_nr_offset(slot),
4501 (nritems - slot) * sizeof(struct btrfs_item));
4502
4503 /* shift the data */
4504 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4505 data_end - total_data, btrfs_leaf_data(leaf) +
4506 data_end, old_data - data_end);
4507 data_end = old_data;
4508 } else {
4509 /*
4510 * this sucks but it has to be done, if we are inserting at
4511 * the end of the leaf only insert 1 of the items, since we
4512 * have no way of knowing whats on the next leaf and we'd have
4513 * to drop our current locks to figure it out
4514 */
4515 nr = 1;
4516 }
4517
4518 /* setup the item for the new data */
4519 for (i = 0; i < nr; i++) {
4520 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4521 btrfs_set_item_key(leaf, &disk_key, slot + i);
4522 item = btrfs_item_nr(leaf, slot + i);
4523 btrfs_set_token_item_offset(leaf, item,
4524 data_end - data_size[i], &token);
4525 data_end -= data_size[i];
4526 btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4527 }
4528 btrfs_set_header_nritems(leaf, nritems + nr);
4529 btrfs_mark_buffer_dirty(leaf);
4530
4531 ret = 0;
4532 if (slot == 0) {
4533 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4534 fixup_low_keys(trans, root, path, &disk_key, 1);
4535 }
4536
4537 if (btrfs_leaf_free_space(root, leaf) < 0) {
4538 btrfs_print_leaf(root, leaf);
4539 BUG();
4540 }
4541out:
4542 if (!ret)
4543 ret = nr;
4544 return ret;
4545}
4546
4547/*
4548 * this is a helper for btrfs_insert_empty_items, the main goal here is 4405 * this is a helper for btrfs_insert_empty_items, the main goal here is
4549 * to save stack depth by doing the bulk of the work in a function 4406 * to save stack depth by doing the bulk of the work in a function
4550 * that doesn't call btrfs_search_slot 4407 * that doesn't call btrfs_search_slot
@@ -5073,6 +4930,7 @@ static void tree_move_down(struct btrfs_root *root,
5073 struct btrfs_path *path, 4930 struct btrfs_path *path,
5074 int *level, int root_level) 4931 int *level, int root_level)
5075{ 4932{
4933 BUG_ON(*level == 0);
5076 path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level], 4934 path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5077 path->slots[*level]); 4935 path->slots[*level]);
5078 path->slots[*level - 1] = 0; 4936 path->slots[*level - 1] = 0;
@@ -5089,7 +4947,7 @@ static int tree_move_next_or_upnext(struct btrfs_root *root,
5089 4947
5090 path->slots[*level]++; 4948 path->slots[*level]++;
5091 4949
5092 while (path->slots[*level] == nritems) { 4950 while (path->slots[*level] >= nritems) {
5093 if (*level == root_level) 4951 if (*level == root_level)
5094 return -1; 4952 return -1;
5095 4953
@@ -5433,9 +5291,11 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5433 goto out; 5291 goto out;
5434 advance_right = ADVANCE; 5292 advance_right = ADVANCE;
5435 } else { 5293 } else {
5294 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5436 ret = tree_compare_item(left_root, left_path, 5295 ret = tree_compare_item(left_root, left_path,
5437 right_path, tmp_buf); 5296 right_path, tmp_buf);
5438 if (ret) { 5297 if (ret) {
5298 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5439 ret = changed_cb(left_root, right_root, 5299 ret = changed_cb(left_root, right_root,
5440 left_path, right_path, 5300 left_path, right_path,
5441 &left_key, 5301 &left_key,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0d195b507660..926c9ffc66d9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -116,7 +116,7 @@ struct btrfs_ordered_sum;
116#define BTRFS_FREE_SPACE_OBJECTID -11ULL 116#define BTRFS_FREE_SPACE_OBJECTID -11ULL
117 117
118/* 118/*
119 * The inode number assigned to the special inode for sotring 119 * The inode number assigned to the special inode for storing
120 * free ino cache 120 * free ino cache
121 */ 121 */
122#define BTRFS_FREE_INO_OBJECTID -12ULL 122#define BTRFS_FREE_INO_OBJECTID -12ULL
@@ -154,6 +154,13 @@ struct btrfs_ordered_sum;
154 */ 154 */
155#define BTRFS_NAME_LEN 255 155#define BTRFS_NAME_LEN 255
156 156
157/*
158 * Theoretical limit is larger, but we keep this down to a sane
159 * value. That should limit greatly the possibility of collisions on
160 * inode ref items.
161 */
162#define BTRFS_LINK_MAX 65535U
163
157/* 32 bytes in various csum fields */ 164/* 32 bytes in various csum fields */
158#define BTRFS_CSUM_SIZE 32 165#define BTRFS_CSUM_SIZE 32
159 166
@@ -489,6 +496,8 @@ struct btrfs_super_block {
489 */ 496 */
490#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) 497#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
491 498
499#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
500
492#define BTRFS_FEATURE_COMPAT_SUPP 0ULL 501#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
493#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL 502#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
494#define BTRFS_FEATURE_INCOMPAT_SUPP \ 503#define BTRFS_FEATURE_INCOMPAT_SUPP \
@@ -496,7 +505,8 @@ struct btrfs_super_block {
496 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ 505 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
497 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ 506 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
498 BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ 507 BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
499 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) 508 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
509 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
500 510
501/* 511/*
502 * A leaf is full of items. offset and size tell us where to find 512 * A leaf is full of items. offset and size tell us where to find
@@ -643,6 +653,14 @@ struct btrfs_inode_ref {
643 /* name goes here */ 653 /* name goes here */
644} __attribute__ ((__packed__)); 654} __attribute__ ((__packed__));
645 655
656struct btrfs_inode_extref {
657 __le64 parent_objectid;
658 __le64 index;
659 __le16 name_len;
660 __u8 name[0];
661 /* name goes here */
662} __attribute__ ((__packed__));
663
646struct btrfs_timespec { 664struct btrfs_timespec {
647 __le64 sec; 665 __le64 sec;
648 __le32 nsec; 666 __le32 nsec;
@@ -1028,12 +1046,22 @@ struct btrfs_space_info {
1028 wait_queue_head_t wait; 1046 wait_queue_head_t wait;
1029}; 1047};
1030 1048
1049#define BTRFS_BLOCK_RSV_GLOBAL 1
1050#define BTRFS_BLOCK_RSV_DELALLOC 2
1051#define BTRFS_BLOCK_RSV_TRANS 3
1052#define BTRFS_BLOCK_RSV_CHUNK 4
1053#define BTRFS_BLOCK_RSV_DELOPS 5
1054#define BTRFS_BLOCK_RSV_EMPTY 6
1055#define BTRFS_BLOCK_RSV_TEMP 7
1056
1031struct btrfs_block_rsv { 1057struct btrfs_block_rsv {
1032 u64 size; 1058 u64 size;
1033 u64 reserved; 1059 u64 reserved;
1034 struct btrfs_space_info *space_info; 1060 struct btrfs_space_info *space_info;
1035 spinlock_t lock; 1061 spinlock_t lock;
1036 unsigned int full; 1062 unsigned short full;
1063 unsigned short type;
1064 unsigned short failfast;
1037}; 1065};
1038 1066
1039/* 1067/*
@@ -1127,6 +1155,9 @@ struct btrfs_block_group_cache {
1127 * Today it will only have one thing on it, but that may change 1155 * Today it will only have one thing on it, but that may change
1128 */ 1156 */
1129 struct list_head cluster_list; 1157 struct list_head cluster_list;
1158
1159 /* For delayed block group creation */
1160 struct list_head new_bg_list;
1130}; 1161};
1131 1162
1132/* delayed seq elem */ 1163/* delayed seq elem */
@@ -1240,7 +1271,6 @@ struct btrfs_fs_info {
1240 struct mutex reloc_mutex; 1271 struct mutex reloc_mutex;
1241 1272
1242 struct list_head trans_list; 1273 struct list_head trans_list;
1243 struct list_head hashers;
1244 struct list_head dead_roots; 1274 struct list_head dead_roots;
1245 struct list_head caching_block_groups; 1275 struct list_head caching_block_groups;
1246 1276
@@ -1366,9 +1396,6 @@ struct btrfs_fs_info {
1366 struct rb_root defrag_inodes; 1396 struct rb_root defrag_inodes;
1367 atomic_t defrag_running; 1397 atomic_t defrag_running;
1368 1398
1369 spinlock_t ref_cache_lock;
1370 u64 total_ref_cache_size;
1371
1372 /* 1399 /*
1373 * these three are in extended format (availability of single 1400 * these three are in extended format (availability of single
1374 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other 1401 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
@@ -1441,6 +1468,8 @@ struct btrfs_fs_info {
1441 1468
1442 /* next backup root to be overwritten */ 1469 /* next backup root to be overwritten */
1443 int backup_root_index; 1470 int backup_root_index;
1471
1472 int num_tolerated_disk_barrier_failures;
1444}; 1473};
1445 1474
1446/* 1475/*
@@ -1481,9 +1510,9 @@ struct btrfs_root {
1481 wait_queue_head_t log_commit_wait[2]; 1510 wait_queue_head_t log_commit_wait[2];
1482 atomic_t log_writers; 1511 atomic_t log_writers;
1483 atomic_t log_commit[2]; 1512 atomic_t log_commit[2];
1513 atomic_t log_batch;
1484 unsigned long log_transid; 1514 unsigned long log_transid;
1485 unsigned long last_log_commit; 1515 unsigned long last_log_commit;
1486 unsigned long log_batch;
1487 pid_t log_start_pid; 1516 pid_t log_start_pid;
1488 bool log_multiple_pids; 1517 bool log_multiple_pids;
1489 1518
@@ -1592,6 +1621,7 @@ struct btrfs_ioctl_defrag_range_args {
1592 */ 1621 */
1593#define BTRFS_INODE_ITEM_KEY 1 1622#define BTRFS_INODE_ITEM_KEY 1
1594#define BTRFS_INODE_REF_KEY 12 1623#define BTRFS_INODE_REF_KEY 12
1624#define BTRFS_INODE_EXTREF_KEY 13
1595#define BTRFS_XATTR_ITEM_KEY 24 1625#define BTRFS_XATTR_ITEM_KEY 24
1596#define BTRFS_ORPHAN_ITEM_KEY 48 1626#define BTRFS_ORPHAN_ITEM_KEY 48
1597/* reserve 2-15 close to the inode for later flexibility */ 1627/* reserve 2-15 close to the inode for later flexibility */
@@ -1978,6 +2008,13 @@ BTRFS_SETGET_STACK_FUNCS(block_group_flags,
1978BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); 2008BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
1979BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); 2009BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
1980 2010
2011/* struct btrfs_inode_extref */
2012BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref,
2013 parent_objectid, 64);
2014BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref,
2015 name_len, 16);
2016BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64);
2017
1981/* struct btrfs_inode_item */ 2018/* struct btrfs_inode_item */
1982BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); 2019BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
1983BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); 2020BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
@@ -2858,6 +2895,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
2858 u64 size); 2895 u64 size);
2859int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 2896int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
2860 struct btrfs_root *root, u64 group_start); 2897 struct btrfs_root *root, u64 group_start);
2898void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
2899 struct btrfs_root *root);
2861u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 2900u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2862u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); 2901u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
2863void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2902void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
@@ -2874,8 +2913,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
2874void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); 2913void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
2875int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); 2914int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
2876void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); 2915void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
2877void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); 2916void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
2878struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); 2917struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
2918 unsigned short type);
2879void btrfs_free_block_rsv(struct btrfs_root *root, 2919void btrfs_free_block_rsv(struct btrfs_root *root,
2880 struct btrfs_block_rsv *rsv); 2920 struct btrfs_block_rsv *rsv);
2881int btrfs_block_rsv_add(struct btrfs_root *root, 2921int btrfs_block_rsv_add(struct btrfs_root *root,
@@ -3172,12 +3212,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
3172 struct btrfs_root *root, 3212 struct btrfs_root *root,
3173 const char *name, int name_len, 3213 const char *name, int name_len,
3174 u64 inode_objectid, u64 ref_objectid, u64 *index); 3214 u64 inode_objectid, u64 ref_objectid, u64 *index);
3175struct btrfs_inode_ref * 3215int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
3176btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, 3216 struct btrfs_root *root,
3177 struct btrfs_root *root, 3217 struct btrfs_path *path,
3178 struct btrfs_path *path, 3218 const char *name, int name_len,
3179 const char *name, int name_len, 3219 u64 inode_objectid, u64 ref_objectid, int mod,
3180 u64 inode_objectid, u64 ref_objectid, int mod); 3220 u64 *ret_index);
3181int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, 3221int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
3182 struct btrfs_root *root, 3222 struct btrfs_root *root,
3183 struct btrfs_path *path, u64 objectid); 3223 struct btrfs_path *path, u64 objectid);
@@ -3185,6 +3225,19 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
3185 *root, struct btrfs_path *path, 3225 *root, struct btrfs_path *path,
3186 struct btrfs_key *location, int mod); 3226 struct btrfs_key *location, int mod);
3187 3227
3228struct btrfs_inode_extref *
3229btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
3230 struct btrfs_root *root,
3231 struct btrfs_path *path,
3232 const char *name, int name_len,
3233 u64 inode_objectid, u64 ref_objectid, int ins_len,
3234 int cow);
3235
3236int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
3237 u64 ref_objectid, const char *name,
3238 int name_len,
3239 struct btrfs_inode_extref **extref_ret);
3240
3188/* file-item.c */ 3241/* file-item.c */
3189int btrfs_del_csums(struct btrfs_trans_handle *trans, 3242int btrfs_del_csums(struct btrfs_trans_handle *trans,
3190 struct btrfs_root *root, u64 bytenr, u64 len); 3243 struct btrfs_root *root, u64 bytenr, u64 len);
@@ -3249,6 +3302,8 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
3249 struct btrfs_root *root, 3302 struct btrfs_root *root,
3250 struct inode *dir, u64 objectid, 3303 struct inode *dir, u64 objectid,
3251 const char *name, int name_len); 3304 const char *name, int name_len);
3305int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
3306 int front);
3252int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 3307int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3253 struct btrfs_root *root, 3308 struct btrfs_root *root,
3254 struct inode *inode, u64 new_size, 3309 struct inode *inode, u64 new_size,
@@ -3308,16 +3363,27 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
3308int btrfs_defrag_file(struct inode *inode, struct file *file, 3363int btrfs_defrag_file(struct inode *inode, struct file *file,
3309 struct btrfs_ioctl_defrag_range_args *range, 3364 struct btrfs_ioctl_defrag_range_args *range,
3310 u64 newer_than, unsigned long max_pages); 3365 u64 newer_than, unsigned long max_pages);
3366void btrfs_get_block_group_info(struct list_head *groups_list,
3367 struct btrfs_ioctl_space_info *space);
3368
3311/* file.c */ 3369/* file.c */
3312int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, 3370int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
3313 struct inode *inode); 3371 struct inode *inode);
3314int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); 3372int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
3315int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); 3373int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
3316int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 3374void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
3317 int skip_pinned); 3375 int skip_pinned);
3376int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace,
3377 u64 start, u64 end, int skip_pinned,
3378 int modified);
3318extern const struct file_operations btrfs_file_operations; 3379extern const struct file_operations btrfs_file_operations;
3319int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, 3380int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
3320 u64 start, u64 end, u64 *hint_byte, int drop_cache); 3381 struct btrfs_root *root, struct inode *inode,
3382 struct btrfs_path *path, u64 start, u64 end,
3383 u64 *drop_end, int drop_cache);
3384int btrfs_drop_extents(struct btrfs_trans_handle *trans,
3385 struct btrfs_root *root, struct inode *inode, u64 start,
3386 u64 end, int drop_cache);
3321int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, 3387int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
3322 struct inode *inode, u64 start, u64 end); 3388 struct inode *inode, u64 start, u64 end);
3323int btrfs_release_file(struct inode *inode, struct file *file); 3389int btrfs_release_file(struct inode *inode, struct file *file);
@@ -3378,6 +3444,11 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
3378 } 3444 }
3379} 3445}
3380 3446
3447/*
3448 * Call btrfs_abort_transaction as early as possible when an error condition is
3449 * detected, that way the exact line number is reported.
3450 */
3451
3381#define btrfs_abort_transaction(trans, root, errno) \ 3452#define btrfs_abort_transaction(trans, root, errno) \
3382do { \ 3453do { \
3383 __btrfs_abort_transaction(trans, root, __func__, \ 3454 __btrfs_abort_transaction(trans, root, __func__, \
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 07d5eeb1e6f1..478f66bdc57b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -29,7 +29,7 @@ static struct kmem_cache *delayed_node_cache;
29 29
30int __init btrfs_delayed_inode_init(void) 30int __init btrfs_delayed_inode_init(void)
31{ 31{
32 delayed_node_cache = kmem_cache_create("delayed_node", 32 delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
33 sizeof(struct btrfs_delayed_node), 33 sizeof(struct btrfs_delayed_node),
34 0, 34 0,
35 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, 35 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
@@ -650,7 +650,7 @@ static int btrfs_delayed_inode_reserve_metadata(
650 * we're accounted for. 650 * we're accounted for.
651 */ 651 */
652 if (!src_rsv || (!trans->bytes_reserved && 652 if (!src_rsv || (!trans->bytes_reserved &&
653 src_rsv != &root->fs_info->delalloc_block_rsv)) { 653 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
654 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); 654 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
655 /* 655 /*
656 * Since we're under a transaction reserve_metadata_bytes could 656 * Since we're under a transaction reserve_metadata_bytes could
@@ -668,7 +668,7 @@ static int btrfs_delayed_inode_reserve_metadata(
668 num_bytes, 1); 668 num_bytes, 1);
669 } 669 }
670 return ret; 670 return ret;
671 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { 671 } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
672 spin_lock(&BTRFS_I(inode)->lock); 672 spin_lock(&BTRFS_I(inode)->lock);
673 if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 673 if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
674 &BTRFS_I(inode)->runtime_flags)) { 674 &BTRFS_I(inode)->runtime_flags)) {
@@ -1715,8 +1715,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
1715 struct btrfs_inode_item *inode_item, 1715 struct btrfs_inode_item *inode_item,
1716 struct inode *inode) 1716 struct inode *inode)
1717{ 1717{
1718 btrfs_set_stack_inode_uid(inode_item, inode->i_uid); 1718 btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
1719 btrfs_set_stack_inode_gid(inode_item, inode->i_gid); 1719 btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
1720 btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size); 1720 btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
1721 btrfs_set_stack_inode_mode(inode_item, inode->i_mode); 1721 btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
1722 btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink); 1722 btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
@@ -1764,8 +1764,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
1764 1764
1765 inode_item = &delayed_node->inode_item; 1765 inode_item = &delayed_node->inode_item;
1766 1766
1767 inode->i_uid = btrfs_stack_inode_uid(inode_item); 1767 i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
1768 inode->i_gid = btrfs_stack_inode_gid(inode_item); 1768 i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
1769 btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); 1769 btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
1770 inode->i_mode = btrfs_stack_inode_mode(inode_item); 1770 inode->i_mode = btrfs_stack_inode_mode(inode_item);
1771 set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); 1771 set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index ab5300595847..c9d703693df0 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -18,7 +18,7 @@
18#ifndef __DELAYED_REF__ 18#ifndef __DELAYED_REF__
19#define __DELAYED_REF__ 19#define __DELAYED_REF__
20 20
21/* these are the possible values of struct btrfs_delayed_ref->action */ 21/* these are the possible values of struct btrfs_delayed_ref_node->action */
22#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ 22#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
23#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ 23#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
24#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ 24#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 22e98e04c2ea..7cda51995c1e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,6 +46,10 @@
46#include "check-integrity.h" 46#include "check-integrity.h"
47#include "rcu-string.h" 47#include "rcu-string.h"
48 48
49#ifdef CONFIG_X86
50#include <asm/cpufeature.h>
51#endif
52
49static struct extent_io_ops btree_extent_io_ops; 53static struct extent_io_ops btree_extent_io_ops;
50static void end_workqueue_fn(struct btrfs_work *work); 54static void end_workqueue_fn(struct btrfs_work *work);
51static void free_fs_root(struct btrfs_root *root); 55static void free_fs_root(struct btrfs_root *root);
@@ -217,26 +221,16 @@ static struct extent_map *btree_get_extent(struct inode *inode,
217 write_lock(&em_tree->lock); 221 write_lock(&em_tree->lock);
218 ret = add_extent_mapping(em_tree, em); 222 ret = add_extent_mapping(em_tree, em);
219 if (ret == -EEXIST) { 223 if (ret == -EEXIST) {
220 u64 failed_start = em->start;
221 u64 failed_len = em->len;
222
223 free_extent_map(em); 224 free_extent_map(em);
224 em = lookup_extent_mapping(em_tree, start, len); 225 em = lookup_extent_mapping(em_tree, start, len);
225 if (em) { 226 if (!em)
226 ret = 0; 227 em = ERR_PTR(-EIO);
227 } else {
228 em = lookup_extent_mapping(em_tree, failed_start,
229 failed_len);
230 ret = -EIO;
231 }
232 } else if (ret) { 228 } else if (ret) {
233 free_extent_map(em); 229 free_extent_map(em);
234 em = NULL; 230 em = ERR_PTR(ret);
235 } 231 }
236 write_unlock(&em_tree->lock); 232 write_unlock(&em_tree->lock);
237 233
238 if (ret)
239 em = ERR_PTR(ret);
240out: 234out:
241 return em; 235 return em;
242} 236}
@@ -439,10 +433,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
439 WARN_ON(1); 433 WARN_ON(1);
440 return 0; 434 return 0;
441 } 435 }
442 if (eb->pages[0] != page) {
443 WARN_ON(1);
444 return 0;
445 }
446 if (!PageUptodate(page)) { 436 if (!PageUptodate(page)) {
447 WARN_ON(1); 437 WARN_ON(1);
448 return 0; 438 return 0;
@@ -869,10 +859,22 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
869 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); 859 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
870} 860}
871 861
862static int check_async_write(struct inode *inode, unsigned long bio_flags)
863{
864 if (bio_flags & EXTENT_BIO_TREE_LOG)
865 return 0;
866#ifdef CONFIG_X86
867 if (cpu_has_xmm4_2)
868 return 0;
869#endif
870 return 1;
871}
872
872static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 873static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
873 int mirror_num, unsigned long bio_flags, 874 int mirror_num, unsigned long bio_flags,
874 u64 bio_offset) 875 u64 bio_offset)
875{ 876{
877 int async = check_async_write(inode, bio_flags);
876 int ret; 878 int ret;
877 879
878 if (!(rw & REQ_WRITE)) { 880 if (!(rw & REQ_WRITE)) {
@@ -887,6 +889,12 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
887 return ret; 889 return ret;
888 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 890 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
889 mirror_num, 0); 891 mirror_num, 0);
892 } else if (!async) {
893 ret = btree_csum_one_bio(bio);
894 if (ret)
895 return ret;
896 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
897 mirror_num, 0);
890 } 898 }
891 899
892 /* 900 /*
@@ -1168,8 +1176,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1168 atomic_set(&root->log_commit[0], 0); 1176 atomic_set(&root->log_commit[0], 0);
1169 atomic_set(&root->log_commit[1], 0); 1177 atomic_set(&root->log_commit[1], 0);
1170 atomic_set(&root->log_writers, 0); 1178 atomic_set(&root->log_writers, 0);
1179 atomic_set(&root->log_batch, 0);
1171 atomic_set(&root->orphan_inodes, 0); 1180 atomic_set(&root->orphan_inodes, 0);
1172 root->log_batch = 0;
1173 root->log_transid = 0; 1181 root->log_transid = 0;
1174 root->last_log_commit = 0; 1182 root->last_log_commit = 0;
1175 extent_io_tree_init(&root->dirty_log_pages, 1183 extent_io_tree_init(&root->dirty_log_pages,
@@ -1667,9 +1675,10 @@ static int transaction_kthread(void *arg)
1667 spin_unlock(&root->fs_info->trans_lock); 1675 spin_unlock(&root->fs_info->trans_lock);
1668 1676
1669 /* If the file system is aborted, this will always fail. */ 1677 /* If the file system is aborted, this will always fail. */
1670 trans = btrfs_join_transaction(root); 1678 trans = btrfs_attach_transaction(root);
1671 if (IS_ERR(trans)) { 1679 if (IS_ERR(trans)) {
1672 cannot_commit = true; 1680 if (PTR_ERR(trans) != -ENOENT)
1681 cannot_commit = true;
1673 goto sleep; 1682 goto sleep;
1674 } 1683 }
1675 if (transid == trans->transid) { 1684 if (transid == trans->transid) {
@@ -1994,13 +2003,11 @@ int open_ctree(struct super_block *sb,
1994 INIT_LIST_HEAD(&fs_info->trans_list); 2003 INIT_LIST_HEAD(&fs_info->trans_list);
1995 INIT_LIST_HEAD(&fs_info->dead_roots); 2004 INIT_LIST_HEAD(&fs_info->dead_roots);
1996 INIT_LIST_HEAD(&fs_info->delayed_iputs); 2005 INIT_LIST_HEAD(&fs_info->delayed_iputs);
1997 INIT_LIST_HEAD(&fs_info->hashers);
1998 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 2006 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1999 INIT_LIST_HEAD(&fs_info->ordered_operations); 2007 INIT_LIST_HEAD(&fs_info->ordered_operations);
2000 INIT_LIST_HEAD(&fs_info->caching_block_groups); 2008 INIT_LIST_HEAD(&fs_info->caching_block_groups);
2001 spin_lock_init(&fs_info->delalloc_lock); 2009 spin_lock_init(&fs_info->delalloc_lock);
2002 spin_lock_init(&fs_info->trans_lock); 2010 spin_lock_init(&fs_info->trans_lock);
2003 spin_lock_init(&fs_info->ref_cache_lock);
2004 spin_lock_init(&fs_info->fs_roots_radix_lock); 2011 spin_lock_init(&fs_info->fs_roots_radix_lock);
2005 spin_lock_init(&fs_info->delayed_iput_lock); 2012 spin_lock_init(&fs_info->delayed_iput_lock);
2006 spin_lock_init(&fs_info->defrag_inodes_lock); 2013 spin_lock_init(&fs_info->defrag_inodes_lock);
@@ -2014,12 +2021,15 @@ int open_ctree(struct super_block *sb,
2014 INIT_LIST_HEAD(&fs_info->space_info); 2021 INIT_LIST_HEAD(&fs_info->space_info);
2015 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); 2022 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
2016 btrfs_mapping_init(&fs_info->mapping_tree); 2023 btrfs_mapping_init(&fs_info->mapping_tree);
2017 btrfs_init_block_rsv(&fs_info->global_block_rsv); 2024 btrfs_init_block_rsv(&fs_info->global_block_rsv,
2018 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); 2025 BTRFS_BLOCK_RSV_GLOBAL);
2019 btrfs_init_block_rsv(&fs_info->trans_block_rsv); 2026 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
2020 btrfs_init_block_rsv(&fs_info->chunk_block_rsv); 2027 BTRFS_BLOCK_RSV_DELALLOC);
2021 btrfs_init_block_rsv(&fs_info->empty_block_rsv); 2028 btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
2022 btrfs_init_block_rsv(&fs_info->delayed_block_rsv); 2029 btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
2030 btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
2031 btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
2032 BTRFS_BLOCK_RSV_DELOPS);
2023 atomic_set(&fs_info->nr_async_submits, 0); 2033 atomic_set(&fs_info->nr_async_submits, 0);
2024 atomic_set(&fs_info->async_delalloc_pages, 0); 2034 atomic_set(&fs_info->async_delalloc_pages, 0);
2025 atomic_set(&fs_info->async_submit_draining, 0); 2035 atomic_set(&fs_info->async_submit_draining, 0);
@@ -2491,6 +2501,8 @@ retry_root_backup:
2491 printk(KERN_ERR "Failed to read block groups: %d\n", ret); 2501 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
2492 goto fail_block_groups; 2502 goto fail_block_groups;
2493 } 2503 }
2504 fs_info->num_tolerated_disk_barrier_failures =
2505 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2494 2506
2495 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 2507 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
2496 "btrfs-cleaner"); 2508 "btrfs-cleaner");
@@ -2874,12 +2886,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
2874 printk_in_rcu("btrfs: disabling barriers on dev %s\n", 2886 printk_in_rcu("btrfs: disabling barriers on dev %s\n",
2875 rcu_str_deref(device->name)); 2887 rcu_str_deref(device->name));
2876 device->nobarriers = 1; 2888 device->nobarriers = 1;
2877 } 2889 } else if (!bio_flagged(bio, BIO_UPTODATE)) {
2878 if (!bio_flagged(bio, BIO_UPTODATE)) {
2879 ret = -EIO; 2890 ret = -EIO;
2880 if (!bio_flagged(bio, BIO_EOPNOTSUPP)) 2891 btrfs_dev_stat_inc_and_print(device,
2881 btrfs_dev_stat_inc_and_print(device, 2892 BTRFS_DEV_STAT_FLUSH_ERRS);
2882 BTRFS_DEV_STAT_FLUSH_ERRS);
2883 } 2893 }
2884 2894
2885 /* drop the reference from the wait == 0 run */ 2895 /* drop the reference from the wait == 0 run */
@@ -2918,14 +2928,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
2918{ 2928{
2919 struct list_head *head; 2929 struct list_head *head;
2920 struct btrfs_device *dev; 2930 struct btrfs_device *dev;
2921 int errors = 0; 2931 int errors_send = 0;
2932 int errors_wait = 0;
2922 int ret; 2933 int ret;
2923 2934
2924 /* send down all the barriers */ 2935 /* send down all the barriers */
2925 head = &info->fs_devices->devices; 2936 head = &info->fs_devices->devices;
2926 list_for_each_entry_rcu(dev, head, dev_list) { 2937 list_for_each_entry_rcu(dev, head, dev_list) {
2927 if (!dev->bdev) { 2938 if (!dev->bdev) {
2928 errors++; 2939 errors_send++;
2929 continue; 2940 continue;
2930 } 2941 }
2931 if (!dev->in_fs_metadata || !dev->writeable) 2942 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2933,13 +2944,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
2933 2944
2934 ret = write_dev_flush(dev, 0); 2945 ret = write_dev_flush(dev, 0);
2935 if (ret) 2946 if (ret)
2936 errors++; 2947 errors_send++;
2937 } 2948 }
2938 2949
2939 /* wait for all the barriers */ 2950 /* wait for all the barriers */
2940 list_for_each_entry_rcu(dev, head, dev_list) { 2951 list_for_each_entry_rcu(dev, head, dev_list) {
2941 if (!dev->bdev) { 2952 if (!dev->bdev) {
2942 errors++; 2953 errors_wait++;
2943 continue; 2954 continue;
2944 } 2955 }
2945 if (!dev->in_fs_metadata || !dev->writeable) 2956 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2947,13 +2958,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
2947 2958
2948 ret = write_dev_flush(dev, 1); 2959 ret = write_dev_flush(dev, 1);
2949 if (ret) 2960 if (ret)
2950 errors++; 2961 errors_wait++;
2951 } 2962 }
2952 if (errors) 2963 if (errors_send > info->num_tolerated_disk_barrier_failures ||
2964 errors_wait > info->num_tolerated_disk_barrier_failures)
2953 return -EIO; 2965 return -EIO;
2954 return 0; 2966 return 0;
2955} 2967}
2956 2968
2969int btrfs_calc_num_tolerated_disk_barrier_failures(
2970 struct btrfs_fs_info *fs_info)
2971{
2972 struct btrfs_ioctl_space_info space;
2973 struct btrfs_space_info *sinfo;
2974 u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
2975 BTRFS_BLOCK_GROUP_SYSTEM,
2976 BTRFS_BLOCK_GROUP_METADATA,
2977 BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
2978 int num_types = 4;
2979 int i;
2980 int c;
2981 int num_tolerated_disk_barrier_failures =
2982 (int)fs_info->fs_devices->num_devices;
2983
2984 for (i = 0; i < num_types; i++) {
2985 struct btrfs_space_info *tmp;
2986
2987 sinfo = NULL;
2988 rcu_read_lock();
2989 list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
2990 if (tmp->flags == types[i]) {
2991 sinfo = tmp;
2992 break;
2993 }
2994 }
2995 rcu_read_unlock();
2996
2997 if (!sinfo)
2998 continue;
2999
3000 down_read(&sinfo->groups_sem);
3001 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
3002 if (!list_empty(&sinfo->block_groups[c])) {
3003 u64 flags;
3004
3005 btrfs_get_block_group_info(
3006 &sinfo->block_groups[c], &space);
3007 if (space.total_bytes == 0 ||
3008 space.used_bytes == 0)
3009 continue;
3010 flags = space.flags;
3011 /*
3012 * return
3013 * 0: if dup, single or RAID0 is configured for
3014 * any of metadata, system or data, else
3015 * 1: if RAID5 is configured, or if RAID1 or
3016 * RAID10 is configured and only two mirrors
3017 * are used, else
3018 * 2: if RAID6 is configured, else
3019 * num_mirrors - 1: if RAID1 or RAID10 is
3020 * configured and more than
3021 * 2 mirrors are used.
3022 */
3023 if (num_tolerated_disk_barrier_failures > 0 &&
3024 ((flags & (BTRFS_BLOCK_GROUP_DUP |
3025 BTRFS_BLOCK_GROUP_RAID0)) ||
3026 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
3027 == 0)))
3028 num_tolerated_disk_barrier_failures = 0;
3029 else if (num_tolerated_disk_barrier_failures > 1
3030 &&
3031 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3032 BTRFS_BLOCK_GROUP_RAID10)))
3033 num_tolerated_disk_barrier_failures = 1;
3034 }
3035 }
3036 up_read(&sinfo->groups_sem);
3037 }
3038
3039 return num_tolerated_disk_barrier_failures;
3040}
3041
2957int write_all_supers(struct btrfs_root *root, int max_mirrors) 3042int write_all_supers(struct btrfs_root *root, int max_mirrors)
2958{ 3043{
2959 struct list_head *head; 3044 struct list_head *head;
@@ -2976,8 +3061,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2976 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 3061 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2977 head = &root->fs_info->fs_devices->devices; 3062 head = &root->fs_info->fs_devices->devices;
2978 3063
2979 if (do_barriers) 3064 if (do_barriers) {
2980 barrier_all_devices(root->fs_info); 3065 ret = barrier_all_devices(root->fs_info);
3066 if (ret) {
3067 mutex_unlock(
3068 &root->fs_info->fs_devices->device_list_mutex);
3069 btrfs_error(root->fs_info, ret,
3070 "errors while submitting device barriers.");
3071 return ret;
3072 }
3073 }
2981 3074
2982 list_for_each_entry_rcu(dev, head, dev_list) { 3075 list_for_each_entry_rcu(dev, head, dev_list) {
2983 if (!dev->bdev) { 3076 if (!dev->bdev) {
@@ -3211,10 +3304,6 @@ int close_ctree(struct btrfs_root *root)
3211 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", 3304 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
3212 (unsigned long long)fs_info->delalloc_bytes); 3305 (unsigned long long)fs_info->delalloc_bytes);
3213 } 3306 }
3214 if (fs_info->total_ref_cache_size) {
3215 printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
3216 (unsigned long long)fs_info->total_ref_cache_size);
3217 }
3218 3307
3219 free_extent_buffer(fs_info->extent_root->node); 3308 free_extent_buffer(fs_info->extent_root->node);
3220 free_extent_buffer(fs_info->extent_root->commit_root); 3309 free_extent_buffer(fs_info->extent_root->commit_root);
@@ -3360,52 +3449,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
3360 return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 3449 return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
3361} 3450}
3362 3451
3363int btree_lock_page_hook(struct page *page, void *data,
3364 void (*flush_fn)(void *))
3365{
3366 struct inode *inode = page->mapping->host;
3367 struct btrfs_root *root = BTRFS_I(inode)->root;
3368 struct extent_buffer *eb;
3369
3370 /*
3371 * We culled this eb but the page is still hanging out on the mapping,
3372 * carry on.
3373 */
3374 if (!PagePrivate(page))
3375 goto out;
3376
3377 eb = (struct extent_buffer *)page->private;
3378 if (!eb) {
3379 WARN_ON(1);
3380 goto out;
3381 }
3382 if (page != eb->pages[0])
3383 goto out;
3384
3385 if (!btrfs_try_tree_write_lock(eb)) {
3386 flush_fn(data);
3387 btrfs_tree_lock(eb);
3388 }
3389 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3390
3391 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3392 spin_lock(&root->fs_info->delalloc_lock);
3393 if (root->fs_info->dirty_metadata_bytes >= eb->len)
3394 root->fs_info->dirty_metadata_bytes -= eb->len;
3395 else
3396 WARN_ON(1);
3397 spin_unlock(&root->fs_info->delalloc_lock);
3398 }
3399
3400 btrfs_tree_unlock(eb);
3401out:
3402 if (!trylock_page(page)) {
3403 flush_fn(data);
3404 lock_page(page);
3405 }
3406 return 0;
3407}
3408
3409static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, 3452static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
3410 int read_only) 3453 int read_only)
3411{ 3454{
@@ -3608,7 +3651,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
3608 3651
3609 while (1) { 3652 while (1) {
3610 ret = find_first_extent_bit(dirty_pages, start, &start, &end, 3653 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
3611 mark); 3654 mark, NULL);
3612 if (ret) 3655 if (ret)
3613 break; 3656 break;
3614 3657
@@ -3663,7 +3706,7 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
3663again: 3706again:
3664 while (1) { 3707 while (1) {
3665 ret = find_first_extent_bit(unpin, 0, &start, &end, 3708 ret = find_first_extent_bit(unpin, 0, &start, &end,
3666 EXTENT_DIRTY); 3709 EXTENT_DIRTY, NULL);
3667 if (ret) 3710 if (ret)
3668 break; 3711 break;
3669 3712
@@ -3800,7 +3843,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
3800} 3843}
3801 3844
3802static struct extent_io_ops btree_extent_io_ops = { 3845static struct extent_io_ops btree_extent_io_ops = {
3803 .write_cache_pages_lock_hook = btree_lock_page_hook,
3804 .readpage_end_io_hook = btree_readpage_end_io_hook, 3846 .readpage_end_io_hook = btree_readpage_end_io_hook,
3805 .readpage_io_failed_hook = btree_io_failed_hook, 3847 .readpage_io_failed_hook = btree_io_failed_hook,
3806 .submit_bio_hook = btree_submit_bio_hook, 3848 .submit_bio_hook = btree_submit_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c5b00a735fef..2025a9132c16 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -95,6 +95,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
95 u64 objectid); 95 u64 objectid);
96int btree_lock_page_hook(struct page *page, void *data, 96int btree_lock_page_hook(struct page *page, void *data,
97 void (*flush_fn)(void *)); 97 void (*flush_fn)(void *));
98int btrfs_calc_num_tolerated_disk_barrier_failures(
99 struct btrfs_fs_info *fs_info);
98 100
99#ifdef CONFIG_DEBUG_LOCK_ALLOC 101#ifdef CONFIG_DEBUG_LOCK_ALLOC
100void btrfs_init_lockdep(void); 102void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ba58024d40d3..3d3e2c17d8d1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -94,8 +94,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
94 u64 flags, struct btrfs_disk_key *key, 94 u64 flags, struct btrfs_disk_key *key,
95 int level, struct btrfs_key *ins); 95 int level, struct btrfs_key *ins);
96static int do_chunk_alloc(struct btrfs_trans_handle *trans, 96static int do_chunk_alloc(struct btrfs_trans_handle *trans,
97 struct btrfs_root *extent_root, u64 alloc_bytes, 97 struct btrfs_root *extent_root, u64 flags,
98 u64 flags, int force); 98 int force);
99static int find_next_key(struct btrfs_path *path, int level, 99static int find_next_key(struct btrfs_path *path, int level,
100 struct btrfs_key *key); 100 struct btrfs_key *key);
101static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 101static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -312,7 +312,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
312 while (start < end) { 312 while (start < end) {
313 ret = find_first_extent_bit(info->pinned_extents, start, 313 ret = find_first_extent_bit(info->pinned_extents, start,
314 &extent_start, &extent_end, 314 &extent_start, &extent_end,
315 EXTENT_DIRTY | EXTENT_UPTODATE); 315 EXTENT_DIRTY | EXTENT_UPTODATE,
316 NULL);
316 if (ret) 317 if (ret)
317 break; 318 break;
318 319
@@ -2361,10 +2362,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2361 } 2362 }
2362 2363
2363next: 2364next:
2364 do_chunk_alloc(trans, fs_info->extent_root,
2365 2 * 1024 * 1024,
2366 btrfs_get_alloc_profile(root, 0),
2367 CHUNK_ALLOC_NO_FORCE);
2368 cond_resched(); 2365 cond_resched();
2369 spin_lock(&delayed_refs->lock); 2366 spin_lock(&delayed_refs->lock);
2370 } 2367 }
@@ -2478,10 +2475,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2478 if (root == root->fs_info->extent_root) 2475 if (root == root->fs_info->extent_root)
2479 root = root->fs_info->tree_root; 2476 root = root->fs_info->tree_root;
2480 2477
2481 do_chunk_alloc(trans, root->fs_info->extent_root,
2482 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
2483 CHUNK_ALLOC_NO_FORCE);
2484
2485 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); 2478 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2486 2479
2487 delayed_refs = &trans->transaction->delayed_refs; 2480 delayed_refs = &trans->transaction->delayed_refs;
@@ -2551,6 +2544,12 @@ again:
2551 } 2544 }
2552 2545
2553 if (run_all) { 2546 if (run_all) {
2547 if (!list_empty(&trans->new_bgs)) {
2548 spin_unlock(&delayed_refs->lock);
2549 btrfs_create_pending_block_groups(trans, root);
2550 spin_lock(&delayed_refs->lock);
2551 }
2552
2554 node = rb_first(&delayed_refs->root); 2553 node = rb_first(&delayed_refs->root);
2555 if (!node) 2554 if (!node)
2556 goto out; 2555 goto out;
@@ -3406,7 +3405,6 @@ alloc:
3406 return PTR_ERR(trans); 3405 return PTR_ERR(trans);
3407 3406
3408 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3407 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3409 bytes + 2 * 1024 * 1024,
3410 alloc_target, 3408 alloc_target,
3411 CHUNK_ALLOC_NO_FORCE); 3409 CHUNK_ALLOC_NO_FORCE);
3412 btrfs_end_transaction(trans, root); 3410 btrfs_end_transaction(trans, root);
@@ -3488,8 +3486,7 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
3488} 3486}
3489 3487
3490static int should_alloc_chunk(struct btrfs_root *root, 3488static int should_alloc_chunk(struct btrfs_root *root,
3491 struct btrfs_space_info *sinfo, u64 alloc_bytes, 3489 struct btrfs_space_info *sinfo, int force)
3492 int force)
3493{ 3490{
3494 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 3491 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3495 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3492 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
@@ -3504,7 +3501,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
3504 * and purposes it's used space. Don't worry about locking the 3501 * and purposes it's used space. Don't worry about locking the
3505 * global_rsv, it doesn't change except when the transaction commits. 3502 * global_rsv, it doesn't change except when the transaction commits.
3506 */ 3503 */
3507 num_allocated += global_rsv->size; 3504 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
3505 num_allocated += global_rsv->size;
3508 3506
3509 /* 3507 /*
3510 * in limited mode, we want to have some free space up to 3508 * in limited mode, we want to have some free space up to
@@ -3518,15 +3516,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
3518 if (num_bytes - num_allocated < thresh) 3516 if (num_bytes - num_allocated < thresh)
3519 return 1; 3517 return 1;
3520 } 3518 }
3521 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3522 3519
3523 /* 256MB or 2% of the FS */ 3520 if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
3524 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
3525 /* system chunks need a much small threshold */
3526 if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
3527 thresh = 32 * 1024 * 1024;
3528
3529 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
3530 return 0; 3521 return 0;
3531 return 1; 3522 return 1;
3532} 3523}
@@ -3576,8 +3567,7 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
3576} 3567}
3577 3568
3578static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3569static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3579 struct btrfs_root *extent_root, u64 alloc_bytes, 3570 struct btrfs_root *extent_root, u64 flags, int force)
3580 u64 flags, int force)
3581{ 3571{
3582 struct btrfs_space_info *space_info; 3572 struct btrfs_space_info *space_info;
3583 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3573 struct btrfs_fs_info *fs_info = extent_root->fs_info;
@@ -3601,7 +3591,7 @@ again:
3601 return 0; 3591 return 0;
3602 } 3592 }
3603 3593
3604 if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) { 3594 if (!should_alloc_chunk(extent_root, space_info, force)) {
3605 spin_unlock(&space_info->lock); 3595 spin_unlock(&space_info->lock);
3606 return 0; 3596 return 0;
3607 } else if (space_info->chunk_alloc) { 3597 } else if (space_info->chunk_alloc) {
@@ -3669,6 +3659,46 @@ out:
3669 return ret; 3659 return ret;
3670} 3660}
3671 3661
3662static int can_overcommit(struct btrfs_root *root,
3663 struct btrfs_space_info *space_info, u64 bytes,
3664 int flush)
3665{
3666 u64 profile = btrfs_get_alloc_profile(root, 0);
3667 u64 avail;
3668 u64 used;
3669
3670 used = space_info->bytes_used + space_info->bytes_reserved +
3671 space_info->bytes_pinned + space_info->bytes_readonly +
3672 space_info->bytes_may_use;
3673
3674 spin_lock(&root->fs_info->free_chunk_lock);
3675 avail = root->fs_info->free_chunk_space;
3676 spin_unlock(&root->fs_info->free_chunk_lock);
3677
3678 /*
3679 * If we have dup, raid1 or raid10 then only half of the free
3680 * space is actually useable.
3681 */
3682 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3683 BTRFS_BLOCK_GROUP_RAID1 |
3684 BTRFS_BLOCK_GROUP_RAID10))
3685 avail >>= 1;
3686
3687 /*
3688 * If we aren't flushing don't let us overcommit too much, say
3689 * 1/8th of the space. If we can flush, let it overcommit up to
3690 * 1/2 of the space.
3691 */
3692 if (flush)
3693 avail >>= 3;
3694 else
3695 avail >>= 1;
3696
3697 if (used + bytes < space_info->total_bytes + avail)
3698 return 1;
3699 return 0;
3700}
3701
3672/* 3702/*
3673 * shrink metadata reservation for delalloc 3703 * shrink metadata reservation for delalloc
3674 */ 3704 */
@@ -3693,7 +3723,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3693 if (delalloc_bytes == 0) { 3723 if (delalloc_bytes == 0) {
3694 if (trans) 3724 if (trans)
3695 return; 3725 return;
3696 btrfs_wait_ordered_extents(root, 0, 0); 3726 btrfs_wait_ordered_extents(root, 0);
3697 return; 3727 return;
3698 } 3728 }
3699 3729
@@ -3703,11 +3733,15 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3703 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, 3733 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3704 WB_REASON_FS_FREE_SPACE); 3734 WB_REASON_FS_FREE_SPACE);
3705 3735
3736 /*
3737 * We need to wait for the async pages to actually start before
3738 * we do anything.
3739 */
3740 wait_event(root->fs_info->async_submit_wait,
3741 !atomic_read(&root->fs_info->async_delalloc_pages));
3742
3706 spin_lock(&space_info->lock); 3743 spin_lock(&space_info->lock);
3707 if (space_info->bytes_used + space_info->bytes_reserved + 3744 if (can_overcommit(root, space_info, orig, !trans)) {
3708 space_info->bytes_pinned + space_info->bytes_readonly +
3709 space_info->bytes_may_use + orig <=
3710 space_info->total_bytes) {
3711 spin_unlock(&space_info->lock); 3745 spin_unlock(&space_info->lock);
3712 break; 3746 break;
3713 } 3747 }
@@ -3715,7 +3749,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3715 3749
3716 loops++; 3750 loops++;
3717 if (wait_ordered && !trans) { 3751 if (wait_ordered && !trans) {
3718 btrfs_wait_ordered_extents(root, 0, 0); 3752 btrfs_wait_ordered_extents(root, 0);
3719 } else { 3753 } else {
3720 time_left = schedule_timeout_killable(1); 3754 time_left = schedule_timeout_killable(1);
3721 if (time_left) 3755 if (time_left)
@@ -3784,11 +3818,12 @@ commit:
3784} 3818}
3785 3819
3786enum flush_state { 3820enum flush_state {
3787 FLUSH_DELALLOC = 1, 3821 FLUSH_DELAYED_ITEMS_NR = 1,
3788 FLUSH_DELALLOC_WAIT = 2, 3822 FLUSH_DELAYED_ITEMS = 2,
3789 FLUSH_DELAYED_ITEMS_NR = 3, 3823 FLUSH_DELALLOC = 3,
3790 FLUSH_DELAYED_ITEMS = 4, 3824 FLUSH_DELALLOC_WAIT = 4,
3791 COMMIT_TRANS = 5, 3825 ALLOC_CHUNK = 5,
3826 COMMIT_TRANS = 6,
3792}; 3827};
3793 3828
3794static int flush_space(struct btrfs_root *root, 3829static int flush_space(struct btrfs_root *root,
@@ -3800,11 +3835,6 @@ static int flush_space(struct btrfs_root *root,
3800 int ret = 0; 3835 int ret = 0;
3801 3836
3802 switch (state) { 3837 switch (state) {
3803 case FLUSH_DELALLOC:
3804 case FLUSH_DELALLOC_WAIT:
3805 shrink_delalloc(root, num_bytes, orig_bytes,
3806 state == FLUSH_DELALLOC_WAIT);
3807 break;
3808 case FLUSH_DELAYED_ITEMS_NR: 3838 case FLUSH_DELAYED_ITEMS_NR:
3809 case FLUSH_DELAYED_ITEMS: 3839 case FLUSH_DELAYED_ITEMS:
3810 if (state == FLUSH_DELAYED_ITEMS_NR) { 3840 if (state == FLUSH_DELAYED_ITEMS_NR) {
@@ -3825,6 +3855,24 @@ static int flush_space(struct btrfs_root *root,
3825 ret = btrfs_run_delayed_items_nr(trans, root, nr); 3855 ret = btrfs_run_delayed_items_nr(trans, root, nr);
3826 btrfs_end_transaction(trans, root); 3856 btrfs_end_transaction(trans, root);
3827 break; 3857 break;
3858 case FLUSH_DELALLOC:
3859 case FLUSH_DELALLOC_WAIT:
3860 shrink_delalloc(root, num_bytes, orig_bytes,
3861 state == FLUSH_DELALLOC_WAIT);
3862 break;
3863 case ALLOC_CHUNK:
3864 trans = btrfs_join_transaction(root);
3865 if (IS_ERR(trans)) {
3866 ret = PTR_ERR(trans);
3867 break;
3868 }
3869 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3870 btrfs_get_alloc_profile(root, 0),
3871 CHUNK_ALLOC_NO_FORCE);
3872 btrfs_end_transaction(trans, root);
3873 if (ret == -ENOSPC)
3874 ret = 0;
3875 break;
3828 case COMMIT_TRANS: 3876 case COMMIT_TRANS:
3829 ret = may_commit_transaction(root, space_info, orig_bytes, 0); 3877 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3830 break; 3878 break;
@@ -3856,10 +3904,9 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
3856 struct btrfs_space_info *space_info = block_rsv->space_info; 3904 struct btrfs_space_info *space_info = block_rsv->space_info;
3857 u64 used; 3905 u64 used;
3858 u64 num_bytes = orig_bytes; 3906 u64 num_bytes = orig_bytes;
3859 int flush_state = FLUSH_DELALLOC; 3907 int flush_state = FLUSH_DELAYED_ITEMS_NR;
3860 int ret = 0; 3908 int ret = 0;
3861 bool flushing = false; 3909 bool flushing = false;
3862 bool committed = false;
3863 3910
3864again: 3911again:
3865 ret = 0; 3912 ret = 0;
@@ -3922,57 +3969,12 @@ again:
3922 (orig_bytes * 2); 3969 (orig_bytes * 2);
3923 } 3970 }
3924 3971
3925 if (ret) { 3972 if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
3926 u64 profile = btrfs_get_alloc_profile(root, 0); 3973 space_info->bytes_may_use += orig_bytes;
3927 u64 avail; 3974 trace_btrfs_space_reservation(root->fs_info, "space_info",
3928 3975 space_info->flags, orig_bytes,
3929 /* 3976 1);
3930 * If we have a lot of space that's pinned, don't bother doing 3977 ret = 0;
3931 * the overcommit dance yet and just commit the transaction.
3932 */
3933 avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3934 do_div(avail, 10);
3935 if (space_info->bytes_pinned >= avail && flush && !committed) {
3936 space_info->flush = 1;
3937 flushing = true;
3938 spin_unlock(&space_info->lock);
3939 ret = may_commit_transaction(root, space_info,
3940 orig_bytes, 1);
3941 if (ret)
3942 goto out;
3943 committed = true;
3944 goto again;
3945 }
3946
3947 spin_lock(&root->fs_info->free_chunk_lock);
3948 avail = root->fs_info->free_chunk_space;
3949
3950 /*
3951 * If we have dup, raid1 or raid10 then only half of the free
3952 * space is actually useable.
3953 */
3954 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3955 BTRFS_BLOCK_GROUP_RAID1 |
3956 BTRFS_BLOCK_GROUP_RAID10))
3957 avail >>= 1;
3958
3959 /*
3960 * If we aren't flushing don't let us overcommit too much, say
3961 * 1/8th of the space. If we can flush, let it overcommit up to
3962 * 1/2 of the space.
3963 */
3964 if (flush)
3965 avail >>= 3;
3966 else
3967 avail >>= 1;
3968 spin_unlock(&root->fs_info->free_chunk_lock);
3969
3970 if (used + num_bytes < space_info->total_bytes + avail) {
3971 space_info->bytes_may_use += orig_bytes;
3972 trace_btrfs_space_reservation(root->fs_info,
3973 "space_info", space_info->flags, orig_bytes, 1);
3974 ret = 0;
3975 }
3976 } 3978 }
3977 3979
3978 /* 3980 /*
@@ -4114,13 +4116,15 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
4114 return 0; 4116 return 0;
4115} 4117}
4116 4118
4117void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) 4119void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
4118{ 4120{
4119 memset(rsv, 0, sizeof(*rsv)); 4121 memset(rsv, 0, sizeof(*rsv));
4120 spin_lock_init(&rsv->lock); 4122 spin_lock_init(&rsv->lock);
4123 rsv->type = type;
4121} 4124}
4122 4125
4123struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) 4126struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
4127 unsigned short type)
4124{ 4128{
4125 struct btrfs_block_rsv *block_rsv; 4129 struct btrfs_block_rsv *block_rsv;
4126 struct btrfs_fs_info *fs_info = root->fs_info; 4130 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4129,7 +4133,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
4129 if (!block_rsv) 4133 if (!block_rsv)
4130 return NULL; 4134 return NULL;
4131 4135
4132 btrfs_init_block_rsv(block_rsv); 4136 btrfs_init_block_rsv(block_rsv, type);
4133 block_rsv->space_info = __find_space_info(fs_info, 4137 block_rsv->space_info = __find_space_info(fs_info,
4134 BTRFS_BLOCK_GROUP_METADATA); 4138 BTRFS_BLOCK_GROUP_METADATA);
4135 return block_rsv; 4139 return block_rsv;
@@ -4138,6 +4142,8 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
4138void btrfs_free_block_rsv(struct btrfs_root *root, 4142void btrfs_free_block_rsv(struct btrfs_root *root,
4139 struct btrfs_block_rsv *rsv) 4143 struct btrfs_block_rsv *rsv)
4140{ 4144{
4145 if (!rsv)
4146 return;
4141 btrfs_block_rsv_release(root, rsv, (u64)-1); 4147 btrfs_block_rsv_release(root, rsv, (u64)-1);
4142 kfree(rsv); 4148 kfree(rsv);
4143} 4149}
@@ -4416,10 +4422,10 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
4416 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); 4422 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4417 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; 4423 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
4418 /* 4424 /*
4419 * two for root back/forward refs, two for directory entries 4425 * two for root back/forward refs, two for directory entries,
4420 * and one for root of the snapshot. 4426 * one for root of the snapshot and one for parent inode.
4421 */ 4427 */
4422 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); 4428 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
4423 dst_rsv->space_info = src_rsv->space_info; 4429 dst_rsv->space_info = src_rsv->space_info;
4424 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4430 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4425} 4431}
@@ -5018,7 +5024,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5018 5024
5019 while (1) { 5025 while (1) {
5020 ret = find_first_extent_bit(unpin, 0, &start, &end, 5026 ret = find_first_extent_bit(unpin, 0, &start, &end,
5021 EXTENT_DIRTY); 5027 EXTENT_DIRTY, NULL);
5022 if (ret) 5028 if (ret)
5023 break; 5029 break;
5024 5030
@@ -5096,8 +5102,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5096 ret = remove_extent_backref(trans, extent_root, path, 5102 ret = remove_extent_backref(trans, extent_root, path,
5097 NULL, refs_to_drop, 5103 NULL, refs_to_drop,
5098 is_data); 5104 is_data);
5099 if (ret) 5105 if (ret) {
5100 goto abort; 5106 btrfs_abort_transaction(trans, extent_root, ret);
5107 goto out;
5108 }
5101 btrfs_release_path(path); 5109 btrfs_release_path(path);
5102 path->leave_spinning = 1; 5110 path->leave_spinning = 1;
5103 5111
@@ -5115,8 +5123,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5115 btrfs_print_leaf(extent_root, 5123 btrfs_print_leaf(extent_root,
5116 path->nodes[0]); 5124 path->nodes[0]);
5117 } 5125 }
5118 if (ret < 0) 5126 if (ret < 0) {
5119 goto abort; 5127 btrfs_abort_transaction(trans, extent_root, ret);
5128 goto out;
5129 }
5120 extent_slot = path->slots[0]; 5130 extent_slot = path->slots[0];
5121 } 5131 }
5122 } else if (ret == -ENOENT) { 5132 } else if (ret == -ENOENT) {
@@ -5130,7 +5140,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5130 (unsigned long long)owner_objectid, 5140 (unsigned long long)owner_objectid,
5131 (unsigned long long)owner_offset); 5141 (unsigned long long)owner_offset);
5132 } else { 5142 } else {
5133 goto abort; 5143 btrfs_abort_transaction(trans, extent_root, ret);
5144 goto out;
5134 } 5145 }
5135 5146
5136 leaf = path->nodes[0]; 5147 leaf = path->nodes[0];
@@ -5140,8 +5151,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5140 BUG_ON(found_extent || extent_slot != path->slots[0]); 5151 BUG_ON(found_extent || extent_slot != path->slots[0]);
5141 ret = convert_extent_item_v0(trans, extent_root, path, 5152 ret = convert_extent_item_v0(trans, extent_root, path,
5142 owner_objectid, 0); 5153 owner_objectid, 0);
5143 if (ret < 0) 5154 if (ret < 0) {
5144 goto abort; 5155 btrfs_abort_transaction(trans, extent_root, ret);
5156 goto out;
5157 }
5145 5158
5146 btrfs_release_path(path); 5159 btrfs_release_path(path);
5147 path->leave_spinning = 1; 5160 path->leave_spinning = 1;
@@ -5158,8 +5171,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5158 (unsigned long long)bytenr); 5171 (unsigned long long)bytenr);
5159 btrfs_print_leaf(extent_root, path->nodes[0]); 5172 btrfs_print_leaf(extent_root, path->nodes[0]);
5160 } 5173 }
5161 if (ret < 0) 5174 if (ret < 0) {
5162 goto abort; 5175 btrfs_abort_transaction(trans, extent_root, ret);
5176 goto out;
5177 }
5178
5163 extent_slot = path->slots[0]; 5179 extent_slot = path->slots[0];
5164 leaf = path->nodes[0]; 5180 leaf = path->nodes[0];
5165 item_size = btrfs_item_size_nr(leaf, extent_slot); 5181 item_size = btrfs_item_size_nr(leaf, extent_slot);
@@ -5196,8 +5212,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5196 ret = remove_extent_backref(trans, extent_root, path, 5212 ret = remove_extent_backref(trans, extent_root, path,
5197 iref, refs_to_drop, 5213 iref, refs_to_drop,
5198 is_data); 5214 is_data);
5199 if (ret) 5215 if (ret) {
5200 goto abort; 5216 btrfs_abort_transaction(trans, extent_root, ret);
5217 goto out;
5218 }
5201 } 5219 }
5202 } else { 5220 } else {
5203 if (found_extent) { 5221 if (found_extent) {
@@ -5214,27 +5232,29 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5214 5232
5215 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 5233 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
5216 num_to_del); 5234 num_to_del);
5217 if (ret) 5235 if (ret) {
5218 goto abort; 5236 btrfs_abort_transaction(trans, extent_root, ret);
5237 goto out;
5238 }
5219 btrfs_release_path(path); 5239 btrfs_release_path(path);
5220 5240
5221 if (is_data) { 5241 if (is_data) {
5222 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 5242 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
5223 if (ret) 5243 if (ret) {
5224 goto abort; 5244 btrfs_abort_transaction(trans, extent_root, ret);
5245 goto out;
5246 }
5225 } 5247 }
5226 5248
5227 ret = update_block_group(trans, root, bytenr, num_bytes, 0); 5249 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
5228 if (ret) 5250 if (ret) {
5229 goto abort; 5251 btrfs_abort_transaction(trans, extent_root, ret);
5252 goto out;
5253 }
5230 } 5254 }
5231out: 5255out:
5232 btrfs_free_path(path); 5256 btrfs_free_path(path);
5233 return ret; 5257 return ret;
5234
5235abort:
5236 btrfs_abort_transaction(trans, extent_root, ret);
5237 goto out;
5238} 5258}
5239 5259
5240/* 5260/*
@@ -5497,8 +5517,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5497 struct btrfs_block_group_cache *used_block_group; 5517 struct btrfs_block_group_cache *used_block_group;
5498 u64 search_start = 0; 5518 u64 search_start = 0;
5499 int empty_cluster = 2 * 1024 * 1024; 5519 int empty_cluster = 2 * 1024 * 1024;
5500 int allowed_chunk_alloc = 0;
5501 int done_chunk_alloc = 0;
5502 struct btrfs_space_info *space_info; 5520 struct btrfs_space_info *space_info;
5503 int loop = 0; 5521 int loop = 0;
5504 int index = 0; 5522 int index = 0;
@@ -5530,9 +5548,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5530 if (btrfs_mixed_space_info(space_info)) 5548 if (btrfs_mixed_space_info(space_info))
5531 use_cluster = false; 5549 use_cluster = false;
5532 5550
5533 if (orig_root->ref_cows || empty_size)
5534 allowed_chunk_alloc = 1;
5535
5536 if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { 5551 if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
5537 last_ptr = &root->fs_info->meta_alloc_cluster; 5552 last_ptr = &root->fs_info->meta_alloc_cluster;
5538 if (!btrfs_test_opt(root, SSD)) 5553 if (!btrfs_test_opt(root, SSD))
@@ -5806,10 +5821,6 @@ checks:
5806 5821
5807 trace_btrfs_reserve_extent(orig_root, block_group, 5822 trace_btrfs_reserve_extent(orig_root, block_group,
5808 search_start, num_bytes); 5823 search_start, num_bytes);
5809 if (offset < search_start)
5810 btrfs_add_free_space(used_block_group, offset,
5811 search_start - offset);
5812 BUG_ON(offset > search_start);
5813 if (used_block_group != block_group) 5824 if (used_block_group != block_group)
5814 btrfs_put_block_group(used_block_group); 5825 btrfs_put_block_group(used_block_group);
5815 btrfs_put_block_group(block_group); 5826 btrfs_put_block_group(block_group);
@@ -5842,34 +5853,17 @@ loop:
5842 index = 0; 5853 index = 0;
5843 loop++; 5854 loop++;
5844 if (loop == LOOP_ALLOC_CHUNK) { 5855 if (loop == LOOP_ALLOC_CHUNK) {
5845 if (allowed_chunk_alloc) { 5856 ret = do_chunk_alloc(trans, root, data,
5846 ret = do_chunk_alloc(trans, root, num_bytes + 5857 CHUNK_ALLOC_FORCE);
5847 2 * 1024 * 1024, data, 5858 /*
5848 CHUNK_ALLOC_LIMITED); 5859 * Do not bail out on ENOSPC since we
5849 /* 5860 * can do more things.
5850 * Do not bail out on ENOSPC since we 5861 */
5851 * can do more things. 5862 if (ret < 0 && ret != -ENOSPC) {
5852 */ 5863 btrfs_abort_transaction(trans,
5853 if (ret < 0 && ret != -ENOSPC) { 5864 root, ret);
5854 btrfs_abort_transaction(trans, 5865 goto out;
5855 root, ret);
5856 goto out;
5857 }
5858 allowed_chunk_alloc = 0;
5859 if (ret == 1)
5860 done_chunk_alloc = 1;
5861 } else if (!done_chunk_alloc &&
5862 space_info->force_alloc ==
5863 CHUNK_ALLOC_NO_FORCE) {
5864 space_info->force_alloc = CHUNK_ALLOC_LIMITED;
5865 } 5866 }
5866
5867 /*
5868 * We didn't allocate a chunk, go ahead and drop the
5869 * empty size and loop again.
5870 */
5871 if (!done_chunk_alloc)
5872 loop = LOOP_NO_EMPTY_SIZE;
5873 } 5867 }
5874 5868
5875 if (loop == LOOP_NO_EMPTY_SIZE) { 5869 if (loop == LOOP_NO_EMPTY_SIZE) {
@@ -5944,20 +5938,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
5944 5938
5945 data = btrfs_get_alloc_profile(root, data); 5939 data = btrfs_get_alloc_profile(root, data);
5946again: 5940again:
5947 /*
5948 * the only place that sets empty_size is btrfs_realloc_node, which
5949 * is not called recursively on allocations
5950 */
5951 if (empty_size || root->ref_cows) {
5952 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5953 num_bytes + 2 * 1024 * 1024, data,
5954 CHUNK_ALLOC_NO_FORCE);
5955 if (ret < 0 && ret != -ENOSPC) {
5956 btrfs_abort_transaction(trans, root, ret);
5957 return ret;
5958 }
5959 }
5960
5961 WARN_ON(num_bytes < root->sectorsize); 5941 WARN_ON(num_bytes < root->sectorsize);
5962 ret = find_free_extent(trans, root, num_bytes, empty_size, 5942 ret = find_free_extent(trans, root, num_bytes, empty_size,
5963 hint_byte, ins, data); 5943 hint_byte, ins, data);
@@ -5967,12 +5947,6 @@ again:
5967 num_bytes = num_bytes >> 1; 5947 num_bytes = num_bytes >> 1;
5968 num_bytes = num_bytes & ~(root->sectorsize - 1); 5948 num_bytes = num_bytes & ~(root->sectorsize - 1);
5969 num_bytes = max(num_bytes, min_alloc_size); 5949 num_bytes = max(num_bytes, min_alloc_size);
5970 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5971 num_bytes, data, CHUNK_ALLOC_FORCE);
5972 if (ret < 0 && ret != -ENOSPC) {
5973 btrfs_abort_transaction(trans, root, ret);
5974 return ret;
5975 }
5976 if (num_bytes == min_alloc_size) 5950 if (num_bytes == min_alloc_size)
5977 final_tried = true; 5951 final_tried = true;
5978 goto again; 5952 goto again;
@@ -6314,7 +6288,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6314 ret = block_rsv_use_bytes(block_rsv, blocksize); 6288 ret = block_rsv_use_bytes(block_rsv, blocksize);
6315 if (!ret) 6289 if (!ret)
6316 return block_rsv; 6290 return block_rsv;
6317 if (ret) { 6291 if (ret && !block_rsv->failfast) {
6318 static DEFINE_RATELIMIT_STATE(_rs, 6292 static DEFINE_RATELIMIT_STATE(_rs,
6319 DEFAULT_RATELIMIT_INTERVAL, 6293 DEFAULT_RATELIMIT_INTERVAL,
6320 /*DEFAULT_RATELIMIT_BURST*/ 2); 6294 /*DEFAULT_RATELIMIT_BURST*/ 2);
@@ -7279,7 +7253,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
7279 7253
7280 alloc_flags = update_block_group_flags(root, cache->flags); 7254 alloc_flags = update_block_group_flags(root, cache->flags);
7281 if (alloc_flags != cache->flags) { 7255 if (alloc_flags != cache->flags) {
7282 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 7256 ret = do_chunk_alloc(trans, root, alloc_flags,
7283 CHUNK_ALLOC_FORCE); 7257 CHUNK_ALLOC_FORCE);
7284 if (ret < 0) 7258 if (ret < 0)
7285 goto out; 7259 goto out;
@@ -7289,7 +7263,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
7289 if (!ret) 7263 if (!ret)
7290 goto out; 7264 goto out;
7291 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 7265 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7292 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 7266 ret = do_chunk_alloc(trans, root, alloc_flags,
7293 CHUNK_ALLOC_FORCE); 7267 CHUNK_ALLOC_FORCE);
7294 if (ret < 0) 7268 if (ret < 0)
7295 goto out; 7269 goto out;
@@ -7303,7 +7277,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
7303 struct btrfs_root *root, u64 type) 7277 struct btrfs_root *root, u64 type)
7304{ 7278{
7305 u64 alloc_flags = get_alloc_profile(root, type); 7279 u64 alloc_flags = get_alloc_profile(root, type);
7306 return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 7280 return do_chunk_alloc(trans, root, alloc_flags,
7307 CHUNK_ALLOC_FORCE); 7281 CHUNK_ALLOC_FORCE);
7308} 7282}
7309 7283
@@ -7810,6 +7784,34 @@ error:
7810 return ret; 7784 return ret;
7811} 7785}
7812 7786
7787void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
7788 struct btrfs_root *root)
7789{
7790 struct btrfs_block_group_cache *block_group, *tmp;
7791 struct btrfs_root *extent_root = root->fs_info->extent_root;
7792 struct btrfs_block_group_item item;
7793 struct btrfs_key key;
7794 int ret = 0;
7795
7796 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
7797 new_bg_list) {
7798 list_del_init(&block_group->new_bg_list);
7799
7800 if (ret)
7801 continue;
7802
7803 spin_lock(&block_group->lock);
7804 memcpy(&item, &block_group->item, sizeof(item));
7805 memcpy(&key, &block_group->key, sizeof(key));
7806 spin_unlock(&block_group->lock);
7807
7808 ret = btrfs_insert_item(trans, extent_root, &key, &item,
7809 sizeof(item));
7810 if (ret)
7811 btrfs_abort_transaction(trans, extent_root, ret);
7812 }
7813}
7814
7813int btrfs_make_block_group(struct btrfs_trans_handle *trans, 7815int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7814 struct btrfs_root *root, u64 bytes_used, 7816 struct btrfs_root *root, u64 bytes_used,
7815 u64 type, u64 chunk_objectid, u64 chunk_offset, 7817 u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -7843,6 +7845,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7843 spin_lock_init(&cache->lock); 7845 spin_lock_init(&cache->lock);
7844 INIT_LIST_HEAD(&cache->list); 7846 INIT_LIST_HEAD(&cache->list);
7845 INIT_LIST_HEAD(&cache->cluster_list); 7847 INIT_LIST_HEAD(&cache->cluster_list);
7848 INIT_LIST_HEAD(&cache->new_bg_list);
7846 7849
7847 btrfs_init_free_space_ctl(cache); 7850 btrfs_init_free_space_ctl(cache);
7848 7851
@@ -7874,12 +7877,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7874 ret = btrfs_add_block_group_cache(root->fs_info, cache); 7877 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7875 BUG_ON(ret); /* Logic error */ 7878 BUG_ON(ret); /* Logic error */
7876 7879
7877 ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, 7880 list_add_tail(&cache->new_bg_list, &trans->new_bgs);
7878 sizeof(cache->item));
7879 if (ret) {
7880 btrfs_abort_transaction(trans, extent_root, ret);
7881 return ret;
7882 }
7883 7881
7884 set_avail_alloc_bits(extent_root->fs_info, type); 7882 set_avail_alloc_bits(extent_root->fs_info, type);
7885 7883
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4c878476bb91..8036d3a84853 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -45,6 +45,7 @@ struct extent_page_data {
45 struct bio *bio; 45 struct bio *bio;
46 struct extent_io_tree *tree; 46 struct extent_io_tree *tree;
47 get_extent_t *get_extent; 47 get_extent_t *get_extent;
48 unsigned long bio_flags;
48 49
49 /* tells writepage not to lock the state bits for this range 50 /* tells writepage not to lock the state bits for this range
50 * it still does the unlocking 51 * it still does the unlocking
@@ -64,13 +65,13 @@ tree_fs_info(struct extent_io_tree *tree)
64 65
65int __init extent_io_init(void) 66int __init extent_io_init(void)
66{ 67{
67 extent_state_cache = kmem_cache_create("extent_state", 68 extent_state_cache = kmem_cache_create("btrfs_extent_state",
68 sizeof(struct extent_state), 0, 69 sizeof(struct extent_state), 0,
69 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 70 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
70 if (!extent_state_cache) 71 if (!extent_state_cache)
71 return -ENOMEM; 72 return -ENOMEM;
72 73
73 extent_buffer_cache = kmem_cache_create("extent_buffers", 74 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
74 sizeof(struct extent_buffer), 0, 75 sizeof(struct extent_buffer), 0,
75 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 76 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
76 if (!extent_buffer_cache) 77 if (!extent_buffer_cache)
@@ -107,6 +108,12 @@ void extent_io_exit(void)
107 list_del(&eb->leak_list); 108 list_del(&eb->leak_list);
108 kmem_cache_free(extent_buffer_cache, eb); 109 kmem_cache_free(extent_buffer_cache, eb);
109 } 110 }
111
112 /*
113 * Make sure all delayed rcu free are flushed before we
114 * destroy caches.
115 */
116 rcu_barrier();
110 if (extent_state_cache) 117 if (extent_state_cache)
111 kmem_cache_destroy(extent_state_cache); 118 kmem_cache_destroy(extent_state_cache);
112 if (extent_buffer_cache) 119 if (extent_buffer_cache)
@@ -936,6 +943,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
936 * @end: the end offset in bytes (inclusive) 943 * @end: the end offset in bytes (inclusive)
937 * @bits: the bits to set in this range 944 * @bits: the bits to set in this range
938 * @clear_bits: the bits to clear in this range 945 * @clear_bits: the bits to clear in this range
946 * @cached_state: state that we're going to cache
939 * @mask: the allocation mask 947 * @mask: the allocation mask
940 * 948 *
941 * This will go through and set bits for the given range. If any states exist 949 * This will go through and set bits for the given range. If any states exist
@@ -945,7 +953,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
945 * boundary bits like LOCK. 953 * boundary bits like LOCK.
946 */ 954 */
947int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 955int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
948 int bits, int clear_bits, gfp_t mask) 956 int bits, int clear_bits,
957 struct extent_state **cached_state, gfp_t mask)
949{ 958{
950 struct extent_state *state; 959 struct extent_state *state;
951 struct extent_state *prealloc = NULL; 960 struct extent_state *prealloc = NULL;
@@ -962,6 +971,15 @@ again:
962 } 971 }
963 972
964 spin_lock(&tree->lock); 973 spin_lock(&tree->lock);
974 if (cached_state && *cached_state) {
975 state = *cached_state;
976 if (state->start <= start && state->end > start &&
977 state->tree) {
978 node = &state->rb_node;
979 goto hit_next;
980 }
981 }
982
965 /* 983 /*
966 * this search will find all the extents that end after 984 * this search will find all the extents that end after
967 * our range starts. 985 * our range starts.
@@ -992,6 +1010,7 @@ hit_next:
992 */ 1010 */
993 if (state->start == start && state->end <= end) { 1011 if (state->start == start && state->end <= end) {
994 set_state_bits(tree, state, &bits); 1012 set_state_bits(tree, state, &bits);
1013 cache_state(state, cached_state);
995 state = clear_state_bit(tree, state, &clear_bits, 0); 1014 state = clear_state_bit(tree, state, &clear_bits, 0);
996 if (last_end == (u64)-1) 1015 if (last_end == (u64)-1)
997 goto out; 1016 goto out;
@@ -1032,6 +1051,7 @@ hit_next:
1032 goto out; 1051 goto out;
1033 if (state->end <= end) { 1052 if (state->end <= end) {
1034 set_state_bits(tree, state, &bits); 1053 set_state_bits(tree, state, &bits);
1054 cache_state(state, cached_state);
1035 state = clear_state_bit(tree, state, &clear_bits, 0); 1055 state = clear_state_bit(tree, state, &clear_bits, 0);
1036 if (last_end == (u64)-1) 1056 if (last_end == (u64)-1)
1037 goto out; 1057 goto out;
@@ -1070,6 +1090,7 @@ hit_next:
1070 &bits); 1090 &bits);
1071 if (err) 1091 if (err)
1072 extent_io_tree_panic(tree, err); 1092 extent_io_tree_panic(tree, err);
1093 cache_state(prealloc, cached_state);
1073 prealloc = NULL; 1094 prealloc = NULL;
1074 start = this_end + 1; 1095 start = this_end + 1;
1075 goto search_again; 1096 goto search_again;
@@ -1092,6 +1113,7 @@ hit_next:
1092 extent_io_tree_panic(tree, err); 1113 extent_io_tree_panic(tree, err);
1093 1114
1094 set_state_bits(tree, prealloc, &bits); 1115 set_state_bits(tree, prealloc, &bits);
1116 cache_state(prealloc, cached_state);
1095 clear_state_bit(tree, prealloc, &clear_bits, 0); 1117 clear_state_bit(tree, prealloc, &clear_bits, 0);
1096 prealloc = NULL; 1118 prealloc = NULL;
1097 goto out; 1119 goto out;
@@ -1144,6 +1166,14 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
1144 NULL, cached_state, mask); 1166 NULL, cached_state, mask);
1145} 1167}
1146 1168
1169int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
1170 struct extent_state **cached_state, gfp_t mask)
1171{
1172 return set_extent_bit(tree, start, end,
1173 EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
1174 NULL, cached_state, mask);
1175}
1176
1147int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1177int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1148 gfp_t mask) 1178 gfp_t mask)
1149{ 1179{
@@ -1288,18 +1318,42 @@ out:
1288 * If nothing was found, 1 is returned. If found something, return 0. 1318 * If nothing was found, 1 is returned. If found something, return 0.
1289 */ 1319 */
1290int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1320int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1291 u64 *start_ret, u64 *end_ret, int bits) 1321 u64 *start_ret, u64 *end_ret, int bits,
1322 struct extent_state **cached_state)
1292{ 1323{
1293 struct extent_state *state; 1324 struct extent_state *state;
1325 struct rb_node *n;
1294 int ret = 1; 1326 int ret = 1;
1295 1327
1296 spin_lock(&tree->lock); 1328 spin_lock(&tree->lock);
1329 if (cached_state && *cached_state) {
1330 state = *cached_state;
1331 if (state->end == start - 1 && state->tree) {
1332 n = rb_next(&state->rb_node);
1333 while (n) {
1334 state = rb_entry(n, struct extent_state,
1335 rb_node);
1336 if (state->state & bits)
1337 goto got_it;
1338 n = rb_next(n);
1339 }
1340 free_extent_state(*cached_state);
1341 *cached_state = NULL;
1342 goto out;
1343 }
1344 free_extent_state(*cached_state);
1345 *cached_state = NULL;
1346 }
1347
1297 state = find_first_extent_bit_state(tree, start, bits); 1348 state = find_first_extent_bit_state(tree, start, bits);
1349got_it:
1298 if (state) { 1350 if (state) {
1351 cache_state(state, cached_state);
1299 *start_ret = state->start; 1352 *start_ret = state->start;
1300 *end_ret = state->end; 1353 *end_ret = state->end;
1301 ret = 0; 1354 ret = 0;
1302 } 1355 }
1356out:
1303 spin_unlock(&tree->lock); 1357 spin_unlock(&tree->lock);
1304 return ret; 1358 return ret;
1305} 1359}
@@ -2062,7 +2116,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
2062 } 2116 }
2063 read_unlock(&em_tree->lock); 2117 read_unlock(&em_tree->lock);
2064 2118
2065 if (!em || IS_ERR(em)) { 2119 if (!em) {
2066 kfree(failrec); 2120 kfree(failrec);
2067 return -EIO; 2121 return -EIO;
2068 } 2122 }
@@ -2298,8 +2352,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2298 struct extent_state *cached = NULL; 2352 struct extent_state *cached = NULL;
2299 struct extent_state *state; 2353 struct extent_state *state;
2300 2354
2301 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, " 2355 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
2302 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err, 2356 "mirror=%ld\n", (u64)bio->bi_sector, err,
2303 (long int)bio->bi_bdev); 2357 (long int)bio->bi_bdev);
2304 tree = &BTRFS_I(page->mapping->host)->io_tree; 2358 tree = &BTRFS_I(page->mapping->host)->io_tree;
2305 2359
@@ -2703,12 +2757,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2703 end_bio_extent_readpage, mirror_num, 2757 end_bio_extent_readpage, mirror_num,
2704 *bio_flags, 2758 *bio_flags,
2705 this_bio_flag); 2759 this_bio_flag);
2706 BUG_ON(ret == -ENOMEM); 2760 if (!ret) {
2707 nr++; 2761 nr++;
2708 *bio_flags = this_bio_flag; 2762 *bio_flags = this_bio_flag;
2763 }
2709 } 2764 }
2710 if (ret) 2765 if (ret) {
2711 SetPageError(page); 2766 SetPageError(page);
2767 unlock_extent(tree, cur, cur + iosize - 1);
2768 }
2712 cur = cur + iosize; 2769 cur = cur + iosize;
2713 pg_offset += iosize; 2770 pg_offset += iosize;
2714 } 2771 }
@@ -3155,12 +3212,16 @@ static int write_one_eb(struct extent_buffer *eb,
3155 struct block_device *bdev = fs_info->fs_devices->latest_bdev; 3212 struct block_device *bdev = fs_info->fs_devices->latest_bdev;
3156 u64 offset = eb->start; 3213 u64 offset = eb->start;
3157 unsigned long i, num_pages; 3214 unsigned long i, num_pages;
3215 unsigned long bio_flags = 0;
3158 int rw = (epd->sync_io ? WRITE_SYNC : WRITE); 3216 int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
3159 int ret = 0; 3217 int ret = 0;
3160 3218
3161 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3219 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3162 num_pages = num_extent_pages(eb->start, eb->len); 3220 num_pages = num_extent_pages(eb->start, eb->len);
3163 atomic_set(&eb->io_pages, num_pages); 3221 atomic_set(&eb->io_pages, num_pages);
3222 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
3223 bio_flags = EXTENT_BIO_TREE_LOG;
3224
3164 for (i = 0; i < num_pages; i++) { 3225 for (i = 0; i < num_pages; i++) {
3165 struct page *p = extent_buffer_page(eb, i); 3226 struct page *p = extent_buffer_page(eb, i);
3166 3227
@@ -3169,7 +3230,8 @@ static int write_one_eb(struct extent_buffer *eb,
3169 ret = submit_extent_page(rw, eb->tree, p, offset >> 9, 3230 ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
3170 PAGE_CACHE_SIZE, 0, bdev, &epd->bio, 3231 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
3171 -1, end_bio_extent_buffer_writepage, 3232 -1, end_bio_extent_buffer_writepage,
3172 0, 0, 0); 3233 0, epd->bio_flags, bio_flags);
3234 epd->bio_flags = bio_flags;
3173 if (ret) { 3235 if (ret) {
3174 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3236 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3175 SetPageError(p); 3237 SetPageError(p);
@@ -3204,6 +3266,7 @@ int btree_write_cache_pages(struct address_space *mapping,
3204 .tree = tree, 3266 .tree = tree,
3205 .extent_locked = 0, 3267 .extent_locked = 0,
3206 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3268 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3269 .bio_flags = 0,
3207 }; 3270 };
3208 int ret = 0; 3271 int ret = 0;
3209 int done = 0; 3272 int done = 0;
@@ -3248,19 +3311,34 @@ retry:
3248 break; 3311 break;
3249 } 3312 }
3250 3313
3314 spin_lock(&mapping->private_lock);
3315 if (!PagePrivate(page)) {
3316 spin_unlock(&mapping->private_lock);
3317 continue;
3318 }
3319
3251 eb = (struct extent_buffer *)page->private; 3320 eb = (struct extent_buffer *)page->private;
3321
3322 /*
3323 * Shouldn't happen and normally this would be a BUG_ON
3324 * but no sense in crashing the users box for something
3325 * we can survive anyway.
3326 */
3252 if (!eb) { 3327 if (!eb) {
3328 spin_unlock(&mapping->private_lock);
3253 WARN_ON(1); 3329 WARN_ON(1);
3254 continue; 3330 continue;
3255 } 3331 }
3256 3332
3257 if (eb == prev_eb) 3333 if (eb == prev_eb) {
3334 spin_unlock(&mapping->private_lock);
3258 continue; 3335 continue;
3336 }
3259 3337
3260 if (!atomic_inc_not_zero(&eb->refs)) { 3338 ret = atomic_inc_not_zero(&eb->refs);
3261 WARN_ON(1); 3339 spin_unlock(&mapping->private_lock);
3340 if (!ret)
3262 continue; 3341 continue;
3263 }
3264 3342
3265 prev_eb = eb; 3343 prev_eb = eb;
3266 ret = lock_extent_buffer_for_io(eb, fs_info, &epd); 3344 ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
@@ -3451,7 +3529,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd)
3451 if (epd->sync_io) 3529 if (epd->sync_io)
3452 rw = WRITE_SYNC; 3530 rw = WRITE_SYNC;
3453 3531
3454 ret = submit_one_bio(rw, epd->bio, 0, 0); 3532 ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
3455 BUG_ON(ret < 0); /* -ENOMEM */ 3533 BUG_ON(ret < 0); /* -ENOMEM */
3456 epd->bio = NULL; 3534 epd->bio = NULL;
3457 } 3535 }
@@ -3474,6 +3552,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
3474 .get_extent = get_extent, 3552 .get_extent = get_extent,
3475 .extent_locked = 0, 3553 .extent_locked = 0,
3476 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3554 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3555 .bio_flags = 0,
3477 }; 3556 };
3478 3557
3479 ret = __extent_writepage(page, wbc, &epd); 3558 ret = __extent_writepage(page, wbc, &epd);
@@ -3498,6 +3577,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
3498 .get_extent = get_extent, 3577 .get_extent = get_extent,
3499 .extent_locked = 1, 3578 .extent_locked = 1,
3500 .sync_io = mode == WB_SYNC_ALL, 3579 .sync_io = mode == WB_SYNC_ALL,
3580 .bio_flags = 0,
3501 }; 3581 };
3502 struct writeback_control wbc_writepages = { 3582 struct writeback_control wbc_writepages = {
3503 .sync_mode = mode, 3583 .sync_mode = mode,
@@ -3537,6 +3617,7 @@ int extent_writepages(struct extent_io_tree *tree,
3537 .get_extent = get_extent, 3617 .get_extent = get_extent,
3538 .extent_locked = 0, 3618 .extent_locked = 0,
3539 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3619 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3620 .bio_flags = 0,
3540 }; 3621 };
3541 3622
3542 ret = extent_write_cache_pages(tree, mapping, wbc, 3623 ret = extent_write_cache_pages(tree, mapping, wbc,
@@ -3914,18 +3995,6 @@ out:
3914 return ret; 3995 return ret;
3915} 3996}
3916 3997
3917inline struct page *extent_buffer_page(struct extent_buffer *eb,
3918 unsigned long i)
3919{
3920 return eb->pages[i];
3921}
3922
3923inline unsigned long num_extent_pages(u64 start, u64 len)
3924{
3925 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3926 (start >> PAGE_CACHE_SHIFT);
3927}
3928
3929static void __free_extent_buffer(struct extent_buffer *eb) 3998static void __free_extent_buffer(struct extent_buffer *eb)
3930{ 3999{
3931#if LEAK_DEBUG 4000#if LEAK_DEBUG
@@ -4041,7 +4110,7 @@ struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
4041 4110
4042 return eb; 4111 return eb;
4043err: 4112err:
4044 for (i--; i > 0; i--) 4113 for (i--; i >= 0; i--)
4045 __free_page(eb->pages[i]); 4114 __free_page(eb->pages[i]);
4046 __free_extent_buffer(eb); 4115 __free_extent_buffer(eb);
4047 return NULL; 4116 return NULL;
@@ -4186,10 +4255,8 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
4186 4255
4187 for (i = 0; i < num_pages; i++, index++) { 4256 for (i = 0; i < num_pages; i++, index++) {
4188 p = find_or_create_page(mapping, index, GFP_NOFS); 4257 p = find_or_create_page(mapping, index, GFP_NOFS);
4189 if (!p) { 4258 if (!p)
4190 WARN_ON(1);
4191 goto free_eb; 4259 goto free_eb;
4192 }
4193 4260
4194 spin_lock(&mapping->private_lock); 4261 spin_lock(&mapping->private_lock);
4195 if (PagePrivate(p)) { 4262 if (PagePrivate(p)) {
@@ -4332,7 +4399,6 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4332 4399
4333 /* Should be safe to release our pages at this point */ 4400 /* Should be safe to release our pages at this point */
4334 btrfs_release_extent_buffer_page(eb, 0); 4401 btrfs_release_extent_buffer_page(eb, 0);
4335
4336 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 4402 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4337 return 1; 4403 return 1;
4338 } 4404 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 25900af5b15d..711d12b80028 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -27,6 +27,7 @@
27 * type for this bio 27 * type for this bio
28 */ 28 */
29#define EXTENT_BIO_COMPRESSED 1 29#define EXTENT_BIO_COMPRESSED 1
30#define EXTENT_BIO_TREE_LOG 2
30#define EXTENT_BIO_FLAG_SHIFT 16 31#define EXTENT_BIO_FLAG_SHIFT 16
31 32
32/* these are bit numbers for test/set bit */ 33/* these are bit numbers for test/set bit */
@@ -232,11 +233,15 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
232int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 233int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
233 gfp_t mask); 234 gfp_t mask);
234int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 235int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
235 int bits, int clear_bits, gfp_t mask); 236 int bits, int clear_bits,
237 struct extent_state **cached_state, gfp_t mask);
236int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 238int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
237 struct extent_state **cached_state, gfp_t mask); 239 struct extent_state **cached_state, gfp_t mask);
240int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
241 struct extent_state **cached_state, gfp_t mask);
238int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 242int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
239 u64 *start_ret, u64 *end_ret, int bits); 243 u64 *start_ret, u64 *end_ret, int bits,
244 struct extent_state **cached_state);
240struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, 245struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
241 u64 start, int bits); 246 u64 start, int bits);
242int extent_invalidatepage(struct extent_io_tree *tree, 247int extent_invalidatepage(struct extent_io_tree *tree,
@@ -277,8 +282,18 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
277int read_extent_buffer_pages(struct extent_io_tree *tree, 282int read_extent_buffer_pages(struct extent_io_tree *tree,
278 struct extent_buffer *eb, u64 start, int wait, 283 struct extent_buffer *eb, u64 start, int wait,
279 get_extent_t *get_extent, int mirror_num); 284 get_extent_t *get_extent, int mirror_num);
280unsigned long num_extent_pages(u64 start, u64 len); 285
281struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i); 286static inline unsigned long num_extent_pages(u64 start, u64 len)
287{
288 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
289 (start >> PAGE_CACHE_SHIFT);
290}
291
292static inline struct page *extent_buffer_page(struct extent_buffer *eb,
293 unsigned long i)
294{
295 return eb->pages[i];
296}
282 297
283static inline void extent_buffer_get(struct extent_buffer *eb) 298static inline void extent_buffer_get(struct extent_buffer *eb)
284{ 299{
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 7c97b3301459..b8cbc8d5c7f7 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -11,7 +11,7 @@ static struct kmem_cache *extent_map_cache;
11 11
12int __init extent_map_init(void) 12int __init extent_map_init(void)
13{ 13{
14 extent_map_cache = kmem_cache_create("extent_map", 14 extent_map_cache = kmem_cache_create("btrfs_extent_map",
15 sizeof(struct extent_map), 0, 15 sizeof(struct extent_map), 0,
16 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 16 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
17 if (!extent_map_cache) 17 if (!extent_map_cache)
@@ -35,6 +35,7 @@ void extent_map_exit(void)
35void extent_map_tree_init(struct extent_map_tree *tree) 35void extent_map_tree_init(struct extent_map_tree *tree)
36{ 36{
37 tree->map = RB_ROOT; 37 tree->map = RB_ROOT;
38 INIT_LIST_HEAD(&tree->modified_extents);
38 rwlock_init(&tree->lock); 39 rwlock_init(&tree->lock);
39} 40}
40 41
@@ -54,7 +55,9 @@ struct extent_map *alloc_extent_map(void)
54 em->in_tree = 0; 55 em->in_tree = 0;
55 em->flags = 0; 56 em->flags = 0;
56 em->compress_type = BTRFS_COMPRESS_NONE; 57 em->compress_type = BTRFS_COMPRESS_NONE;
58 em->generation = 0;
57 atomic_set(&em->refs, 1); 59 atomic_set(&em->refs, 1);
60 INIT_LIST_HEAD(&em->list);
58 return em; 61 return em;
59} 62}
60 63
@@ -72,6 +75,7 @@ void free_extent_map(struct extent_map *em)
72 WARN_ON(atomic_read(&em->refs) == 0); 75 WARN_ON(atomic_read(&em->refs) == 0);
73 if (atomic_dec_and_test(&em->refs)) { 76 if (atomic_dec_and_test(&em->refs)) {
74 WARN_ON(em->in_tree); 77 WARN_ON(em->in_tree);
78 WARN_ON(!list_empty(&em->list));
75 kmem_cache_free(extent_map_cache, em); 79 kmem_cache_free(extent_map_cache, em);
76 } 80 }
77} 81}
@@ -198,6 +202,14 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
198 em->block_len += merge->block_len; 202 em->block_len += merge->block_len;
199 em->block_start = merge->block_start; 203 em->block_start = merge->block_start;
200 merge->in_tree = 0; 204 merge->in_tree = 0;
205 if (merge->generation > em->generation) {
206 em->mod_start = em->start;
207 em->mod_len = em->len;
208 em->generation = merge->generation;
209 list_move(&em->list, &tree->modified_extents);
210 }
211
212 list_del_init(&merge->list);
201 rb_erase(&merge->rb_node, &tree->map); 213 rb_erase(&merge->rb_node, &tree->map);
202 free_extent_map(merge); 214 free_extent_map(merge);
203 } 215 }
@@ -211,14 +223,34 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
211 em->block_len += merge->len; 223 em->block_len += merge->len;
212 rb_erase(&merge->rb_node, &tree->map); 224 rb_erase(&merge->rb_node, &tree->map);
213 merge->in_tree = 0; 225 merge->in_tree = 0;
226 if (merge->generation > em->generation) {
227 em->mod_len = em->len;
228 em->generation = merge->generation;
229 list_move(&em->list, &tree->modified_extents);
230 }
231 list_del_init(&merge->list);
214 free_extent_map(merge); 232 free_extent_map(merge);
215 } 233 }
216} 234}
217 235
218int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) 236/**
237 * unpint_extent_cache - unpin an extent from the cache
238 * @tree: tree to unpin the extent in
239 * @start: logical offset in the file
240 * @len: length of the extent
241 * @gen: generation that this extent has been modified in
242 * @prealloc: if this is set we need to clear the prealloc flag
243 *
244 * Called after an extent has been written to disk properly. Set the generation
245 * to the generation that actually added the file item to the inode so we know
246 * we need to sync this extent when we call fsync().
247 */
248int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
249 u64 gen)
219{ 250{
220 int ret = 0; 251 int ret = 0;
221 struct extent_map *em; 252 struct extent_map *em;
253 bool prealloc = false;
222 254
223 write_lock(&tree->lock); 255 write_lock(&tree->lock);
224 em = lookup_extent_mapping(tree, start, len); 256 em = lookup_extent_mapping(tree, start, len);
@@ -228,10 +260,24 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
228 if (!em) 260 if (!em)
229 goto out; 261 goto out;
230 262
263 list_move(&em->list, &tree->modified_extents);
264 em->generation = gen;
231 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 265 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
266 em->mod_start = em->start;
267 em->mod_len = em->len;
268
269 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
270 prealloc = true;
271 clear_bit(EXTENT_FLAG_PREALLOC, &em->flags);
272 }
232 273
233 try_merge_map(tree, em); 274 try_merge_map(tree, em);
234 275
276 if (prealloc) {
277 em->mod_start = em->start;
278 em->mod_len = em->len;
279 }
280
235 free_extent_map(em); 281 free_extent_map(em);
236out: 282out:
237 write_unlock(&tree->lock); 283 write_unlock(&tree->lock);
@@ -269,6 +315,9 @@ int add_extent_mapping(struct extent_map_tree *tree,
269 } 315 }
270 atomic_inc(&em->refs); 316 atomic_inc(&em->refs);
271 317
318 em->mod_start = em->start;
319 em->mod_len = em->len;
320
272 try_merge_map(tree, em); 321 try_merge_map(tree, em);
273out: 322out:
274 return ret; 323 return ret;
@@ -358,6 +407,8 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
358 407
359 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); 408 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
360 rb_erase(&em->rb_node, &tree->map); 409 rb_erase(&em->rb_node, &tree->map);
410 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
411 list_del_init(&em->list);
361 em->in_tree = 0; 412 em->in_tree = 0;
362 return ret; 413 return ret;
363} 414}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 1195f09761fe..679225555f7b 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -13,6 +13,7 @@
13#define EXTENT_FLAG_COMPRESSED 1 13#define EXTENT_FLAG_COMPRESSED 1
14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ 14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ 15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
16#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
16 17
17struct extent_map { 18struct extent_map {
18 struct rb_node rb_node; 19 struct rb_node rb_node;
@@ -20,18 +21,23 @@ struct extent_map {
20 /* all of these are in bytes */ 21 /* all of these are in bytes */
21 u64 start; 22 u64 start;
22 u64 len; 23 u64 len;
24 u64 mod_start;
25 u64 mod_len;
23 u64 orig_start; 26 u64 orig_start;
24 u64 block_start; 27 u64 block_start;
25 u64 block_len; 28 u64 block_len;
29 u64 generation;
26 unsigned long flags; 30 unsigned long flags;
27 struct block_device *bdev; 31 struct block_device *bdev;
28 atomic_t refs; 32 atomic_t refs;
29 unsigned int in_tree; 33 unsigned int in_tree;
30 unsigned int compress_type; 34 unsigned int compress_type;
35 struct list_head list;
31}; 36};
32 37
33struct extent_map_tree { 38struct extent_map_tree {
34 struct rb_root map; 39 struct rb_root map;
40 struct list_head modified_extents;
35 rwlock_t lock; 41 rwlock_t lock;
36}; 42};
37 43
@@ -60,7 +66,7 @@ struct extent_map *alloc_extent_map(void);
60void free_extent_map(struct extent_map *em); 66void free_extent_map(struct extent_map *em);
61int __init extent_map_init(void); 67int __init extent_map_init(void);
62void extent_map_exit(void); 68void extent_map_exit(void);
63int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len); 69int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
64struct extent_map *search_extent_mapping(struct extent_map_tree *tree, 70struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
65 u64 start, u64 len); 71 u64 start, u64 len);
66#endif 72#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 857d93cd01dc..1ad08e4e4a15 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -25,11 +25,12 @@
25#include "transaction.h" 25#include "transaction.h"
26#include "print-tree.h" 26#include "print-tree.h"
27 27
28#define __MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ 28#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
29 sizeof(struct btrfs_item) * 2) / \ 29 sizeof(struct btrfs_item) * 2) / \
30 size) - 1)) 30 size) - 1))
31 31
32#define MAX_CSUM_ITEMS(r, size) (min(__MAX_CSUM_ITEMS(r, size), PAGE_CACHE_SIZE)) 32#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
33 PAGE_CACHE_SIZE))
33 34
34#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ 35#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
35 sizeof(struct btrfs_ordered_sum)) / \ 36 sizeof(struct btrfs_ordered_sum)) / \
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5caf285c6e4d..9ab1bed88116 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -39,6 +39,7 @@
39#include "tree-log.h" 39#include "tree-log.h"
40#include "locking.h" 40#include "locking.h"
41#include "compat.h" 41#include "compat.h"
42#include "volumes.h"
42 43
43/* 44/*
44 * when auto defrag is enabled we 45 * when auto defrag is enabled we
@@ -458,14 +459,15 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
458 * this drops all the extents in the cache that intersect the range 459 * this drops all the extents in the cache that intersect the range
459 * [start, end]. Existing extents are split as required. 460 * [start, end]. Existing extents are split as required.
460 */ 461 */
461int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 462void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
462 int skip_pinned) 463 int skip_pinned)
463{ 464{
464 struct extent_map *em; 465 struct extent_map *em;
465 struct extent_map *split = NULL; 466 struct extent_map *split = NULL;
466 struct extent_map *split2 = NULL; 467 struct extent_map *split2 = NULL;
467 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 468 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
468 u64 len = end - start + 1; 469 u64 len = end - start + 1;
470 u64 gen;
469 int ret; 471 int ret;
470 int testend = 1; 472 int testend = 1;
471 unsigned long flags; 473 unsigned long flags;
@@ -477,11 +479,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
477 testend = 0; 479 testend = 0;
478 } 480 }
479 while (1) { 481 while (1) {
482 int no_splits = 0;
483
480 if (!split) 484 if (!split)
481 split = alloc_extent_map(); 485 split = alloc_extent_map();
482 if (!split2) 486 if (!split2)
483 split2 = alloc_extent_map(); 487 split2 = alloc_extent_map();
484 BUG_ON(!split || !split2); /* -ENOMEM */ 488 if (!split || !split2)
489 no_splits = 1;
485 490
486 write_lock(&em_tree->lock); 491 write_lock(&em_tree->lock);
487 em = lookup_extent_mapping(em_tree, start, len); 492 em = lookup_extent_mapping(em_tree, start, len);
@@ -490,6 +495,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
490 break; 495 break;
491 } 496 }
492 flags = em->flags; 497 flags = em->flags;
498 gen = em->generation;
493 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { 499 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
494 if (testend && em->start + em->len >= start + len) { 500 if (testend && em->start + em->len >= start + len) {
495 free_extent_map(em); 501 free_extent_map(em);
@@ -506,6 +512,8 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
506 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 512 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
507 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 513 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
508 remove_extent_mapping(em_tree, em); 514 remove_extent_mapping(em_tree, em);
515 if (no_splits)
516 goto next;
509 517
510 if (em->block_start < EXTENT_MAP_LAST_BYTE && 518 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
511 em->start < start) { 519 em->start < start) {
@@ -518,12 +526,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
518 split->block_len = em->block_len; 526 split->block_len = em->block_len;
519 else 527 else
520 split->block_len = split->len; 528 split->block_len = split->len;
521 529 split->generation = gen;
522 split->bdev = em->bdev; 530 split->bdev = em->bdev;
523 split->flags = flags; 531 split->flags = flags;
524 split->compress_type = em->compress_type; 532 split->compress_type = em->compress_type;
525 ret = add_extent_mapping(em_tree, split); 533 ret = add_extent_mapping(em_tree, split);
526 BUG_ON(ret); /* Logic error */ 534 BUG_ON(ret); /* Logic error */
535 list_move(&split->list, &em_tree->modified_extents);
527 free_extent_map(split); 536 free_extent_map(split);
528 split = split2; 537 split = split2;
529 split2 = NULL; 538 split2 = NULL;
@@ -537,6 +546,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
537 split->bdev = em->bdev; 546 split->bdev = em->bdev;
538 split->flags = flags; 547 split->flags = flags;
539 split->compress_type = em->compress_type; 548 split->compress_type = em->compress_type;
549 split->generation = gen;
540 550
541 if (compressed) { 551 if (compressed) {
542 split->block_len = em->block_len; 552 split->block_len = em->block_len;
@@ -550,9 +560,11 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
550 560
551 ret = add_extent_mapping(em_tree, split); 561 ret = add_extent_mapping(em_tree, split);
552 BUG_ON(ret); /* Logic error */ 562 BUG_ON(ret); /* Logic error */
563 list_move(&split->list, &em_tree->modified_extents);
553 free_extent_map(split); 564 free_extent_map(split);
554 split = NULL; 565 split = NULL;
555 } 566 }
567next:
556 write_unlock(&em_tree->lock); 568 write_unlock(&em_tree->lock);
557 569
558 /* once for us */ 570 /* once for us */
@@ -564,7 +576,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
564 free_extent_map(split); 576 free_extent_map(split);
565 if (split2) 577 if (split2)
566 free_extent_map(split2); 578 free_extent_map(split2);
567 return 0;
568} 579}
569 580
570/* 581/*
@@ -576,13 +587,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
576 * it is either truncated or split. Anything entirely inside the range 587 * it is either truncated or split. Anything entirely inside the range
577 * is deleted from the tree. 588 * is deleted from the tree.
578 */ 589 */
579int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, 590int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
580 u64 start, u64 end, u64 *hint_byte, int drop_cache) 591 struct btrfs_root *root, struct inode *inode,
592 struct btrfs_path *path, u64 start, u64 end,
593 u64 *drop_end, int drop_cache)
581{ 594{
582 struct btrfs_root *root = BTRFS_I(inode)->root;
583 struct extent_buffer *leaf; 595 struct extent_buffer *leaf;
584 struct btrfs_file_extent_item *fi; 596 struct btrfs_file_extent_item *fi;
585 struct btrfs_path *path;
586 struct btrfs_key key; 597 struct btrfs_key key;
587 struct btrfs_key new_key; 598 struct btrfs_key new_key;
588 u64 ino = btrfs_ino(inode); 599 u64 ino = btrfs_ino(inode);
@@ -597,14 +608,12 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
597 int recow; 608 int recow;
598 int ret; 609 int ret;
599 int modify_tree = -1; 610 int modify_tree = -1;
611 int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
612 int found = 0;
600 613
601 if (drop_cache) 614 if (drop_cache)
602 btrfs_drop_extent_cache(inode, start, end - 1, 0); 615 btrfs_drop_extent_cache(inode, start, end - 1, 0);
603 616
604 path = btrfs_alloc_path();
605 if (!path)
606 return -ENOMEM;
607
608 if (start >= BTRFS_I(inode)->disk_i_size) 617 if (start >= BTRFS_I(inode)->disk_i_size)
609 modify_tree = 0; 618 modify_tree = 0;
610 619
@@ -666,6 +675,7 @@ next_slot:
666 goto next_slot; 675 goto next_slot;
667 } 676 }
668 677
678 found = 1;
669 search_start = max(key.offset, start); 679 search_start = max(key.offset, start);
670 if (recow || !modify_tree) { 680 if (recow || !modify_tree) {
671 modify_tree = -1; 681 modify_tree = -1;
@@ -707,14 +717,13 @@ next_slot:
707 extent_end - start); 717 extent_end - start);
708 btrfs_mark_buffer_dirty(leaf); 718 btrfs_mark_buffer_dirty(leaf);
709 719
710 if (disk_bytenr > 0) { 720 if (update_refs && disk_bytenr > 0) {
711 ret = btrfs_inc_extent_ref(trans, root, 721 ret = btrfs_inc_extent_ref(trans, root,
712 disk_bytenr, num_bytes, 0, 722 disk_bytenr, num_bytes, 0,
713 root->root_key.objectid, 723 root->root_key.objectid,
714 new_key.objectid, 724 new_key.objectid,
715 start - extent_offset, 0); 725 start - extent_offset, 0);
716 BUG_ON(ret); /* -ENOMEM */ 726 BUG_ON(ret); /* -ENOMEM */
717 *hint_byte = disk_bytenr;
718 } 727 }
719 key.offset = start; 728 key.offset = start;
720 } 729 }
@@ -734,10 +743,8 @@ next_slot:
734 btrfs_set_file_extent_num_bytes(leaf, fi, 743 btrfs_set_file_extent_num_bytes(leaf, fi,
735 extent_end - end); 744 extent_end - end);
736 btrfs_mark_buffer_dirty(leaf); 745 btrfs_mark_buffer_dirty(leaf);
737 if (disk_bytenr > 0) { 746 if (update_refs && disk_bytenr > 0)
738 inode_sub_bytes(inode, end - key.offset); 747 inode_sub_bytes(inode, end - key.offset);
739 *hint_byte = disk_bytenr;
740 }
741 break; 748 break;
742 } 749 }
743 750
@@ -753,10 +760,8 @@ next_slot:
753 btrfs_set_file_extent_num_bytes(leaf, fi, 760 btrfs_set_file_extent_num_bytes(leaf, fi,
754 start - key.offset); 761 start - key.offset);
755 btrfs_mark_buffer_dirty(leaf); 762 btrfs_mark_buffer_dirty(leaf);
756 if (disk_bytenr > 0) { 763 if (update_refs && disk_bytenr > 0)
757 inode_sub_bytes(inode, extent_end - start); 764 inode_sub_bytes(inode, extent_end - start);
758 *hint_byte = disk_bytenr;
759 }
760 if (end == extent_end) 765 if (end == extent_end)
761 break; 766 break;
762 767
@@ -777,12 +782,13 @@ next_slot:
777 del_nr++; 782 del_nr++;
778 } 783 }
779 784
780 if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 785 if (update_refs &&
786 extent_type == BTRFS_FILE_EXTENT_INLINE) {
781 inode_sub_bytes(inode, 787 inode_sub_bytes(inode,
782 extent_end - key.offset); 788 extent_end - key.offset);
783 extent_end = ALIGN(extent_end, 789 extent_end = ALIGN(extent_end,
784 root->sectorsize); 790 root->sectorsize);
785 } else if (disk_bytenr > 0) { 791 } else if (update_refs && disk_bytenr > 0) {
786 ret = btrfs_free_extent(trans, root, 792 ret = btrfs_free_extent(trans, root,
787 disk_bytenr, num_bytes, 0, 793 disk_bytenr, num_bytes, 0,
788 root->root_key.objectid, 794 root->root_key.objectid,
@@ -791,7 +797,6 @@ next_slot:
791 BUG_ON(ret); /* -ENOMEM */ 797 BUG_ON(ret); /* -ENOMEM */
792 inode_sub_bytes(inode, 798 inode_sub_bytes(inode,
793 extent_end - key.offset); 799 extent_end - key.offset);
794 *hint_byte = disk_bytenr;
795 } 800 }
796 801
797 if (end == extent_end) 802 if (end == extent_end)
@@ -806,7 +811,7 @@ next_slot:
806 del_nr); 811 del_nr);
807 if (ret) { 812 if (ret) {
808 btrfs_abort_transaction(trans, root, ret); 813 btrfs_abort_transaction(trans, root, ret);
809 goto out; 814 break;
810 } 815 }
811 816
812 del_nr = 0; 817 del_nr = 0;
@@ -825,7 +830,24 @@ next_slot:
825 btrfs_abort_transaction(trans, root, ret); 830 btrfs_abort_transaction(trans, root, ret);
826 } 831 }
827 832
828out: 833 if (drop_end)
834 *drop_end = found ? min(end, extent_end) : end;
835 btrfs_release_path(path);
836 return ret;
837}
838
839int btrfs_drop_extents(struct btrfs_trans_handle *trans,
840 struct btrfs_root *root, struct inode *inode, u64 start,
841 u64 end, int drop_cache)
842{
843 struct btrfs_path *path;
844 int ret;
845
846 path = btrfs_alloc_path();
847 if (!path)
848 return -ENOMEM;
849 ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
850 drop_cache);
829 btrfs_free_path(path); 851 btrfs_free_path(path);
830 return ret; 852 return ret;
831} 853}
@@ -892,8 +914,6 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
892 int ret; 914 int ret;
893 u64 ino = btrfs_ino(inode); 915 u64 ino = btrfs_ino(inode);
894 916
895 btrfs_drop_extent_cache(inode, start, end - 1, 0);
896
897 path = btrfs_alloc_path(); 917 path = btrfs_alloc_path();
898 if (!path) 918 if (!path)
899 return -ENOMEM; 919 return -ENOMEM;
@@ -935,12 +955,16 @@ again:
935 btrfs_set_item_key_safe(trans, root, path, &new_key); 955 btrfs_set_item_key_safe(trans, root, path, &new_key);
936 fi = btrfs_item_ptr(leaf, path->slots[0], 956 fi = btrfs_item_ptr(leaf, path->slots[0],
937 struct btrfs_file_extent_item); 957 struct btrfs_file_extent_item);
958 btrfs_set_file_extent_generation(leaf, fi,
959 trans->transid);
938 btrfs_set_file_extent_num_bytes(leaf, fi, 960 btrfs_set_file_extent_num_bytes(leaf, fi,
939 extent_end - end); 961 extent_end - end);
940 btrfs_set_file_extent_offset(leaf, fi, 962 btrfs_set_file_extent_offset(leaf, fi,
941 end - orig_offset); 963 end - orig_offset);
942 fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 964 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
943 struct btrfs_file_extent_item); 965 struct btrfs_file_extent_item);
966 btrfs_set_file_extent_generation(leaf, fi,
967 trans->transid);
944 btrfs_set_file_extent_num_bytes(leaf, fi, 968 btrfs_set_file_extent_num_bytes(leaf, fi,
945 end - other_start); 969 end - other_start);
946 btrfs_mark_buffer_dirty(leaf); 970 btrfs_mark_buffer_dirty(leaf);
@@ -958,12 +982,16 @@ again:
958 struct btrfs_file_extent_item); 982 struct btrfs_file_extent_item);
959 btrfs_set_file_extent_num_bytes(leaf, fi, 983 btrfs_set_file_extent_num_bytes(leaf, fi,
960 start - key.offset); 984 start - key.offset);
985 btrfs_set_file_extent_generation(leaf, fi,
986 trans->transid);
961 path->slots[0]++; 987 path->slots[0]++;
962 new_key.offset = start; 988 new_key.offset = start;
963 btrfs_set_item_key_safe(trans, root, path, &new_key); 989 btrfs_set_item_key_safe(trans, root, path, &new_key);
964 990
965 fi = btrfs_item_ptr(leaf, path->slots[0], 991 fi = btrfs_item_ptr(leaf, path->slots[0],
966 struct btrfs_file_extent_item); 992 struct btrfs_file_extent_item);
993 btrfs_set_file_extent_generation(leaf, fi,
994 trans->transid);
967 btrfs_set_file_extent_num_bytes(leaf, fi, 995 btrfs_set_file_extent_num_bytes(leaf, fi,
968 other_end - start); 996 other_end - start);
969 btrfs_set_file_extent_offset(leaf, fi, 997 btrfs_set_file_extent_offset(leaf, fi,
@@ -991,12 +1019,14 @@ again:
991 leaf = path->nodes[0]; 1019 leaf = path->nodes[0];
992 fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 1020 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
993 struct btrfs_file_extent_item); 1021 struct btrfs_file_extent_item);
1022 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
994 btrfs_set_file_extent_num_bytes(leaf, fi, 1023 btrfs_set_file_extent_num_bytes(leaf, fi,
995 split - key.offset); 1024 split - key.offset);
996 1025
997 fi = btrfs_item_ptr(leaf, path->slots[0], 1026 fi = btrfs_item_ptr(leaf, path->slots[0],
998 struct btrfs_file_extent_item); 1027 struct btrfs_file_extent_item);
999 1028
1029 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1000 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); 1030 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
1001 btrfs_set_file_extent_num_bytes(leaf, fi, 1031 btrfs_set_file_extent_num_bytes(leaf, fi,
1002 extent_end - split); 1032 extent_end - split);
@@ -1056,12 +1086,14 @@ again:
1056 struct btrfs_file_extent_item); 1086 struct btrfs_file_extent_item);
1057 btrfs_set_file_extent_type(leaf, fi, 1087 btrfs_set_file_extent_type(leaf, fi,
1058 BTRFS_FILE_EXTENT_REG); 1088 BTRFS_FILE_EXTENT_REG);
1089 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1059 btrfs_mark_buffer_dirty(leaf); 1090 btrfs_mark_buffer_dirty(leaf);
1060 } else { 1091 } else {
1061 fi = btrfs_item_ptr(leaf, del_slot - 1, 1092 fi = btrfs_item_ptr(leaf, del_slot - 1,
1062 struct btrfs_file_extent_item); 1093 struct btrfs_file_extent_item);
1063 btrfs_set_file_extent_type(leaf, fi, 1094 btrfs_set_file_extent_type(leaf, fi,
1064 BTRFS_FILE_EXTENT_REG); 1095 BTRFS_FILE_EXTENT_REG);
1096 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1065 btrfs_set_file_extent_num_bytes(leaf, fi, 1097 btrfs_set_file_extent_num_bytes(leaf, fi,
1066 extent_end - key.offset); 1098 extent_end - key.offset);
1067 btrfs_mark_buffer_dirty(leaf); 1099 btrfs_mark_buffer_dirty(leaf);
@@ -1173,8 +1205,8 @@ again:
1173 1205
1174 clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, 1206 clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
1175 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | 1207 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
1176 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, 1208 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
1177 GFP_NOFS); 1209 0, 0, &cached_state, GFP_NOFS);
1178 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1210 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1179 start_pos, last_pos - 1, &cached_state, 1211 start_pos, last_pos - 1, &cached_state,
1180 GFP_NOFS); 1212 GFP_NOFS);
@@ -1514,16 +1546,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1514 1546
1515 trace_btrfs_sync_file(file, datasync); 1547 trace_btrfs_sync_file(file, datasync);
1516 1548
1549 /*
1550 * We write the dirty pages in the range and wait until they complete
1551 * out of the ->i_mutex. If so, we can flush the dirty pages by
1552 * multi-task, and make the performance up.
1553 */
1554 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1555 if (ret)
1556 return ret;
1557
1517 mutex_lock(&inode->i_mutex); 1558 mutex_lock(&inode->i_mutex);
1518 1559
1519 /* 1560 /*
1520 * we wait first, since the writeback may change the inode, also wait 1561 * We flush the dirty pages again to avoid some dirty pages in the
1521 * ordered range does a filemape_write_and_wait_range which is why we 1562 * range being left.
1522 * don't do it above like other file systems.
1523 */ 1563 */
1524 root->log_batch++; 1564 atomic_inc(&root->log_batch);
1525 btrfs_wait_ordered_range(inode, start, end); 1565 btrfs_wait_ordered_range(inode, start, end);
1526 root->log_batch++; 1566 atomic_inc(&root->log_batch);
1527 1567
1528 /* 1568 /*
1529 * check the transaction that last modified this inode 1569 * check the transaction that last modified this inode
@@ -1544,6 +1584,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1544 BTRFS_I(inode)->last_trans <= 1584 BTRFS_I(inode)->last_trans <=
1545 root->fs_info->last_trans_committed) { 1585 root->fs_info->last_trans_committed) {
1546 BTRFS_I(inode)->last_trans = 0; 1586 BTRFS_I(inode)->last_trans = 0;
1587
1588 /*
1589 * We'v had everything committed since the last time we were
1590 * modified so clear this flag in case it was set for whatever
1591 * reason, it's no longer relevant.
1592 */
1593 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1594 &BTRFS_I(inode)->runtime_flags);
1547 mutex_unlock(&inode->i_mutex); 1595 mutex_unlock(&inode->i_mutex);
1548 goto out; 1596 goto out;
1549 } 1597 }
@@ -1599,6 +1647,7 @@ out:
1599static const struct vm_operations_struct btrfs_file_vm_ops = { 1647static const struct vm_operations_struct btrfs_file_vm_ops = {
1600 .fault = filemap_fault, 1648 .fault = filemap_fault,
1601 .page_mkwrite = btrfs_page_mkwrite, 1649 .page_mkwrite = btrfs_page_mkwrite,
1650 .remap_pages = generic_file_remap_pages,
1602}; 1651};
1603 1652
1604static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) 1653static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
@@ -1610,11 +1659,328 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1610 1659
1611 file_accessed(filp); 1660 file_accessed(filp);
1612 vma->vm_ops = &btrfs_file_vm_ops; 1661 vma->vm_ops = &btrfs_file_vm_ops;
1613 vma->vm_flags |= VM_CAN_NONLINEAR;
1614 1662
1615 return 0; 1663 return 0;
1616} 1664}
1617 1665
1666static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
1667 int slot, u64 start, u64 end)
1668{
1669 struct btrfs_file_extent_item *fi;
1670 struct btrfs_key key;
1671
1672 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
1673 return 0;
1674
1675 btrfs_item_key_to_cpu(leaf, &key, slot);
1676 if (key.objectid != btrfs_ino(inode) ||
1677 key.type != BTRFS_EXTENT_DATA_KEY)
1678 return 0;
1679
1680 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
1681
1682 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
1683 return 0;
1684
1685 if (btrfs_file_extent_disk_bytenr(leaf, fi))
1686 return 0;
1687
1688 if (key.offset == end)
1689 return 1;
1690 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
1691 return 1;
1692 return 0;
1693}
1694
1695static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
1696 struct btrfs_path *path, u64 offset, u64 end)
1697{
1698 struct btrfs_root *root = BTRFS_I(inode)->root;
1699 struct extent_buffer *leaf;
1700 struct btrfs_file_extent_item *fi;
1701 struct extent_map *hole_em;
1702 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1703 struct btrfs_key key;
1704 int ret;
1705
1706 key.objectid = btrfs_ino(inode);
1707 key.type = BTRFS_EXTENT_DATA_KEY;
1708 key.offset = offset;
1709
1710
1711 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1712 if (ret < 0)
1713 return ret;
1714 BUG_ON(!ret);
1715
1716 leaf = path->nodes[0];
1717 if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
1718 u64 num_bytes;
1719
1720 path->slots[0]--;
1721 fi = btrfs_item_ptr(leaf, path->slots[0],
1722 struct btrfs_file_extent_item);
1723 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
1724 end - offset;
1725 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1726 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
1727 btrfs_set_file_extent_offset(leaf, fi, 0);
1728 btrfs_mark_buffer_dirty(leaf);
1729 goto out;
1730 }
1731
1732 if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
1733 u64 num_bytes;
1734
1735 path->slots[0]++;
1736 key.offset = offset;
1737 btrfs_set_item_key_safe(trans, root, path, &key);
1738 fi = btrfs_item_ptr(leaf, path->slots[0],
1739 struct btrfs_file_extent_item);
1740 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
1741 offset;
1742 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1743 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
1744 btrfs_set_file_extent_offset(leaf, fi, 0);
1745 btrfs_mark_buffer_dirty(leaf);
1746 goto out;
1747 }
1748 btrfs_release_path(path);
1749
1750 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
1751 0, 0, end - offset, 0, end - offset,
1752 0, 0, 0);
1753 if (ret)
1754 return ret;
1755
1756out:
1757 btrfs_release_path(path);
1758
1759 hole_em = alloc_extent_map();
1760 if (!hole_em) {
1761 btrfs_drop_extent_cache(inode, offset, end - 1, 0);
1762 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1763 &BTRFS_I(inode)->runtime_flags);
1764 } else {
1765 hole_em->start = offset;
1766 hole_em->len = end - offset;
1767 hole_em->orig_start = offset;
1768
1769 hole_em->block_start = EXTENT_MAP_HOLE;
1770 hole_em->block_len = 0;
1771 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
1772 hole_em->compress_type = BTRFS_COMPRESS_NONE;
1773 hole_em->generation = trans->transid;
1774
1775 do {
1776 btrfs_drop_extent_cache(inode, offset, end - 1, 0);
1777 write_lock(&em_tree->lock);
1778 ret = add_extent_mapping(em_tree, hole_em);
1779 if (!ret)
1780 list_move(&hole_em->list,
1781 &em_tree->modified_extents);
1782 write_unlock(&em_tree->lock);
1783 } while (ret == -EEXIST);
1784 free_extent_map(hole_em);
1785 if (ret)
1786 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1787 &BTRFS_I(inode)->runtime_flags);
1788 }
1789
1790 return 0;
1791}
1792
1793static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
1794{
1795 struct btrfs_root *root = BTRFS_I(inode)->root;
1796 struct extent_state *cached_state = NULL;
1797 struct btrfs_path *path;
1798 struct btrfs_block_rsv *rsv;
1799 struct btrfs_trans_handle *trans;
1800 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1801 u64 lockstart = (offset + mask) & ~mask;
1802 u64 lockend = ((offset + len) & ~mask) - 1;
1803 u64 cur_offset = lockstart;
1804 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
1805 u64 drop_end;
1806 unsigned long nr;
1807 int ret = 0;
1808 int err = 0;
1809 bool same_page = (offset >> PAGE_CACHE_SHIFT) ==
1810 ((offset + len) >> PAGE_CACHE_SHIFT);
1811
1812 btrfs_wait_ordered_range(inode, offset, len);
1813
1814 mutex_lock(&inode->i_mutex);
1815 if (offset >= inode->i_size) {
1816 mutex_unlock(&inode->i_mutex);
1817 return 0;
1818 }
1819
1820 /*
1821 * Only do this if we are in the same page and we aren't doing the
1822 * entire page.
1823 */
1824 if (same_page && len < PAGE_CACHE_SIZE) {
1825 ret = btrfs_truncate_page(inode, offset, len, 0);
1826 mutex_unlock(&inode->i_mutex);
1827 return ret;
1828 }
1829
1830 /* zero back part of the first page */
1831 ret = btrfs_truncate_page(inode, offset, 0, 0);
1832 if (ret) {
1833 mutex_unlock(&inode->i_mutex);
1834 return ret;
1835 }
1836
1837 /* zero the front end of the last page */
1838 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
1839 if (ret) {
1840 mutex_unlock(&inode->i_mutex);
1841 return ret;
1842 }
1843
1844 if (lockend < lockstart) {
1845 mutex_unlock(&inode->i_mutex);
1846 return 0;
1847 }
1848
1849 while (1) {
1850 struct btrfs_ordered_extent *ordered;
1851
1852 truncate_pagecache_range(inode, lockstart, lockend);
1853
1854 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
1855 0, &cached_state);
1856 ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
1857
1858 /*
1859 * We need to make sure we have no ordered extents in this range
1860 * and nobody raced in and read a page in this range, if we did
1861 * we need to try again.
1862 */
1863 if ((!ordered ||
1864 (ordered->file_offset + ordered->len < lockstart ||
1865 ordered->file_offset > lockend)) &&
1866 !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
1867 lockend, EXTENT_UPTODATE, 0,
1868 cached_state)) {
1869 if (ordered)
1870 btrfs_put_ordered_extent(ordered);
1871 break;
1872 }
1873 if (ordered)
1874 btrfs_put_ordered_extent(ordered);
1875 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
1876 lockend, &cached_state, GFP_NOFS);
1877 btrfs_wait_ordered_range(inode, lockstart,
1878 lockend - lockstart + 1);
1879 }
1880
1881 path = btrfs_alloc_path();
1882 if (!path) {
1883 ret = -ENOMEM;
1884 goto out;
1885 }
1886
1887 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
1888 if (!rsv) {
1889 ret = -ENOMEM;
1890 goto out_free;
1891 }
1892 rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
1893 rsv->failfast = 1;
1894
1895 /*
1896 * 1 - update the inode
1897 * 1 - removing the extents in the range
1898 * 1 - adding the hole extent
1899 */
1900 trans = btrfs_start_transaction(root, 3);
1901 if (IS_ERR(trans)) {
1902 err = PTR_ERR(trans);
1903 goto out_free;
1904 }
1905
1906 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
1907 min_size);
1908 BUG_ON(ret);
1909 trans->block_rsv = rsv;
1910
1911 while (cur_offset < lockend) {
1912 ret = __btrfs_drop_extents(trans, root, inode, path,
1913 cur_offset, lockend + 1,
1914 &drop_end, 1);
1915 if (ret != -ENOSPC)
1916 break;
1917
1918 trans->block_rsv = &root->fs_info->trans_block_rsv;
1919
1920 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
1921 if (ret) {
1922 err = ret;
1923 break;
1924 }
1925
1926 cur_offset = drop_end;
1927
1928 ret = btrfs_update_inode(trans, root, inode);
1929 if (ret) {
1930 err = ret;
1931 break;
1932 }
1933
1934 nr = trans->blocks_used;
1935 btrfs_end_transaction(trans, root);
1936 btrfs_btree_balance_dirty(root, nr);
1937
1938 trans = btrfs_start_transaction(root, 3);
1939 if (IS_ERR(trans)) {
1940 ret = PTR_ERR(trans);
1941 trans = NULL;
1942 break;
1943 }
1944
1945 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
1946 rsv, min_size);
1947 BUG_ON(ret); /* shouldn't happen */
1948 trans->block_rsv = rsv;
1949 }
1950
1951 if (ret) {
1952 err = ret;
1953 goto out_trans;
1954 }
1955
1956 trans->block_rsv = &root->fs_info->trans_block_rsv;
1957 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
1958 if (ret) {
1959 err = ret;
1960 goto out_trans;
1961 }
1962
1963out_trans:
1964 if (!trans)
1965 goto out_free;
1966
1967 trans->block_rsv = &root->fs_info->trans_block_rsv;
1968 ret = btrfs_update_inode(trans, root, inode);
1969 nr = trans->blocks_used;
1970 btrfs_end_transaction(trans, root);
1971 btrfs_btree_balance_dirty(root, nr);
1972out_free:
1973 btrfs_free_path(path);
1974 btrfs_free_block_rsv(root, rsv);
1975out:
1976 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
1977 &cached_state, GFP_NOFS);
1978 mutex_unlock(&inode->i_mutex);
1979 if (ret && !err)
1980 err = ret;
1981 return err;
1982}
1983
1618static long btrfs_fallocate(struct file *file, int mode, 1984static long btrfs_fallocate(struct file *file, int mode,
1619 loff_t offset, loff_t len) 1985 loff_t offset, loff_t len)
1620{ 1986{
@@ -1633,15 +1999,18 @@ static long btrfs_fallocate(struct file *file, int mode,
1633 alloc_start = offset & ~mask; 1999 alloc_start = offset & ~mask;
1634 alloc_end = (offset + len + mask) & ~mask; 2000 alloc_end = (offset + len + mask) & ~mask;
1635 2001
1636 /* We only support the FALLOC_FL_KEEP_SIZE mode */ 2002 /* Make sure we aren't being give some crap mode */
1637 if (mode & ~FALLOC_FL_KEEP_SIZE) 2003 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
1638 return -EOPNOTSUPP; 2004 return -EOPNOTSUPP;
1639 2005
2006 if (mode & FALLOC_FL_PUNCH_HOLE)
2007 return btrfs_punch_hole(inode, offset, len);
2008
1640 /* 2009 /*
1641 * Make sure we have enough space before we do the 2010 * Make sure we have enough space before we do the
1642 * allocation. 2011 * allocation.
1643 */ 2012 */
1644 ret = btrfs_check_data_free_space(inode, len); 2013 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1);
1645 if (ret) 2014 if (ret)
1646 return ret; 2015 return ret;
1647 2016
@@ -1748,7 +2117,7 @@ static long btrfs_fallocate(struct file *file, int mode,
1748out: 2117out:
1749 mutex_unlock(&inode->i_mutex); 2118 mutex_unlock(&inode->i_mutex);
1750 /* Let go of our reservation. */ 2119 /* Let go of our reservation. */
1751 btrfs_free_reserved_data_space(inode, len); 2120 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1);
1752 return ret; 2121 return ret;
1753} 2122}
1754 2123
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6b10acfc2f5c..1027b854b90c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -966,7 +966,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
966 block_group->key.offset)) { 966 block_group->key.offset)) {
967 ret = find_first_extent_bit(unpin, start, 967 ret = find_first_extent_bit(unpin, start,
968 &extent_start, &extent_end, 968 &extent_start, &extent_end,
969 EXTENT_DIRTY); 969 EXTENT_DIRTY, NULL);
970 if (ret) { 970 if (ret) {
971 ret = 0; 971 ret = 0;
972 break; 972 break;
@@ -1454,9 +1454,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1454 max_t(u64, *offset, bitmap_info->offset)); 1454 max_t(u64, *offset, bitmap_info->offset));
1455 bits = bytes_to_bits(*bytes, ctl->unit); 1455 bits = bytes_to_bits(*bytes, ctl->unit);
1456 1456
1457 for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); 1457 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1458 i < BITS_PER_BITMAP;
1459 i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
1460 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1458 next_zero = find_next_zero_bit(bitmap_info->bitmap,
1461 BITS_PER_BITMAP, i); 1459 BITS_PER_BITMAP, i);
1462 if ((next_zero - i) >= bits) { 1460 if ((next_zero - i) >= bits) {
@@ -2307,9 +2305,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2307 2305
2308again: 2306again:
2309 found_bits = 0; 2307 found_bits = 0;
2310 for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i); 2308 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
2311 i < BITS_PER_BITMAP;
2312 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
2313 next_zero = find_next_zero_bit(entry->bitmap, 2309 next_zero = find_next_zero_bit(entry->bitmap,
2314 BITS_PER_BITMAP, i); 2310 BITS_PER_BITMAP, i);
2315 if (next_zero - i >= min_bits) { 2311 if (next_zero - i >= min_bits) {
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index db2ff9773b99..1d982812ab67 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -24,4 +24,14 @@ static inline u64 btrfs_name_hash(const char *name, int len)
24{ 24{
25 return crc32c((u32)~1, name, len); 25 return crc32c((u32)~1, name, len);
26} 26}
27
28/*
29 * Figure the key offset of an extended inode ref
30 */
31static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
32 int len)
33{
34 return (u64) crc32c(parent_objectid, name, len);
35}
36
27#endif 37#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index a13cf1a96c73..48b8fda93132 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -18,6 +18,7 @@
18 18
19#include "ctree.h" 19#include "ctree.h"
20#include "disk-io.h" 20#include "disk-io.h"
21#include "hash.h"
21#include "transaction.h" 22#include "transaction.h"
22#include "print-tree.h" 23#include "print-tree.h"
23 24
@@ -50,18 +51,57 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
50 return 0; 51 return 0;
51} 52}
52 53
53struct btrfs_inode_ref * 54int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid,
55 const char *name, int name_len,
56 struct btrfs_inode_extref **extref_ret)
57{
58 struct extent_buffer *leaf;
59 struct btrfs_inode_extref *extref;
60 unsigned long ptr;
61 unsigned long name_ptr;
62 u32 item_size;
63 u32 cur_offset = 0;
64 int ref_name_len;
65
66 leaf = path->nodes[0];
67 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
68 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
69
70 /*
71 * Search all extended backrefs in this item. We're only
72 * looking through any collisions so most of the time this is
73 * just going to compare against one buffer. If all is well,
74 * we'll return success and the inode ref object.
75 */
76 while (cur_offset < item_size) {
77 extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
78 name_ptr = (unsigned long)(&extref->name);
79 ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
80
81 if (ref_name_len == name_len &&
82 btrfs_inode_extref_parent(leaf, extref) == ref_objectid &&
83 (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) {
84 if (extref_ret)
85 *extref_ret = extref;
86 return 1;
87 }
88
89 cur_offset += ref_name_len + sizeof(*extref);
90 }
91 return 0;
92}
93
94static struct btrfs_inode_ref *
54btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, 95btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
55 struct btrfs_root *root, 96 struct btrfs_root *root,
56 struct btrfs_path *path, 97 struct btrfs_path *path,
57 const char *name, int name_len, 98 const char *name, int name_len,
58 u64 inode_objectid, u64 ref_objectid, int mod) 99 u64 inode_objectid, u64 ref_objectid, int ins_len,
100 int cow)
59{ 101{
102 int ret;
60 struct btrfs_key key; 103 struct btrfs_key key;
61 struct btrfs_inode_ref *ref; 104 struct btrfs_inode_ref *ref;
62 int ins_len = mod < 0 ? -1 : 0;
63 int cow = mod != 0;
64 int ret;
65 105
66 key.objectid = inode_objectid; 106 key.objectid = inode_objectid;
67 key.type = BTRFS_INODE_REF_KEY; 107 key.type = BTRFS_INODE_REF_KEY;
@@ -77,13 +117,150 @@ btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
77 return ref; 117 return ref;
78} 118}
79 119
80int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, 120/* Returns NULL if no extref found */
121struct btrfs_inode_extref *
122btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
123 struct btrfs_root *root,
124 struct btrfs_path *path,
125 const char *name, int name_len,
126 u64 inode_objectid, u64 ref_objectid, int ins_len,
127 int cow)
128{
129 int ret;
130 struct btrfs_key key;
131 struct btrfs_inode_extref *extref;
132
133 key.objectid = inode_objectid;
134 key.type = BTRFS_INODE_EXTREF_KEY;
135 key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
136
137 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
138 if (ret < 0)
139 return ERR_PTR(ret);
140 if (ret > 0)
141 return NULL;
142 if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref))
143 return NULL;
144 return extref;
145}
146
147int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
148 struct btrfs_root *root,
149 struct btrfs_path *path,
150 const char *name, int name_len,
151 u64 inode_objectid, u64 ref_objectid, int mod,
152 u64 *ret_index)
153{
154 struct btrfs_inode_ref *ref;
155 struct btrfs_inode_extref *extref;
156 int ins_len = mod < 0 ? -1 : 0;
157 int cow = mod != 0;
158
159 ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len,
160 inode_objectid, ref_objectid, ins_len,
161 cow);
162 if (IS_ERR(ref))
163 return PTR_ERR(ref);
164
165 if (ref != NULL) {
166 *ret_index = btrfs_inode_ref_index(path->nodes[0], ref);
167 return 0;
168 }
169
170 btrfs_release_path(path);
171
172 extref = btrfs_lookup_inode_extref(trans, root, path, name,
173 name_len, inode_objectid,
174 ref_objectid, ins_len, cow);
175 if (IS_ERR(extref))
176 return PTR_ERR(extref);
177
178 if (extref) {
179 *ret_index = btrfs_inode_extref_index(path->nodes[0], extref);
180 return 0;
181 }
182
183 return -ENOENT;
184}
185
186int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
81 struct btrfs_root *root, 187 struct btrfs_root *root,
82 const char *name, int name_len, 188 const char *name, int name_len,
83 u64 inode_objectid, u64 ref_objectid, u64 *index) 189 u64 inode_objectid, u64 ref_objectid, u64 *index)
84{ 190{
85 struct btrfs_path *path; 191 struct btrfs_path *path;
86 struct btrfs_key key; 192 struct btrfs_key key;
193 struct btrfs_inode_extref *extref;
194 struct extent_buffer *leaf;
195 int ret;
196 int del_len = name_len + sizeof(*extref);
197 unsigned long ptr;
198 unsigned long item_start;
199 u32 item_size;
200
201 key.objectid = inode_objectid;
202 btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
203 key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
204
205 path = btrfs_alloc_path();
206 if (!path)
207 return -ENOMEM;
208
209 path->leave_spinning = 1;
210
211 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
212 if (ret > 0)
213 ret = -ENOENT;
214 if (ret < 0)
215 goto out;
216
217 /*
218 * Sanity check - did we find the right item for this name?
219 * This should always succeed so error here will make the FS
220 * readonly.
221 */
222 if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
223 name, name_len, &extref)) {
224 btrfs_std_error(root->fs_info, -ENOENT);
225 ret = -EROFS;
226 goto out;
227 }
228
229 leaf = path->nodes[0];
230 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
231 if (index)
232 *index = btrfs_inode_extref_index(leaf, extref);
233
234 if (del_len == item_size) {
235 /*
236 * Common case only one ref in the item, remove the
237 * whole item.
238 */
239 ret = btrfs_del_item(trans, root, path);
240 goto out;
241 }
242
243 ptr = (unsigned long)extref;
244 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
245
246 memmove_extent_buffer(leaf, ptr, ptr + del_len,
247 item_size - (ptr + del_len - item_start));
248
249 btrfs_truncate_item(trans, root, path, item_size - del_len, 1);
250
251out:
252 btrfs_free_path(path);
253
254 return ret;
255}
256
257int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
258 struct btrfs_root *root,
259 const char *name, int name_len,
260 u64 inode_objectid, u64 ref_objectid, u64 *index)
261{
262 struct btrfs_path *path;
263 struct btrfs_key key;
87 struct btrfs_inode_ref *ref; 264 struct btrfs_inode_ref *ref;
88 struct extent_buffer *leaf; 265 struct extent_buffer *leaf;
89 unsigned long ptr; 266 unsigned long ptr;
@@ -91,6 +268,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
91 u32 item_size; 268 u32 item_size;
92 u32 sub_item_len; 269 u32 sub_item_len;
93 int ret; 270 int ret;
271 int search_ext_refs = 0;
94 int del_len = name_len + sizeof(*ref); 272 int del_len = name_len + sizeof(*ref);
95 273
96 key.objectid = inode_objectid; 274 key.objectid = inode_objectid;
@@ -106,12 +284,14 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
106 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 284 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
107 if (ret > 0) { 285 if (ret > 0) {
108 ret = -ENOENT; 286 ret = -ENOENT;
287 search_ext_refs = 1;
109 goto out; 288 goto out;
110 } else if (ret < 0) { 289 } else if (ret < 0) {
111 goto out; 290 goto out;
112 } 291 }
113 if (!find_name_in_backref(path, name, name_len, &ref)) { 292 if (!find_name_in_backref(path, name, name_len, &ref)) {
114 ret = -ENOENT; 293 ret = -ENOENT;
294 search_ext_refs = 1;
115 goto out; 295 goto out;
116 } 296 }
117 leaf = path->nodes[0]; 297 leaf = path->nodes[0];
@@ -129,8 +309,78 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
129 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); 309 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
130 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, 310 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
131 item_size - (ptr + sub_item_len - item_start)); 311 item_size - (ptr + sub_item_len - item_start));
132 btrfs_truncate_item(trans, root, path, 312 btrfs_truncate_item(trans, root, path, item_size - sub_item_len, 1);
133 item_size - sub_item_len, 1); 313out:
314 btrfs_free_path(path);
315
316 if (search_ext_refs) {
317 /*
318 * No refs were found, or we could not find the
319 * name in our ref array. Find and remove the extended
320 * inode ref then.
321 */
322 return btrfs_del_inode_extref(trans, root, name, name_len,
323 inode_objectid, ref_objectid, index);
324 }
325
326 return ret;
327}
328
329/*
330 * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree.
331 *
332 * The caller must have checked against BTRFS_LINK_MAX already.
333 */
334static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
335 struct btrfs_root *root,
336 const char *name, int name_len,
337 u64 inode_objectid, u64 ref_objectid, u64 index)
338{
339 struct btrfs_inode_extref *extref;
340 int ret;
341 int ins_len = name_len + sizeof(*extref);
342 unsigned long ptr;
343 struct btrfs_path *path;
344 struct btrfs_key key;
345 struct extent_buffer *leaf;
346 struct btrfs_item *item;
347
348 key.objectid = inode_objectid;
349 key.type = BTRFS_INODE_EXTREF_KEY;
350 key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
351
352 path = btrfs_alloc_path();
353 if (!path)
354 return -ENOMEM;
355
356 path->leave_spinning = 1;
357 ret = btrfs_insert_empty_item(trans, root, path, &key,
358 ins_len);
359 if (ret == -EEXIST) {
360 if (btrfs_find_name_in_ext_backref(path, ref_objectid,
361 name, name_len, NULL))
362 goto out;
363
364 btrfs_extend_item(trans, root, path, ins_len);
365 ret = 0;
366 }
367 if (ret < 0)
368 goto out;
369
370 leaf = path->nodes[0];
371 item = btrfs_item_nr(leaf, path->slots[0]);
372 ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
373 ptr += btrfs_item_size(leaf, item) - ins_len;
374 extref = (struct btrfs_inode_extref *)ptr;
375
376 btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
377 btrfs_set_inode_extref_index(path->nodes[0], extref, index);
378 btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid);
379
380 ptr = (unsigned long)&extref->name;
381 write_extent_buffer(path->nodes[0], name, ptr, name_len);
382 btrfs_mark_buffer_dirty(path->nodes[0]);
383
134out: 384out:
135 btrfs_free_path(path); 385 btrfs_free_path(path);
136 return ret; 386 return ret;
@@ -191,6 +441,19 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
191 441
192out: 442out:
193 btrfs_free_path(path); 443 btrfs_free_path(path);
444
445 if (ret == -EMLINK) {
446 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
447 /* We ran out of space in the ref array. Need to
448 * add an extended ref. */
449 if (btrfs_super_incompat_flags(disk_super)
450 & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
451 ret = btrfs_insert_inode_extref(trans, root, name,
452 name_len,
453 inode_objectid,
454 ref_objectid, index);
455 }
456
194 return ret; 457 return ret;
195} 458}
196 459
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ec154f954646..85a1e5053fe6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -230,7 +230,6 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
230 u64 inline_len = actual_end - start; 230 u64 inline_len = actual_end - start;
231 u64 aligned_end = (end + root->sectorsize - 1) & 231 u64 aligned_end = (end + root->sectorsize - 1) &
232 ~((u64)root->sectorsize - 1); 232 ~((u64)root->sectorsize - 1);
233 u64 hint_byte;
234 u64 data_len = inline_len; 233 u64 data_len = inline_len;
235 int ret; 234 int ret;
236 235
@@ -247,8 +246,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
247 return 1; 246 return 1;
248 } 247 }
249 248
250 ret = btrfs_drop_extents(trans, inode, start, aligned_end, 249 ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
251 &hint_byte, 1);
252 if (ret) 250 if (ret)
253 return ret; 251 return ret;
254 252
@@ -664,7 +662,7 @@ retry:
664 async_extent->compressed_size, 662 async_extent->compressed_size,
665 async_extent->compressed_size, 663 async_extent->compressed_size,
666 0, alloc_hint, &ins, 1); 664 0, alloc_hint, &ins, 1);
667 if (ret) 665 if (ret && ret != -ENOSPC)
668 btrfs_abort_transaction(trans, root, ret); 666 btrfs_abort_transaction(trans, root, ret);
669 btrfs_end_transaction(trans, root); 667 btrfs_end_transaction(trans, root);
670 } 668 }
@@ -1308,6 +1306,7 @@ out_check:
1308 em->block_start = disk_bytenr; 1306 em->block_start = disk_bytenr;
1309 em->bdev = root->fs_info->fs_devices->latest_bdev; 1307 em->bdev = root->fs_info->fs_devices->latest_bdev;
1310 set_bit(EXTENT_FLAG_PINNED, &em->flags); 1308 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1309 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
1311 while (1) { 1310 while (1) {
1312 write_lock(&em_tree->lock); 1311 write_lock(&em_tree->lock);
1313 ret = add_extent_mapping(em_tree, em); 1312 ret = add_extent_mapping(em_tree, em);
@@ -1364,11 +1363,7 @@ out_check:
1364 } 1363 }
1365 1364
1366error: 1365error:
1367 if (nolock) { 1366 err = btrfs_end_transaction(trans, root);
1368 err = btrfs_end_transaction_nolock(trans, root);
1369 } else {
1370 err = btrfs_end_transaction(trans, root);
1371 }
1372 if (!ret) 1367 if (!ret)
1373 ret = err; 1368 ret = err;
1374 1369
@@ -1785,7 +1780,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1785 struct btrfs_path *path; 1780 struct btrfs_path *path;
1786 struct extent_buffer *leaf; 1781 struct extent_buffer *leaf;
1787 struct btrfs_key ins; 1782 struct btrfs_key ins;
1788 u64 hint;
1789 int ret; 1783 int ret;
1790 1784
1791 path = btrfs_alloc_path(); 1785 path = btrfs_alloc_path();
@@ -1803,8 +1797,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1803 * the caller is expected to unpin it and allow it to be merged 1797 * the caller is expected to unpin it and allow it to be merged
1804 * with the others. 1798 * with the others.
1805 */ 1799 */
1806 ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, 1800 ret = btrfs_drop_extents(trans, root, inode, file_pos,
1807 &hint, 0); 1801 file_pos + num_bytes, 0);
1808 if (ret) 1802 if (ret)
1809 goto out; 1803 goto out;
1810 1804
@@ -1828,10 +1822,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1828 btrfs_set_file_extent_encryption(leaf, fi, encryption); 1822 btrfs_set_file_extent_encryption(leaf, fi, encryption);
1829 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); 1823 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1830 1824
1831 btrfs_unlock_up_safe(path, 1);
1832 btrfs_set_lock_blocking(leaf);
1833
1834 btrfs_mark_buffer_dirty(leaf); 1825 btrfs_mark_buffer_dirty(leaf);
1826 btrfs_release_path(path);
1835 1827
1836 inode_add_bytes(inode, num_bytes); 1828 inode_add_bytes(inode, num_bytes);
1837 1829
@@ -1929,11 +1921,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1929 ordered_extent->len, 1921 ordered_extent->len,
1930 compress_type, 0, 0, 1922 compress_type, 0, 0,
1931 BTRFS_FILE_EXTENT_REG); 1923 BTRFS_FILE_EXTENT_REG);
1932 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1933 ordered_extent->file_offset,
1934 ordered_extent->len);
1935 } 1924 }
1936 1925 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1926 ordered_extent->file_offset, ordered_extent->len,
1927 trans->transid);
1937 if (ret < 0) { 1928 if (ret < 0) {
1938 btrfs_abort_transaction(trans, root, ret); 1929 btrfs_abort_transaction(trans, root, ret);
1939 goto out_unlock; 1930 goto out_unlock;
@@ -1949,6 +1940,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1949 btrfs_abort_transaction(trans, root, ret); 1940 btrfs_abort_transaction(trans, root, ret);
1950 goto out_unlock; 1941 goto out_unlock;
1951 } 1942 }
1943 } else {
1944 btrfs_set_inode_last_trans(trans, inode);
1952 } 1945 }
1953 ret = 0; 1946 ret = 0;
1954out_unlock: 1947out_unlock:
@@ -1958,12 +1951,8 @@ out_unlock:
1958out: 1951out:
1959 if (root != root->fs_info->tree_root) 1952 if (root != root->fs_info->tree_root)
1960 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1953 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1961 if (trans) { 1954 if (trans)
1962 if (nolock) 1955 btrfs_end_transaction(trans, root);
1963 btrfs_end_transaction_nolock(trans, root);
1964 else
1965 btrfs_end_transaction(trans, root);
1966 }
1967 1956
1968 if (ret) 1957 if (ret)
1969 clear_extent_uptodate(io_tree, ordered_extent->file_offset, 1958 clear_extent_uptodate(io_tree, ordered_extent->file_offset,
@@ -1971,8 +1960,8 @@ out:
1971 ordered_extent->len - 1, NULL, GFP_NOFS); 1960 ordered_extent->len - 1, NULL, GFP_NOFS);
1972 1961
1973 /* 1962 /*
1974 * This needs to be dont to make sure anybody waiting knows we are done 1963 * This needs to be done to make sure anybody waiting knows we are done
1975 * upating everything for this ordered extent. 1964 * updating everything for this ordered extent.
1976 */ 1965 */
1977 btrfs_remove_ordered_extent(inode, ordered_extent); 1966 btrfs_remove_ordered_extent(inode, ordered_extent);
1978 1967
@@ -2119,7 +2108,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2119 if (empty) 2108 if (empty)
2120 return; 2109 return;
2121 2110
2122 down_read(&root->fs_info->cleanup_work_sem);
2123 spin_lock(&fs_info->delayed_iput_lock); 2111 spin_lock(&fs_info->delayed_iput_lock);
2124 list_splice_init(&fs_info->delayed_iputs, &list); 2112 list_splice_init(&fs_info->delayed_iputs, &list);
2125 spin_unlock(&fs_info->delayed_iput_lock); 2113 spin_unlock(&fs_info->delayed_iput_lock);
@@ -2130,7 +2118,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2130 iput(delayed->inode); 2118 iput(delayed->inode);
2131 kfree(delayed); 2119 kfree(delayed);
2132 } 2120 }
2133 up_read(&root->fs_info->cleanup_work_sem);
2134} 2121}
2135 2122
2136enum btrfs_orphan_cleanup_state { 2123enum btrfs_orphan_cleanup_state {
@@ -2198,7 +2185,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2198 int ret; 2185 int ret;
2199 2186
2200 if (!root->orphan_block_rsv) { 2187 if (!root->orphan_block_rsv) {
2201 block_rsv = btrfs_alloc_block_rsv(root); 2188 block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
2202 if (!block_rsv) 2189 if (!block_rsv)
2203 return -ENOMEM; 2190 return -ENOMEM;
2204 } 2191 }
@@ -2225,7 +2212,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2225 insert = 1; 2212 insert = 1;
2226#endif 2213#endif
2227 insert = 1; 2214 insert = 1;
2228 atomic_dec(&root->orphan_inodes); 2215 atomic_inc(&root->orphan_inodes);
2229 } 2216 }
2230 2217
2231 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 2218 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
@@ -2572,8 +2559,8 @@ static void btrfs_read_locked_inode(struct inode *inode)
2572 struct btrfs_inode_item); 2559 struct btrfs_inode_item);
2573 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 2560 inode->i_mode = btrfs_inode_mode(leaf, inode_item);
2574 set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); 2561 set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
2575 inode->i_uid = btrfs_inode_uid(leaf, inode_item); 2562 i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
2576 inode->i_gid = btrfs_inode_gid(leaf, inode_item); 2563 i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
2577 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); 2564 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
2578 2565
2579 tspec = btrfs_inode_atime(inode_item); 2566 tspec = btrfs_inode_atime(inode_item);
@@ -2590,6 +2577,18 @@ static void btrfs_read_locked_inode(struct inode *inode)
2590 2577
2591 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 2578 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
2592 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 2579 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
2580 BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
2581
2582 /*
2583 * If we were modified in the current generation and evicted from memory
2584 * and then re-read we need to do a full sync since we don't have any
2585 * idea about which extents were modified before we were evicted from
2586 * cache.
2587 */
2588 if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
2589 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2590 &BTRFS_I(inode)->runtime_flags);
2591
2593 inode->i_version = btrfs_inode_sequence(leaf, inode_item); 2592 inode->i_version = btrfs_inode_sequence(leaf, inode_item);
2594 inode->i_generation = BTRFS_I(inode)->generation; 2593 inode->i_generation = BTRFS_I(inode)->generation;
2595 inode->i_rdev = 0; 2594 inode->i_rdev = 0;
@@ -2651,8 +2650,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2651 struct btrfs_inode_item *item, 2650 struct btrfs_inode_item *item,
2652 struct inode *inode) 2651 struct inode *inode)
2653{ 2652{
2654 btrfs_set_inode_uid(leaf, item, inode->i_uid); 2653 btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
2655 btrfs_set_inode_gid(leaf, item, inode->i_gid); 2654 btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
2656 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 2655 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
2657 btrfs_set_inode_mode(leaf, item, inode->i_mode); 2656 btrfs_set_inode_mode(leaf, item, inode->i_mode);
2658 btrfs_set_inode_nlink(leaf, item, inode->i_nlink); 2657 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
@@ -2894,7 +2893,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2894 struct btrfs_trans_handle *trans; 2893 struct btrfs_trans_handle *trans;
2895 struct btrfs_root *root = BTRFS_I(dir)->root; 2894 struct btrfs_root *root = BTRFS_I(dir)->root;
2896 struct btrfs_path *path; 2895 struct btrfs_path *path;
2897 struct btrfs_inode_ref *ref;
2898 struct btrfs_dir_item *di; 2896 struct btrfs_dir_item *di;
2899 struct inode *inode = dentry->d_inode; 2897 struct inode *inode = dentry->d_inode;
2900 u64 index; 2898 u64 index;
@@ -3008,17 +3006,17 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
3008 } 3006 }
3009 btrfs_release_path(path); 3007 btrfs_release_path(path);
3010 3008
3011 ref = btrfs_lookup_inode_ref(trans, root, path, 3009 ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
3012 dentry->d_name.name, dentry->d_name.len, 3010 dentry->d_name.len, ino, dir_ino, 0,
3013 ino, dir_ino, 0); 3011 &index);
3014 if (IS_ERR(ref)) { 3012 if (ret) {
3015 err = PTR_ERR(ref); 3013 err = ret;
3016 goto out; 3014 goto out;
3017 } 3015 }
3018 BUG_ON(!ref); /* Logic error */ 3016
3019 if (check_path_shared(root, path)) 3017 if (check_path_shared(root, path))
3020 goto out; 3018 goto out;
3021 index = btrfs_inode_ref_index(path->nodes[0], ref); 3019
3022 btrfs_release_path(path); 3020 btrfs_release_path(path);
3023 3021
3024 /* 3022 /*
@@ -3061,7 +3059,7 @@ out:
3061static void __unlink_end_trans(struct btrfs_trans_handle *trans, 3059static void __unlink_end_trans(struct btrfs_trans_handle *trans,
3062 struct btrfs_root *root) 3060 struct btrfs_root *root)
3063{ 3061{
3064 if (trans->block_rsv == &root->fs_info->global_block_rsv) { 3062 if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
3065 btrfs_block_rsv_release(root, trans->block_rsv, 3063 btrfs_block_rsv_release(root, trans->block_rsv,
3066 trans->bytes_reserved); 3064 trans->bytes_reserved);
3067 trans->block_rsv = &root->fs_info->trans_block_rsv; 3065 trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3191,9 +3189,10 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3191 struct btrfs_trans_handle *trans; 3189 struct btrfs_trans_handle *trans;
3192 unsigned long nr = 0; 3190 unsigned long nr = 0;
3193 3191
3194 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || 3192 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
3195 btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
3196 return -ENOTEMPTY; 3193 return -ENOTEMPTY;
3194 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
3195 return -EPERM;
3197 3196
3198 trans = __unlink_start_trans(dir, dentry); 3197 trans = __unlink_start_trans(dir, dentry);
3199 if (IS_ERR(trans)) 3198 if (IS_ERR(trans))
@@ -3267,8 +3266,13 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3267 return -ENOMEM; 3266 return -ENOMEM;
3268 path->reada = -1; 3267 path->reada = -1;
3269 3268
3269 /*
3270 * We want to drop from the next block forward in case this new size is
3271 * not block aligned since we will be keeping the last block of the
3272 * extent just the way it is.
3273 */
3270 if (root->ref_cows || root == root->fs_info->tree_root) 3274 if (root->ref_cows || root == root->fs_info->tree_root)
3271 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 3275 btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0);
3272 3276
3273 /* 3277 /*
3274 * This function is also used to drop the items in the log tree before 3278 * This function is also used to drop the items in the log tree before
@@ -3429,12 +3433,6 @@ delete:
3429 3433
3430 if (path->slots[0] == 0 || 3434 if (path->slots[0] == 0 ||
3431 path->slots[0] != pending_del_slot) { 3435 path->slots[0] != pending_del_slot) {
3432 if (root->ref_cows &&
3433 BTRFS_I(inode)->location.objectid !=
3434 BTRFS_FREE_INO_OBJECTID) {
3435 err = -EAGAIN;
3436 goto out;
3437 }
3438 if (pending_del_nr) { 3436 if (pending_del_nr) {
3439 ret = btrfs_del_items(trans, root, path, 3437 ret = btrfs_del_items(trans, root, path,
3440 pending_del_slot, 3438 pending_del_slot,
@@ -3465,12 +3463,20 @@ error:
3465} 3463}
3466 3464
3467/* 3465/*
3468 * taken from block_truncate_page, but does cow as it zeros out 3466 * btrfs_truncate_page - read, zero a chunk and write a page
3469 * any bytes left in the last page in the file. 3467 * @inode - inode that we're zeroing
3468 * @from - the offset to start zeroing
3469 * @len - the length to zero, 0 to zero the entire range respective to the
3470 * offset
3471 * @front - zero up to the offset instead of from the offset on
3472 *
3473 * This will find the page for the "from" offset and cow the page and zero the
3474 * part we want to zero. This is used with truncate and hole punching.
3470 */ 3475 */
3471static int btrfs_truncate_page(struct address_space *mapping, loff_t from) 3476int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
3477 int front)
3472{ 3478{
3473 struct inode *inode = mapping->host; 3479 struct address_space *mapping = inode->i_mapping;
3474 struct btrfs_root *root = BTRFS_I(inode)->root; 3480 struct btrfs_root *root = BTRFS_I(inode)->root;
3475 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3481 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3476 struct btrfs_ordered_extent *ordered; 3482 struct btrfs_ordered_extent *ordered;
@@ -3485,7 +3491,8 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3485 u64 page_start; 3491 u64 page_start;
3486 u64 page_end; 3492 u64 page_end;
3487 3493
3488 if ((offset & (blocksize - 1)) == 0) 3494 if ((offset & (blocksize - 1)) == 0 &&
3495 (!len || ((len & (blocksize - 1)) == 0)))
3489 goto out; 3496 goto out;
3490 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 3497 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
3491 if (ret) 3498 if (ret)
@@ -3532,7 +3539,8 @@ again:
3532 } 3539 }
3533 3540
3534 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 3541 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
3535 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 3542 EXTENT_DIRTY | EXTENT_DELALLOC |
3543 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
3536 0, 0, &cached_state, GFP_NOFS); 3544 0, 0, &cached_state, GFP_NOFS);
3537 3545
3538 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 3546 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -3545,8 +3553,13 @@ again:
3545 3553
3546 ret = 0; 3554 ret = 0;
3547 if (offset != PAGE_CACHE_SIZE) { 3555 if (offset != PAGE_CACHE_SIZE) {
3556 if (!len)
3557 len = PAGE_CACHE_SIZE - offset;
3548 kaddr = kmap(page); 3558 kaddr = kmap(page);
3549 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); 3559 if (front)
3560 memset(kaddr, 0, offset);
3561 else
3562 memset(kaddr + offset, 0, len);
3550 flush_dcache_page(page); 3563 flush_dcache_page(page);
3551 kunmap(page); 3564 kunmap(page);
3552 } 3565 }
@@ -3577,6 +3590,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3577 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3590 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3578 struct extent_map *em = NULL; 3591 struct extent_map *em = NULL;
3579 struct extent_state *cached_state = NULL; 3592 struct extent_state *cached_state = NULL;
3593 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3580 u64 mask = root->sectorsize - 1; 3594 u64 mask = root->sectorsize - 1;
3581 u64 hole_start = (oldsize + mask) & ~mask; 3595 u64 hole_start = (oldsize + mask) & ~mask;
3582 u64 block_end = (size + mask) & ~mask; 3596 u64 block_end = (size + mask) & ~mask;
@@ -3613,7 +3627,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3613 last_byte = min(extent_map_end(em), block_end); 3627 last_byte = min(extent_map_end(em), block_end);
3614 last_byte = (last_byte + mask) & ~mask; 3628 last_byte = (last_byte + mask) & ~mask;
3615 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3629 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3616 u64 hint_byte = 0; 3630 struct extent_map *hole_em;
3617 hole_size = last_byte - cur_offset; 3631 hole_size = last_byte - cur_offset;
3618 3632
3619 trans = btrfs_start_transaction(root, 3); 3633 trans = btrfs_start_transaction(root, 3);
@@ -3622,9 +3636,9 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3622 break; 3636 break;
3623 } 3637 }
3624 3638
3625 err = btrfs_drop_extents(trans, inode, cur_offset, 3639 err = btrfs_drop_extents(trans, root, inode,
3626 cur_offset + hole_size, 3640 cur_offset,
3627 &hint_byte, 1); 3641 cur_offset + hole_size, 1);
3628 if (err) { 3642 if (err) {
3629 btrfs_abort_transaction(trans, root, err); 3643 btrfs_abort_transaction(trans, root, err);
3630 btrfs_end_transaction(trans, root); 3644 btrfs_end_transaction(trans, root);
@@ -3641,9 +3655,39 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3641 break; 3655 break;
3642 } 3656 }
3643 3657
3644 btrfs_drop_extent_cache(inode, hole_start, 3658 btrfs_drop_extent_cache(inode, cur_offset,
3645 last_byte - 1, 0); 3659 cur_offset + hole_size - 1, 0);
3660 hole_em = alloc_extent_map();
3661 if (!hole_em) {
3662 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3663 &BTRFS_I(inode)->runtime_flags);
3664 goto next;
3665 }
3666 hole_em->start = cur_offset;
3667 hole_em->len = hole_size;
3668 hole_em->orig_start = cur_offset;
3669
3670 hole_em->block_start = EXTENT_MAP_HOLE;
3671 hole_em->block_len = 0;
3672 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
3673 hole_em->compress_type = BTRFS_COMPRESS_NONE;
3674 hole_em->generation = trans->transid;
3646 3675
3676 while (1) {
3677 write_lock(&em_tree->lock);
3678 err = add_extent_mapping(em_tree, hole_em);
3679 if (!err)
3680 list_move(&hole_em->list,
3681 &em_tree->modified_extents);
3682 write_unlock(&em_tree->lock);
3683 if (err != -EEXIST)
3684 break;
3685 btrfs_drop_extent_cache(inode, cur_offset,
3686 cur_offset +
3687 hole_size - 1, 0);
3688 }
3689 free_extent_map(hole_em);
3690next:
3647 btrfs_update_inode(trans, root, inode); 3691 btrfs_update_inode(trans, root, inode);
3648 btrfs_end_transaction(trans, root); 3692 btrfs_end_transaction(trans, root);
3649 } 3693 }
@@ -3768,26 +3812,22 @@ void btrfs_evict_inode(struct inode *inode)
3768 goto no_delete; 3812 goto no_delete;
3769 } 3813 }
3770 3814
3771 rsv = btrfs_alloc_block_rsv(root); 3815 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
3772 if (!rsv) { 3816 if (!rsv) {
3773 btrfs_orphan_del(NULL, inode); 3817 btrfs_orphan_del(NULL, inode);
3774 goto no_delete; 3818 goto no_delete;
3775 } 3819 }
3776 rsv->size = min_size; 3820 rsv->size = min_size;
3821 rsv->failfast = 1;
3777 global_rsv = &root->fs_info->global_block_rsv; 3822 global_rsv = &root->fs_info->global_block_rsv;
3778 3823
3779 btrfs_i_size_write(inode, 0); 3824 btrfs_i_size_write(inode, 0);
3780 3825
3781 /* 3826 /*
3782 * This is a bit simpler than btrfs_truncate since 3827 * This is a bit simpler than btrfs_truncate since we've already
3783 * 3828 * reserved our space for our orphan item in the unlink, so we just
3784 * 1) We've already reserved our space for our orphan item in the 3829 * need to reserve some slack space in case we add bytes and update
3785 * unlink. 3830 * inode item when doing the truncate.
3786 * 2) We're going to delete the inode item, so we don't need to update
3787 * it at all.
3788 *
3789 * So we just need to reserve some slack space in case we add bytes when
3790 * doing the truncate.
3791 */ 3831 */
3792 while (1) { 3832 while (1) {
3793 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); 3833 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
@@ -3808,7 +3848,7 @@ void btrfs_evict_inode(struct inode *inode)
3808 goto no_delete; 3848 goto no_delete;
3809 } 3849 }
3810 3850
3811 trans = btrfs_start_transaction(root, 0); 3851 trans = btrfs_start_transaction_noflush(root, 1);
3812 if (IS_ERR(trans)) { 3852 if (IS_ERR(trans)) {
3813 btrfs_orphan_del(NULL, inode); 3853 btrfs_orphan_del(NULL, inode);
3814 btrfs_free_block_rsv(root, rsv); 3854 btrfs_free_block_rsv(root, rsv);
@@ -3818,9 +3858,13 @@ void btrfs_evict_inode(struct inode *inode)
3818 trans->block_rsv = rsv; 3858 trans->block_rsv = rsv;
3819 3859
3820 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3860 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3821 if (ret != -EAGAIN) 3861 if (ret != -ENOSPC)
3822 break; 3862 break;
3823 3863
3864 trans->block_rsv = &root->fs_info->trans_block_rsv;
3865 ret = btrfs_update_inode(trans, root, inode);
3866 BUG_ON(ret);
3867
3824 nr = trans->blocks_used; 3868 nr = trans->blocks_used;
3825 btrfs_end_transaction(trans, root); 3869 btrfs_end_transaction(trans, root);
3826 trans = NULL; 3870 trans = NULL;
@@ -4470,10 +4514,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4470 trans = btrfs_join_transaction(root); 4514 trans = btrfs_join_transaction(root);
4471 if (IS_ERR(trans)) 4515 if (IS_ERR(trans))
4472 return PTR_ERR(trans); 4516 return PTR_ERR(trans);
4473 if (nolock) 4517 ret = btrfs_commit_transaction(trans, root);
4474 ret = btrfs_end_transaction_nolock(trans, root);
4475 else
4476 ret = btrfs_commit_transaction(trans, root);
4477 } 4518 }
4478 return ret; 4519 return ret;
4479} 4520}
@@ -4671,6 +4712,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4671 BTRFS_I(inode)->generation = trans->transid; 4712 BTRFS_I(inode)->generation = trans->transid;
4672 inode->i_generation = BTRFS_I(inode)->generation; 4713 inode->i_generation = BTRFS_I(inode)->generation;
4673 4714
4715 /*
4716 * We could have gotten an inode number from somebody who was fsynced
4717 * and then removed in this same transaction, so let's just set full
4718 * sync since it will be a full sync anyway and this will blow away the
4719 * old info in the log.
4720 */
4721 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
4722
4674 if (S_ISDIR(mode)) 4723 if (S_ISDIR(mode))
4675 owner = 0; 4724 owner = 0;
4676 else 4725 else
@@ -4680,6 +4729,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4680 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 4729 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
4681 key[0].offset = 0; 4730 key[0].offset = 0;
4682 4731
4732 /*
4733 * Start new inodes with an inode_ref. This is slightly more
4734 * efficient for small numbers of hard links since they will
4735 * be packed into one item. Extended refs will kick in if we
4736 * add more hard links than can fit in the ref item.
4737 */
4683 key[1].objectid = objectid; 4738 key[1].objectid = objectid;
4684 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); 4739 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
4685 key[1].offset = ref_objectid; 4740 key[1].offset = ref_objectid;
@@ -4986,7 +5041,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4986 if (root->objectid != BTRFS_I(inode)->root->objectid) 5041 if (root->objectid != BTRFS_I(inode)->root->objectid)
4987 return -EXDEV; 5042 return -EXDEV;
4988 5043
4989 if (inode->i_nlink == ~0U) 5044 if (inode->i_nlink >= BTRFS_LINK_MAX)
4990 return -EMLINK; 5045 return -EMLINK;
4991 5046
4992 err = btrfs_set_inode_index(dir, &index); 5047 err = btrfs_set_inode_index(dir, &index);
@@ -5450,7 +5505,8 @@ insert:
5450 write_unlock(&em_tree->lock); 5505 write_unlock(&em_tree->lock);
5451out: 5506out:
5452 5507
5453 trace_btrfs_get_extent(root, em); 5508 if (em)
5509 trace_btrfs_get_extent(root, em);
5454 5510
5455 if (path) 5511 if (path)
5456 btrfs_free_path(path); 5512 btrfs_free_path(path);
@@ -5836,6 +5892,48 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
5836 return ret; 5892 return ret;
5837} 5893}
5838 5894
5895static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
5896 u64 len, u64 orig_start,
5897 u64 block_start, u64 block_len,
5898 int type)
5899{
5900 struct extent_map_tree *em_tree;
5901 struct extent_map *em;
5902 struct btrfs_root *root = BTRFS_I(inode)->root;
5903 int ret;
5904
5905 em_tree = &BTRFS_I(inode)->extent_tree;
5906 em = alloc_extent_map();
5907 if (!em)
5908 return ERR_PTR(-ENOMEM);
5909
5910 em->start = start;
5911 em->orig_start = orig_start;
5912 em->len = len;
5913 em->block_len = block_len;
5914 em->block_start = block_start;
5915 em->bdev = root->fs_info->fs_devices->latest_bdev;
5916 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5917 if (type == BTRFS_ORDERED_PREALLOC)
5918 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
5919
5920 do {
5921 btrfs_drop_extent_cache(inode, em->start,
5922 em->start + em->len - 1, 0);
5923 write_lock(&em_tree->lock);
5924 ret = add_extent_mapping(em_tree, em);
5925 write_unlock(&em_tree->lock);
5926 } while (ret == -EEXIST);
5927
5928 if (ret) {
5929 free_extent_map(em);
5930 return ERR_PTR(ret);
5931 }
5932
5933 return em;
5934}
5935
5936
5839static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, 5937static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5840 struct buffer_head *bh_result, int create) 5938 struct buffer_head *bh_result, int create)
5841{ 5939{
@@ -5950,6 +6048,19 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5950 goto must_cow; 6048 goto must_cow;
5951 6049
5952 if (can_nocow_odirect(trans, inode, start, len) == 1) { 6050 if (can_nocow_odirect(trans, inode, start, len) == 1) {
6051 u64 orig_start = em->start;
6052
6053 if (type == BTRFS_ORDERED_PREALLOC) {
6054 free_extent_map(em);
6055 em = create_pinned_em(inode, start, len,
6056 orig_start,
6057 block_start, len, type);
6058 if (IS_ERR(em)) {
6059 btrfs_end_transaction(trans, root);
6060 goto unlock_err;
6061 }
6062 }
6063
5953 ret = btrfs_add_ordered_extent_dio(inode, start, 6064 ret = btrfs_add_ordered_extent_dio(inode, start,
5954 block_start, len, len, type); 6065 block_start, len, len, type);
5955 btrfs_end_transaction(trans, root); 6066 btrfs_end_transaction(trans, root);
@@ -5999,7 +6110,8 @@ unlock:
5999 if (lockstart < lockend) { 6110 if (lockstart < lockend) {
6000 if (create && len < lockend - lockstart) { 6111 if (create && len < lockend - lockstart) {
6001 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 6112 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6002 lockstart + len - 1, unlock_bits, 1, 0, 6113 lockstart + len - 1,
6114 unlock_bits | EXTENT_DEFRAG, 1, 0,
6003 &cached_state, GFP_NOFS); 6115 &cached_state, GFP_NOFS);
6004 /* 6116 /*
6005 * Beside unlock, we also need to cleanup reserved space 6117 * Beside unlock, we also need to cleanup reserved space
@@ -6007,8 +6119,8 @@ unlock:
6007 */ 6119 */
6008 clear_extent_bit(&BTRFS_I(inode)->io_tree, 6120 clear_extent_bit(&BTRFS_I(inode)->io_tree,
6009 lockstart + len, lockend, 6121 lockstart + len, lockend,
6010 unlock_bits | EXTENT_DO_ACCOUNTING, 6122 unlock_bits | EXTENT_DO_ACCOUNTING |
6011 1, 0, NULL, GFP_NOFS); 6123 EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS);
6012 } else { 6124 } else {
6013 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 6125 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6014 lockend, unlock_bits, 1, 0, 6126 lockend, unlock_bits, 1, 0,
@@ -6573,8 +6685,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6573 */ 6685 */
6574 clear_extent_bit(tree, page_start, page_end, 6686 clear_extent_bit(tree, page_start, page_end,
6575 EXTENT_DIRTY | EXTENT_DELALLOC | 6687 EXTENT_DIRTY | EXTENT_DELALLOC |
6576 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, 6688 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
6577 &cached_state, GFP_NOFS); 6689 EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS);
6578 /* 6690 /*
6579 * whoever cleared the private bit is responsible 6691 * whoever cleared the private bit is responsible
6580 * for the finish_ordered_io 6692 * for the finish_ordered_io
@@ -6590,7 +6702,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6590 } 6702 }
6591 clear_extent_bit(tree, page_start, page_end, 6703 clear_extent_bit(tree, page_start, page_end,
6592 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 6704 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
6593 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS); 6705 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
6706 &cached_state, GFP_NOFS);
6594 __btrfs_releasepage(page, GFP_NOFS); 6707 __btrfs_releasepage(page, GFP_NOFS);
6595 6708
6596 ClearPageChecked(page); 6709 ClearPageChecked(page);
@@ -6687,7 +6800,8 @@ again:
6687 * prepare_pages in the normal write path. 6800 * prepare_pages in the normal write path.
6688 */ 6801 */
6689 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 6802 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
6690 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 6803 EXTENT_DIRTY | EXTENT_DELALLOC |
6804 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
6691 0, 0, &cached_state, GFP_NOFS); 6805 0, 0, &cached_state, GFP_NOFS);
6692 6806
6693 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 6807 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -6718,6 +6832,7 @@ again:
6718 6832
6719 BTRFS_I(inode)->last_trans = root->fs_info->generation; 6833 BTRFS_I(inode)->last_trans = root->fs_info->generation;
6720 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 6834 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
6835 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
6721 6836
6722 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); 6837 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
6723 6838
@@ -6745,7 +6860,7 @@ static int btrfs_truncate(struct inode *inode)
6745 u64 mask = root->sectorsize - 1; 6860 u64 mask = root->sectorsize - 1;
6746 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 6861 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6747 6862
6748 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6863 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
6749 if (ret) 6864 if (ret)
6750 return ret; 6865 return ret;
6751 6866
@@ -6788,10 +6903,11 @@ static int btrfs_truncate(struct inode *inode)
6788 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for 6903 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
6789 * updating the inode. 6904 * updating the inode.
6790 */ 6905 */
6791 rsv = btrfs_alloc_block_rsv(root); 6906 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
6792 if (!rsv) 6907 if (!rsv)
6793 return -ENOMEM; 6908 return -ENOMEM;
6794 rsv->size = min_size; 6909 rsv->size = min_size;
6910 rsv->failfast = 1;
6795 6911
6796 /* 6912 /*
6797 * 1 for the truncate slack space 6913 * 1 for the truncate slack space
@@ -6837,36 +6953,21 @@ static int btrfs_truncate(struct inode *inode)
6837 &BTRFS_I(inode)->runtime_flags)) 6953 &BTRFS_I(inode)->runtime_flags))
6838 btrfs_add_ordered_operation(trans, root, inode); 6954 btrfs_add_ordered_operation(trans, root, inode);
6839 6955
6840 while (1) { 6956 /*
6841 ret = btrfs_block_rsv_refill(root, rsv, min_size); 6957 * So if we truncate and then write and fsync we normally would just
6842 if (ret) { 6958 * write the extents that changed, which is a problem if we need to
6843 /* 6959 * first truncate that entire inode. So set this flag so we write out
6844 * This can only happen with the original transaction we 6960 * all of the extents in the inode to the sync log so we're completely
6845 * started above, every other time we shouldn't have a 6961 * safe.
6846 * transaction started yet. 6962 */
6847 */ 6963 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
6848 if (ret == -EAGAIN) 6964 trans->block_rsv = rsv;
6849 goto end_trans;
6850 err = ret;
6851 break;
6852 }
6853
6854 if (!trans) {
6855 /* Just need the 1 for updating the inode */
6856 trans = btrfs_start_transaction(root, 1);
6857 if (IS_ERR(trans)) {
6858 ret = err = PTR_ERR(trans);
6859 trans = NULL;
6860 break;
6861 }
6862 }
6863
6864 trans->block_rsv = rsv;
6865 6965
6966 while (1) {
6866 ret = btrfs_truncate_inode_items(trans, root, inode, 6967 ret = btrfs_truncate_inode_items(trans, root, inode,
6867 inode->i_size, 6968 inode->i_size,
6868 BTRFS_EXTENT_DATA_KEY); 6969 BTRFS_EXTENT_DATA_KEY);
6869 if (ret != -EAGAIN) { 6970 if (ret != -ENOSPC) {
6870 err = ret; 6971 err = ret;
6871 break; 6972 break;
6872 } 6973 }
@@ -6877,11 +6978,22 @@ static int btrfs_truncate(struct inode *inode)
6877 err = ret; 6978 err = ret;
6878 break; 6979 break;
6879 } 6980 }
6880end_trans: 6981
6881 nr = trans->blocks_used; 6982 nr = trans->blocks_used;
6882 btrfs_end_transaction(trans, root); 6983 btrfs_end_transaction(trans, root);
6883 trans = NULL;
6884 btrfs_btree_balance_dirty(root, nr); 6984 btrfs_btree_balance_dirty(root, nr);
6985
6986 trans = btrfs_start_transaction(root, 2);
6987 if (IS_ERR(trans)) {
6988 ret = err = PTR_ERR(trans);
6989 trans = NULL;
6990 break;
6991 }
6992
6993 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
6994 rsv, min_size);
6995 BUG_ON(ret); /* shouldn't happen */
6996 trans->block_rsv = rsv;
6885 } 6997 }
6886 6998
6887 if (ret == 0 && inode->i_nlink > 0) { 6999 if (ret == 0 && inode->i_nlink > 0) {
@@ -6965,6 +7077,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6965 ei->csum_bytes = 0; 7077 ei->csum_bytes = 0;
6966 ei->index_cnt = (u64)-1; 7078 ei->index_cnt = (u64)-1;
6967 ei->last_unlink_trans = 0; 7079 ei->last_unlink_trans = 0;
7080 ei->last_log_commit = 0;
6968 7081
6969 spin_lock_init(&ei->lock); 7082 spin_lock_init(&ei->lock);
6970 ei->outstanding_extents = 0; 7083 ei->outstanding_extents = 0;
@@ -7076,6 +7189,11 @@ static void init_once(void *foo)
7076 7189
7077void btrfs_destroy_cachep(void) 7190void btrfs_destroy_cachep(void)
7078{ 7191{
7192 /*
7193 * Make sure all delayed rcu free inodes are flushed before we
7194 * destroy cache.
7195 */
7196 rcu_barrier();
7079 if (btrfs_inode_cachep) 7197 if (btrfs_inode_cachep)
7080 kmem_cache_destroy(btrfs_inode_cachep); 7198 kmem_cache_destroy(btrfs_inode_cachep);
7081 if (btrfs_trans_handle_cachep) 7199 if (btrfs_trans_handle_cachep)
@@ -7090,31 +7208,31 @@ void btrfs_destroy_cachep(void)
7090 7208
7091int btrfs_init_cachep(void) 7209int btrfs_init_cachep(void)
7092{ 7210{
7093 btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", 7211 btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
7094 sizeof(struct btrfs_inode), 0, 7212 sizeof(struct btrfs_inode), 0,
7095 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); 7213 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
7096 if (!btrfs_inode_cachep) 7214 if (!btrfs_inode_cachep)
7097 goto fail; 7215 goto fail;
7098 7216
7099 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", 7217 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
7100 sizeof(struct btrfs_trans_handle), 0, 7218 sizeof(struct btrfs_trans_handle), 0,
7101 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7219 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7102 if (!btrfs_trans_handle_cachep) 7220 if (!btrfs_trans_handle_cachep)
7103 goto fail; 7221 goto fail;
7104 7222
7105 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", 7223 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
7106 sizeof(struct btrfs_transaction), 0, 7224 sizeof(struct btrfs_transaction), 0,
7107 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7225 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7108 if (!btrfs_transaction_cachep) 7226 if (!btrfs_transaction_cachep)
7109 goto fail; 7227 goto fail;
7110 7228
7111 btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", 7229 btrfs_path_cachep = kmem_cache_create("btrfs_path",
7112 sizeof(struct btrfs_path), 0, 7230 sizeof(struct btrfs_path), 0,
7113 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7231 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7114 if (!btrfs_path_cachep) 7232 if (!btrfs_path_cachep)
7115 goto fail; 7233 goto fail;
7116 7234
7117 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache", 7235 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
7118 sizeof(struct btrfs_free_space), 0, 7236 sizeof(struct btrfs_free_space), 0,
7119 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7237 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7120 if (!btrfs_free_space_cachep) 7238 if (!btrfs_free_space_cachep)
@@ -7508,6 +7626,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7508 loff_t actual_len, u64 *alloc_hint, 7626 loff_t actual_len, u64 *alloc_hint,
7509 struct btrfs_trans_handle *trans) 7627 struct btrfs_trans_handle *trans)
7510{ 7628{
7629 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
7630 struct extent_map *em;
7511 struct btrfs_root *root = BTRFS_I(inode)->root; 7631 struct btrfs_root *root = BTRFS_I(inode)->root;
7512 struct btrfs_key ins; 7632 struct btrfs_key ins;
7513 u64 cur_offset = start; 7633 u64 cur_offset = start;
@@ -7548,6 +7668,37 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7548 btrfs_drop_extent_cache(inode, cur_offset, 7668 btrfs_drop_extent_cache(inode, cur_offset,
7549 cur_offset + ins.offset -1, 0); 7669 cur_offset + ins.offset -1, 0);
7550 7670
7671 em = alloc_extent_map();
7672 if (!em) {
7673 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
7674 &BTRFS_I(inode)->runtime_flags);
7675 goto next;
7676 }
7677
7678 em->start = cur_offset;
7679 em->orig_start = cur_offset;
7680 em->len = ins.offset;
7681 em->block_start = ins.objectid;
7682 em->block_len = ins.offset;
7683 em->bdev = root->fs_info->fs_devices->latest_bdev;
7684 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
7685 em->generation = trans->transid;
7686
7687 while (1) {
7688 write_lock(&em_tree->lock);
7689 ret = add_extent_mapping(em_tree, em);
7690 if (!ret)
7691 list_move(&em->list,
7692 &em_tree->modified_extents);
7693 write_unlock(&em_tree->lock);
7694 if (ret != -EEXIST)
7695 break;
7696 btrfs_drop_extent_cache(inode, cur_offset,
7697 cur_offset + ins.offset - 1,
7698 0);
7699 }
7700 free_extent_map(em);
7701next:
7551 num_bytes -= ins.offset; 7702 num_bytes -= ins.offset;
7552 cur_offset += ins.offset; 7703 cur_offset += ins.offset;
7553 *alloc_hint = ins.objectid + ins.offset; 7704 *alloc_hint = ins.objectid + ins.offset;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9df50fa8a078..61168805f175 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -181,6 +181,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
181 int ret; 181 int ret;
182 u64 ip_oldflags; 182 u64 ip_oldflags;
183 unsigned int i_oldflags; 183 unsigned int i_oldflags;
184 umode_t mode;
184 185
185 if (btrfs_root_readonly(root)) 186 if (btrfs_root_readonly(root))
186 return -EROFS; 187 return -EROFS;
@@ -203,6 +204,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
203 204
204 ip_oldflags = ip->flags; 205 ip_oldflags = ip->flags;
205 i_oldflags = inode->i_flags; 206 i_oldflags = inode->i_flags;
207 mode = inode->i_mode;
206 208
207 flags = btrfs_mask_flags(inode->i_mode, flags); 209 flags = btrfs_mask_flags(inode->i_mode, flags);
208 oldflags = btrfs_flags_to_ioctl(ip->flags); 210 oldflags = btrfs_flags_to_ioctl(ip->flags);
@@ -237,10 +239,31 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
237 ip->flags |= BTRFS_INODE_DIRSYNC; 239 ip->flags |= BTRFS_INODE_DIRSYNC;
238 else 240 else
239 ip->flags &= ~BTRFS_INODE_DIRSYNC; 241 ip->flags &= ~BTRFS_INODE_DIRSYNC;
240 if (flags & FS_NOCOW_FL) 242 if (flags & FS_NOCOW_FL) {
241 ip->flags |= BTRFS_INODE_NODATACOW; 243 if (S_ISREG(mode)) {
242 else 244 /*
243 ip->flags &= ~BTRFS_INODE_NODATACOW; 245 * It's safe to turn csums off here, no extents exist.
246 * Otherwise we want the flag to reflect the real COW
247 * status of the file and will not set it.
248 */
249 if (inode->i_size == 0)
250 ip->flags |= BTRFS_INODE_NODATACOW
251 | BTRFS_INODE_NODATASUM;
252 } else {
253 ip->flags |= BTRFS_INODE_NODATACOW;
254 }
255 } else {
256 /*
257 * Revert back under same assuptions as above
258 */
259 if (S_ISREG(mode)) {
260 if (inode->i_size == 0)
261 ip->flags &= ~(BTRFS_INODE_NODATACOW
262 | BTRFS_INODE_NODATASUM);
263 } else {
264 ip->flags &= ~BTRFS_INODE_NODATACOW;
265 }
266 }
244 267
245 /* 268 /*
246 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS 269 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
@@ -516,7 +539,8 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
516 if (!pending_snapshot) 539 if (!pending_snapshot)
517 return -ENOMEM; 540 return -ENOMEM;
518 541
519 btrfs_init_block_rsv(&pending_snapshot->block_rsv); 542 btrfs_init_block_rsv(&pending_snapshot->block_rsv,
543 BTRFS_BLOCK_RSV_TEMP);
520 pending_snapshot->dentry = dentry; 544 pending_snapshot->dentry = dentry;
521 pending_snapshot->root = root; 545 pending_snapshot->root = root;
522 pending_snapshot->readonly = readonly; 546 pending_snapshot->readonly = readonly;
@@ -525,7 +549,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
525 *inherit = NULL; /* take responsibility to free it */ 549 *inherit = NULL; /* take responsibility to free it */
526 } 550 }
527 551
528 trans = btrfs_start_transaction(root->fs_info->extent_root, 5); 552 trans = btrfs_start_transaction(root->fs_info->extent_root, 6);
529 if (IS_ERR(trans)) { 553 if (IS_ERR(trans)) {
530 ret = PTR_ERR(trans); 554 ret = PTR_ERR(trans);
531 goto fail; 555 goto fail;
@@ -575,13 +599,13 @@ fail:
575*/ 599*/
576static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode) 600static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
577{ 601{
578 uid_t fsuid = current_fsuid(); 602 kuid_t fsuid = current_fsuid();
579 603
580 if (!(dir->i_mode & S_ISVTX)) 604 if (!(dir->i_mode & S_ISVTX))
581 return 0; 605 return 0;
582 if (inode->i_uid == fsuid) 606 if (uid_eq(inode->i_uid, fsuid))
583 return 0; 607 return 0;
584 if (dir->i_uid == fsuid) 608 if (uid_eq(dir->i_uid, fsuid))
585 return 0; 609 return 0;
586 return !capable(CAP_FOWNER); 610 return !capable(CAP_FOWNER);
587} 611}
@@ -614,7 +638,7 @@ static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir)
614 return -ENOENT; 638 return -ENOENT;
615 639
616 BUG_ON(victim->d_parent->d_inode != dir); 640 BUG_ON(victim->d_parent->d_inode != dir);
617 audit_inode_child(victim, dir); 641 audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
618 642
619 error = inode_permission(dir, MAY_WRITE | MAY_EXEC); 643 error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
620 if (error) 644 if (error)
@@ -1022,8 +1046,8 @@ again:
1022 page_start, page_end - 1, 0, &cached_state); 1046 page_start, page_end - 1, 0, &cached_state);
1023 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, 1047 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
1024 page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | 1048 page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
1025 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, 1049 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
1026 GFP_NOFS); 1050 &cached_state, GFP_NOFS);
1027 1051
1028 if (i_done != page_cnt) { 1052 if (i_done != page_cnt) {
1029 spin_lock(&BTRFS_I(inode)->lock); 1053 spin_lock(&BTRFS_I(inode)->lock);
@@ -1034,8 +1058,8 @@ again:
1034 } 1058 }
1035 1059
1036 1060
1037 btrfs_set_extent_delalloc(inode, page_start, page_end - 1, 1061 set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
1038 &cached_state); 1062 &cached_state, GFP_NOFS);
1039 1063
1040 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1064 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1041 page_start, page_end - 1, &cached_state, 1065 page_start, page_end - 1, &cached_state,
@@ -1397,7 +1421,6 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1397 u64 *transid, bool readonly, 1421 u64 *transid, bool readonly,
1398 struct btrfs_qgroup_inherit **inherit) 1422 struct btrfs_qgroup_inherit **inherit)
1399{ 1423{
1400 struct file *src_file;
1401 int namelen; 1424 int namelen;
1402 int ret = 0; 1425 int ret = 0;
1403 1426
@@ -1421,25 +1444,24 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1421 ret = btrfs_mksubvol(&file->f_path, name, namelen, 1444 ret = btrfs_mksubvol(&file->f_path, name, namelen,
1422 NULL, transid, readonly, inherit); 1445 NULL, transid, readonly, inherit);
1423 } else { 1446 } else {
1447 struct fd src = fdget(fd);
1424 struct inode *src_inode; 1448 struct inode *src_inode;
1425 src_file = fget(fd); 1449 if (!src.file) {
1426 if (!src_file) {
1427 ret = -EINVAL; 1450 ret = -EINVAL;
1428 goto out_drop_write; 1451 goto out_drop_write;
1429 } 1452 }
1430 1453
1431 src_inode = src_file->f_path.dentry->d_inode; 1454 src_inode = src.file->f_path.dentry->d_inode;
1432 if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) { 1455 if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
1433 printk(KERN_INFO "btrfs: Snapshot src from " 1456 printk(KERN_INFO "btrfs: Snapshot src from "
1434 "another FS\n"); 1457 "another FS\n");
1435 ret = -EINVAL; 1458 ret = -EINVAL;
1436 fput(src_file); 1459 } else {
1437 goto out_drop_write; 1460 ret = btrfs_mksubvol(&file->f_path, name, namelen,
1461 BTRFS_I(src_inode)->root,
1462 transid, readonly, inherit);
1438 } 1463 }
1439 ret = btrfs_mksubvol(&file->f_path, name, namelen, 1464 fdput(src);
1440 BTRFS_I(src_inode)->root,
1441 transid, readonly, inherit);
1442 fput(src_file);
1443 } 1465 }
1444out_drop_write: 1466out_drop_write:
1445 mnt_drop_write_file(file); 1467 mnt_drop_write_file(file);
@@ -2341,7 +2363,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2341{ 2363{
2342 struct inode *inode = fdentry(file)->d_inode; 2364 struct inode *inode = fdentry(file)->d_inode;
2343 struct btrfs_root *root = BTRFS_I(inode)->root; 2365 struct btrfs_root *root = BTRFS_I(inode)->root;
2344 struct file *src_file; 2366 struct fd src_file;
2345 struct inode *src; 2367 struct inode *src;
2346 struct btrfs_trans_handle *trans; 2368 struct btrfs_trans_handle *trans;
2347 struct btrfs_path *path; 2369 struct btrfs_path *path;
@@ -2353,7 +2375,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2353 int ret; 2375 int ret;
2354 u64 len = olen; 2376 u64 len = olen;
2355 u64 bs = root->fs_info->sb->s_blocksize; 2377 u64 bs = root->fs_info->sb->s_blocksize;
2356 u64 hint_byte;
2357 2378
2358 /* 2379 /*
2359 * TODO: 2380 * TODO:
@@ -2376,24 +2397,24 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2376 if (ret) 2397 if (ret)
2377 return ret; 2398 return ret;
2378 2399
2379 src_file = fget(srcfd); 2400 src_file = fdget(srcfd);
2380 if (!src_file) { 2401 if (!src_file.file) {
2381 ret = -EBADF; 2402 ret = -EBADF;
2382 goto out_drop_write; 2403 goto out_drop_write;
2383 } 2404 }
2384 2405
2385 ret = -EXDEV; 2406 ret = -EXDEV;
2386 if (src_file->f_path.mnt != file->f_path.mnt) 2407 if (src_file.file->f_path.mnt != file->f_path.mnt)
2387 goto out_fput; 2408 goto out_fput;
2388 2409
2389 src = src_file->f_dentry->d_inode; 2410 src = src_file.file->f_dentry->d_inode;
2390 2411
2391 ret = -EINVAL; 2412 ret = -EINVAL;
2392 if (src == inode) 2413 if (src == inode)
2393 goto out_fput; 2414 goto out_fput;
2394 2415
2395 /* the src must be open for reading */ 2416 /* the src must be open for reading */
2396 if (!(src_file->f_mode & FMODE_READ)) 2417 if (!(src_file.file->f_mode & FMODE_READ))
2397 goto out_fput; 2418 goto out_fput;
2398 2419
2399 /* don't make the dst file partly checksummed */ 2420 /* don't make the dst file partly checksummed */
@@ -2458,13 +2479,13 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2458 another, and lock file content */ 2479 another, and lock file content */
2459 while (1) { 2480 while (1) {
2460 struct btrfs_ordered_extent *ordered; 2481 struct btrfs_ordered_extent *ordered;
2461 lock_extent(&BTRFS_I(src)->io_tree, off, off+len); 2482 lock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
2462 ordered = btrfs_lookup_first_ordered_extent(src, off+len); 2483 ordered = btrfs_lookup_first_ordered_extent(src, off + len - 1);
2463 if (!ordered && 2484 if (!ordered &&
2464 !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len, 2485 !test_range_bit(&BTRFS_I(src)->io_tree, off, off + len - 1,
2465 EXTENT_DELALLOC, 0, NULL)) 2486 EXTENT_DELALLOC, 0, NULL))
2466 break; 2487 break;
2467 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); 2488 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
2468 if (ordered) 2489 if (ordered)
2469 btrfs_put_ordered_extent(ordered); 2490 btrfs_put_ordered_extent(ordered);
2470 btrfs_wait_ordered_range(src, off, len); 2491 btrfs_wait_ordered_range(src, off, len);
@@ -2538,7 +2559,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2538 btrfs_release_path(path); 2559 btrfs_release_path(path);
2539 2560
2540 if (key.offset + datal <= off || 2561 if (key.offset + datal <= off ||
2541 key.offset >= off+len) 2562 key.offset >= off + len - 1)
2542 goto next; 2563 goto next;
2543 2564
2544 memcpy(&new_key, &key, sizeof(new_key)); 2565 memcpy(&new_key, &key, sizeof(new_key));
@@ -2576,10 +2597,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2576 datal -= off - key.offset; 2597 datal -= off - key.offset;
2577 } 2598 }
2578 2599
2579 ret = btrfs_drop_extents(trans, inode, 2600 ret = btrfs_drop_extents(trans, root, inode,
2580 new_key.offset, 2601 new_key.offset,
2581 new_key.offset + datal, 2602 new_key.offset + datal,
2582 &hint_byte, 1); 2603 1);
2583 if (ret) { 2604 if (ret) {
2584 btrfs_abort_transaction(trans, root, 2605 btrfs_abort_transaction(trans, root,
2585 ret); 2606 ret);
@@ -2639,8 +2660,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2639 new_key.offset += skip; 2660 new_key.offset += skip;
2640 } 2661 }
2641 2662
2642 if (key.offset + datal > off+len) 2663 if (key.offset + datal > off + len)
2643 trim = key.offset + datal - (off+len); 2664 trim = key.offset + datal - (off + len);
2644 2665
2645 if (comp && (skip || trim)) { 2666 if (comp && (skip || trim)) {
2646 ret = -EINVAL; 2667 ret = -EINVAL;
@@ -2650,10 +2671,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2650 size -= skip + trim; 2671 size -= skip + trim;
2651 datal -= skip + trim; 2672 datal -= skip + trim;
2652 2673
2653 ret = btrfs_drop_extents(trans, inode, 2674 ret = btrfs_drop_extents(trans, root, inode,
2654 new_key.offset, 2675 new_key.offset,
2655 new_key.offset + datal, 2676 new_key.offset + datal,
2656 &hint_byte, 1); 2677 1);
2657 if (ret) { 2678 if (ret) {
2658 btrfs_abort_transaction(trans, root, 2679 btrfs_abort_transaction(trans, root,
2659 ret); 2680 ret);
@@ -2717,14 +2738,14 @@ next:
2717 ret = 0; 2738 ret = 0;
2718out: 2739out:
2719 btrfs_release_path(path); 2740 btrfs_release_path(path);
2720 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); 2741 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
2721out_unlock: 2742out_unlock:
2722 mutex_unlock(&src->i_mutex); 2743 mutex_unlock(&src->i_mutex);
2723 mutex_unlock(&inode->i_mutex); 2744 mutex_unlock(&inode->i_mutex);
2724 vfree(buf); 2745 vfree(buf);
2725 btrfs_free_path(path); 2746 btrfs_free_path(path);
2726out_fput: 2747out_fput:
2727 fput(src_file); 2748 fdput(src_file);
2728out_drop_write: 2749out_drop_write:
2729 mnt_drop_write_file(file); 2750 mnt_drop_write_file(file);
2730 return ret; 2751 return ret;
@@ -2852,8 +2873,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2852 return 0; 2873 return 0;
2853} 2874}
2854 2875
2855static void get_block_group_info(struct list_head *groups_list, 2876void btrfs_get_block_group_info(struct list_head *groups_list,
2856 struct btrfs_ioctl_space_info *space) 2877 struct btrfs_ioctl_space_info *space)
2857{ 2878{
2858 struct btrfs_block_group_cache *block_group; 2879 struct btrfs_block_group_cache *block_group;
2859 2880
@@ -2961,8 +2982,8 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2961 down_read(&info->groups_sem); 2982 down_read(&info->groups_sem);
2962 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { 2983 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
2963 if (!list_empty(&info->block_groups[c])) { 2984 if (!list_empty(&info->block_groups[c])) {
2964 get_block_group_info(&info->block_groups[c], 2985 btrfs_get_block_group_info(
2965 &space); 2986 &info->block_groups[c], &space);
2966 memcpy(dest, &space, sizeof(space)); 2987 memcpy(dest, &space, sizeof(space));
2967 dest++; 2988 dest++;
2968 space_args.total_spaces++; 2989 space_args.total_spaces++;
@@ -3210,11 +3231,9 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3210{ 3231{
3211 int ret = 0; 3232 int ret = 0;
3212 int size; 3233 int size;
3213 u64 extent_item_pos;
3214 struct btrfs_ioctl_logical_ino_args *loi; 3234 struct btrfs_ioctl_logical_ino_args *loi;
3215 struct btrfs_data_container *inodes = NULL; 3235 struct btrfs_data_container *inodes = NULL;
3216 struct btrfs_path *path = NULL; 3236 struct btrfs_path *path = NULL;
3217 struct btrfs_key key;
3218 3237
3219 if (!capable(CAP_SYS_ADMIN)) 3238 if (!capable(CAP_SYS_ADMIN))
3220 return -EPERM; 3239 return -EPERM;
@@ -3232,7 +3251,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3232 goto out; 3251 goto out;
3233 } 3252 }
3234 3253
3235 size = min_t(u32, loi->size, 4096); 3254 size = min_t(u32, loi->size, 64 * 1024);
3236 inodes = init_data_container(size); 3255 inodes = init_data_container(size);
3237 if (IS_ERR(inodes)) { 3256 if (IS_ERR(inodes)) {
3238 ret = PTR_ERR(inodes); 3257 ret = PTR_ERR(inodes);
@@ -3240,22 +3259,13 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3240 goto out; 3259 goto out;
3241 } 3260 }
3242 3261
3243 ret = extent_from_logical(root->fs_info, loi->logical, path, &key); 3262 ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path,
3244 btrfs_release_path(path); 3263 build_ino_list, inodes);
3245 3264 if (ret == -EINVAL)
3246 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
3247 ret = -ENOENT; 3265 ret = -ENOENT;
3248 if (ret < 0) 3266 if (ret < 0)
3249 goto out; 3267 goto out;
3250 3268
3251 extent_item_pos = loi->logical - key.objectid;
3252 ret = iterate_extent_inodes(root->fs_info, key.objectid,
3253 extent_item_pos, 0, build_ino_list,
3254 inodes);
3255
3256 if (ret < 0)
3257 goto out;
3258
3259 ret = copy_to_user((void *)(unsigned long)loi->inodes, 3269 ret = copy_to_user((void *)(unsigned long)loi->inodes,
3260 (void *)(unsigned long)inodes, size); 3270 (void *)(unsigned long)inodes, size);
3261 if (ret) 3271 if (ret)
@@ -3263,7 +3273,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3263 3273
3264out: 3274out:
3265 btrfs_free_path(path); 3275 btrfs_free_path(path);
3266 kfree(inodes); 3276 vfree(inodes);
3267 kfree(loi); 3277 kfree(loi);
3268 3278
3269 return ret; 3279 return ret;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 051c7fe551dd..7772f02ba28e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -25,6 +25,8 @@
25#include "btrfs_inode.h" 25#include "btrfs_inode.h"
26#include "extent_io.h" 26#include "extent_io.h"
27 27
28static struct kmem_cache *btrfs_ordered_extent_cache;
29
28static u64 entry_end(struct btrfs_ordered_extent *entry) 30static u64 entry_end(struct btrfs_ordered_extent *entry)
29{ 31{
30 if (entry->file_offset + entry->len < entry->file_offset) 32 if (entry->file_offset + entry->len < entry->file_offset)
@@ -187,7 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
187 struct btrfs_ordered_extent *entry; 189 struct btrfs_ordered_extent *entry;
188 190
189 tree = &BTRFS_I(inode)->ordered_tree; 191 tree = &BTRFS_I(inode)->ordered_tree;
190 entry = kzalloc(sizeof(*entry), GFP_NOFS); 192 entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
191 if (!entry) 193 if (!entry)
192 return -ENOMEM; 194 return -ENOMEM;
193 195
@@ -421,7 +423,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
421 list_del(&sum->list); 423 list_del(&sum->list);
422 kfree(sum); 424 kfree(sum);
423 } 425 }
424 kfree(entry); 426 kmem_cache_free(btrfs_ordered_extent_cache, entry);
425 } 427 }
426} 428}
427 429
@@ -466,8 +468,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
466 * wait for all the ordered extents in a root. This is done when balancing 468 * wait for all the ordered extents in a root. This is done when balancing
467 * space between drives. 469 * space between drives.
468 */ 470 */
469void btrfs_wait_ordered_extents(struct btrfs_root *root, 471void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
470 int nocow_only, int delay_iput)
471{ 472{
472 struct list_head splice; 473 struct list_head splice;
473 struct list_head *cur; 474 struct list_head *cur;
@@ -482,15 +483,6 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root,
482 cur = splice.next; 483 cur = splice.next;
483 ordered = list_entry(cur, struct btrfs_ordered_extent, 484 ordered = list_entry(cur, struct btrfs_ordered_extent,
484 root_extent_list); 485 root_extent_list);
485 if (nocow_only &&
486 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
487 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
488 list_move(&ordered->root_extent_list,
489 &root->fs_info->ordered_extents);
490 cond_resched_lock(&root->fs_info->ordered_extent_lock);
491 continue;
492 }
493
494 list_del_init(&ordered->root_extent_list); 486 list_del_init(&ordered->root_extent_list);
495 atomic_inc(&ordered->refs); 487 atomic_inc(&ordered->refs);
496 488
@@ -775,7 +767,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
775 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 767 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
776 u64 disk_i_size; 768 u64 disk_i_size;
777 u64 new_i_size; 769 u64 new_i_size;
778 u64 i_size_test;
779 u64 i_size = i_size_read(inode); 770 u64 i_size = i_size_read(inode);
780 struct rb_node *node; 771 struct rb_node *node;
781 struct rb_node *prev = NULL; 772 struct rb_node *prev = NULL;
@@ -835,55 +826,30 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
835 break; 826 break;
836 if (test->file_offset >= i_size) 827 if (test->file_offset >= i_size)
837 break; 828 break;
838 if (test->file_offset >= disk_i_size) 829 if (test->file_offset >= disk_i_size) {
830 /*
831 * we don't update disk_i_size now, so record this
832 * undealt i_size. Or we will not know the real
833 * i_size.
834 */
835 if (test->outstanding_isize < offset)
836 test->outstanding_isize = offset;
837 if (ordered &&
838 ordered->outstanding_isize >
839 test->outstanding_isize)
840 test->outstanding_isize =
841 ordered->outstanding_isize;
839 goto out; 842 goto out;
840 }
841 new_i_size = min_t(u64, offset, i_size);
842
843 /*
844 * at this point, we know we can safely update i_size to at least
845 * the offset from this ordered extent. But, we need to
846 * walk forward and see if ios from higher up in the file have
847 * finished.
848 */
849 if (ordered) {
850 node = rb_next(&ordered->rb_node);
851 } else {
852 if (prev)
853 node = rb_next(prev);
854 else
855 node = rb_first(&tree->tree);
856 }
857
858 /*
859 * We are looking for an area between our current extent and the next
860 * ordered extent to update the i_size to. There are 3 cases here
861 *
862 * 1) We don't actually have anything and we can update to i_size.
863 * 2) We have stuff but they already did their i_size update so again we
864 * can just update to i_size.
865 * 3) We have an outstanding ordered extent so the most we can update
866 * our disk_i_size to is the start of the next offset.
867 */
868 i_size_test = i_size;
869 for (; node; node = rb_next(node)) {
870 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
871
872 if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
873 continue;
874 if (test->file_offset > offset) {
875 i_size_test = test->file_offset;
876 break;
877 } 843 }
878 } 844 }
845 new_i_size = min_t(u64, offset, i_size);
879 846
880 /* 847 /*
881 * i_size_test is the end of a region after this ordered 848 * Some ordered extents may completed before the current one, and
882 * extent where there are no ordered extents, we can safely set 849 * we hold the real i_size in ->outstanding_isize.
883 * disk_i_size to this.
884 */ 850 */
885 if (i_size_test > offset) 851 if (ordered && ordered->outstanding_isize > new_i_size)
886 new_i_size = min_t(u64, i_size_test, i_size); 852 new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
887 BTRFS_I(inode)->disk_i_size = new_i_size; 853 BTRFS_I(inode)->disk_i_size = new_i_size;
888 ret = 0; 854 ret = 0;
889out: 855out:
@@ -984,3 +950,20 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
984 } 950 }
985 spin_unlock(&root->fs_info->ordered_extent_lock); 951 spin_unlock(&root->fs_info->ordered_extent_lock);
986} 952}
953
954int __init ordered_data_init(void)
955{
956 btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
957 sizeof(struct btrfs_ordered_extent), 0,
958 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
959 NULL);
960 if (!btrfs_ordered_extent_cache)
961 return -ENOMEM;
962 return 0;
963}
964
965void ordered_data_exit(void)
966{
967 if (btrfs_ordered_extent_cache)
968 kmem_cache_destroy(btrfs_ordered_extent_cache);
969}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e03c560d2997..dd27a0b46a37 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -96,6 +96,13 @@ struct btrfs_ordered_extent {
96 /* number of bytes that still need writing */ 96 /* number of bytes that still need writing */
97 u64 bytes_left; 97 u64 bytes_left;
98 98
99 /*
100 * the end of the ordered extent which is behind it but
101 * didn't update disk_i_size. Please see the comment of
102 * btrfs_ordered_update_i_size();
103 */
104 u64 outstanding_isize;
105
99 /* flags (described above) */ 106 /* flags (described above) */
100 unsigned long flags; 107 unsigned long flags;
101 108
@@ -183,6 +190,7 @@ void btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
183void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 190void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
184 struct btrfs_root *root, 191 struct btrfs_root *root,
185 struct inode *inode); 192 struct inode *inode);
186void btrfs_wait_ordered_extents(struct btrfs_root *root, 193void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
187 int nocow_only, int delay_iput); 194int __init ordered_data_init(void);
195void ordered_data_exit(void);
188#endif 196#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 38b42e7bc91d..5039686df6ae 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1145,12 +1145,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1145 1145
1146 ulist_reinit(tmp); 1146 ulist_reinit(tmp);
1147 /* XXX id not needed */ 1147 /* XXX id not needed */
1148 ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC); 1148 ulist_add(tmp, qg->qgroupid, (u64)(uintptr_t)qg, GFP_ATOMIC);
1149 ULIST_ITER_INIT(&tmp_uiter); 1149 ULIST_ITER_INIT(&tmp_uiter);
1150 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { 1150 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
1151 struct btrfs_qgroup_list *glist; 1151 struct btrfs_qgroup_list *glist;
1152 1152
1153 qg = (struct btrfs_qgroup *)tmp_unode->aux; 1153 qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
1154 if (qg->refcnt < seq) 1154 if (qg->refcnt < seq)
1155 qg->refcnt = seq + 1; 1155 qg->refcnt = seq + 1;
1156 else 1156 else
@@ -1158,7 +1158,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1158 1158
1159 list_for_each_entry(glist, &qg->groups, next_group) { 1159 list_for_each_entry(glist, &qg->groups, next_group) {
1160 ulist_add(tmp, glist->group->qgroupid, 1160 ulist_add(tmp, glist->group->qgroupid,
1161 (unsigned long)glist->group, 1161 (u64)(uintptr_t)glist->group,
1162 GFP_ATOMIC); 1162 GFP_ATOMIC);
1163 } 1163 }
1164 } 1164 }
@@ -1168,13 +1168,13 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1168 * step 2: walk from the new root 1168 * step 2: walk from the new root
1169 */ 1169 */
1170 ulist_reinit(tmp); 1170 ulist_reinit(tmp);
1171 ulist_add(tmp, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); 1171 ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
1172 ULIST_ITER_INIT(&uiter); 1172 ULIST_ITER_INIT(&uiter);
1173 while ((unode = ulist_next(tmp, &uiter))) { 1173 while ((unode = ulist_next(tmp, &uiter))) {
1174 struct btrfs_qgroup *qg; 1174 struct btrfs_qgroup *qg;
1175 struct btrfs_qgroup_list *glist; 1175 struct btrfs_qgroup_list *glist;
1176 1176
1177 qg = (struct btrfs_qgroup *)unode->aux; 1177 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
1178 if (qg->refcnt < seq) { 1178 if (qg->refcnt < seq) {
1179 /* not visited by step 1 */ 1179 /* not visited by step 1 */
1180 qg->rfer += sgn * node->num_bytes; 1180 qg->rfer += sgn * node->num_bytes;
@@ -1190,7 +1190,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1190 1190
1191 list_for_each_entry(glist, &qg->groups, next_group) { 1191 list_for_each_entry(glist, &qg->groups, next_group) {
1192 ulist_add(tmp, glist->group->qgroupid, 1192 ulist_add(tmp, glist->group->qgroupid,
1193 (unsigned long)glist->group, GFP_ATOMIC); 1193 (uintptr_t)glist->group, GFP_ATOMIC);
1194 } 1194 }
1195 } 1195 }
1196 1196
@@ -1208,12 +1208,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1208 continue; 1208 continue;
1209 1209
1210 ulist_reinit(tmp); 1210 ulist_reinit(tmp);
1211 ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC); 1211 ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC);
1212 ULIST_ITER_INIT(&tmp_uiter); 1212 ULIST_ITER_INIT(&tmp_uiter);
1213 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { 1213 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
1214 struct btrfs_qgroup_list *glist; 1214 struct btrfs_qgroup_list *glist;
1215 1215
1216 qg = (struct btrfs_qgroup *)tmp_unode->aux; 1216 qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
1217 if (qg->tag == seq) 1217 if (qg->tag == seq)
1218 continue; 1218 continue;
1219 1219
@@ -1225,7 +1225,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1225 1225
1226 list_for_each_entry(glist, &qg->groups, next_group) { 1226 list_for_each_entry(glist, &qg->groups, next_group) {
1227 ulist_add(tmp, glist->group->qgroupid, 1227 ulist_add(tmp, glist->group->qgroupid,
1228 (unsigned long)glist->group, 1228 (uintptr_t)glist->group,
1229 GFP_ATOMIC); 1229 GFP_ATOMIC);
1230 } 1230 }
1231 } 1231 }
@@ -1371,10 +1371,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
1371 1371
1372 if (srcid) { 1372 if (srcid) {
1373 srcgroup = find_qgroup_rb(fs_info, srcid); 1373 srcgroup = find_qgroup_rb(fs_info, srcid);
1374 if (!srcgroup) { 1374 if (!srcgroup)
1375 ret = -EINVAL;
1376 goto unlock; 1375 goto unlock;
1377 }
1378 dstgroup->rfer = srcgroup->rfer - level_size; 1376 dstgroup->rfer = srcgroup->rfer - level_size;
1379 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size; 1377 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size;
1380 srcgroup->excl = level_size; 1378 srcgroup->excl = level_size;
@@ -1383,10 +1381,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
1383 qgroup_dirty(fs_info, srcgroup); 1381 qgroup_dirty(fs_info, srcgroup);
1384 } 1382 }
1385 1383
1386 if (!inherit) { 1384 if (!inherit)
1387 ret = -EINVAL;
1388 goto unlock; 1385 goto unlock;
1389 }
1390 1386
1391 i_qgroups = (u64 *)(inherit + 1); 1387 i_qgroups = (u64 *)(inherit + 1);
1392 for (i = 0; i < inherit->num_qgroups; ++i) { 1388 for (i = 0; i < inherit->num_qgroups; ++i) {
@@ -1473,13 +1469,17 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1473 * be exceeded 1469 * be exceeded
1474 */ 1470 */
1475 ulist = ulist_alloc(GFP_ATOMIC); 1471 ulist = ulist_alloc(GFP_ATOMIC);
1476 ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); 1472 if (!ulist) {
1473 ret = -ENOMEM;
1474 goto out;
1475 }
1476 ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
1477 ULIST_ITER_INIT(&uiter); 1477 ULIST_ITER_INIT(&uiter);
1478 while ((unode = ulist_next(ulist, &uiter))) { 1478 while ((unode = ulist_next(ulist, &uiter))) {
1479 struct btrfs_qgroup *qg; 1479 struct btrfs_qgroup *qg;
1480 struct btrfs_qgroup_list *glist; 1480 struct btrfs_qgroup_list *glist;
1481 1481
1482 qg = (struct btrfs_qgroup *)unode->aux; 1482 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
1483 1483
1484 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && 1484 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
1485 qg->reserved + qg->rfer + num_bytes > 1485 qg->reserved + qg->rfer + num_bytes >
@@ -1493,7 +1493,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1493 1493
1494 list_for_each_entry(glist, &qg->groups, next_group) { 1494 list_for_each_entry(glist, &qg->groups, next_group) {
1495 ulist_add(ulist, glist->group->qgroupid, 1495 ulist_add(ulist, glist->group->qgroupid,
1496 (unsigned long)glist->group, GFP_ATOMIC); 1496 (uintptr_t)glist->group, GFP_ATOMIC);
1497 } 1497 }
1498 } 1498 }
1499 if (ret) 1499 if (ret)
@@ -1506,7 +1506,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1506 while ((unode = ulist_next(ulist, &uiter))) { 1506 while ((unode = ulist_next(ulist, &uiter))) {
1507 struct btrfs_qgroup *qg; 1507 struct btrfs_qgroup *qg;
1508 1508
1509 qg = (struct btrfs_qgroup *)unode->aux; 1509 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
1510 1510
1511 qg->reserved += num_bytes; 1511 qg->reserved += num_bytes;
1512 } 1512 }
@@ -1545,19 +1545,23 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1545 goto out; 1545 goto out;
1546 1546
1547 ulist = ulist_alloc(GFP_ATOMIC); 1547 ulist = ulist_alloc(GFP_ATOMIC);
1548 ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); 1548 if (!ulist) {
1549 btrfs_std_error(fs_info, -ENOMEM);
1550 goto out;
1551 }
1552 ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
1549 ULIST_ITER_INIT(&uiter); 1553 ULIST_ITER_INIT(&uiter);
1550 while ((unode = ulist_next(ulist, &uiter))) { 1554 while ((unode = ulist_next(ulist, &uiter))) {
1551 struct btrfs_qgroup *qg; 1555 struct btrfs_qgroup *qg;
1552 struct btrfs_qgroup_list *glist; 1556 struct btrfs_qgroup_list *glist;
1553 1557
1554 qg = (struct btrfs_qgroup *)unode->aux; 1558 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
1555 1559
1556 qg->reserved -= num_bytes; 1560 qg->reserved -= num_bytes;
1557 1561
1558 list_for_each_entry(glist, &qg->groups, next_group) { 1562 list_for_each_entry(glist, &qg->groups, next_group) {
1559 ulist_add(ulist, glist->group->qgroupid, 1563 ulist_add(ulist, glist->group->qgroupid,
1560 (unsigned long)glist->group, GFP_ATOMIC); 1564 (uintptr_t)glist->group, GFP_ATOMIC);
1561 } 1565 }
1562 } 1566 }
1563 1567
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 48a4882d8ad5..a955669519a2 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -68,7 +68,7 @@ struct reada_extent {
68 u32 blocksize; 68 u32 blocksize;
69 int err; 69 int err;
70 struct list_head extctl; 70 struct list_head extctl;
71 struct kref refcnt; 71 int refcnt;
72 spinlock_t lock; 72 spinlock_t lock;
73 struct reada_zone *zones[BTRFS_MAX_MIRRORS]; 73 struct reada_zone *zones[BTRFS_MAX_MIRRORS];
74 int nzones; 74 int nzones;
@@ -126,7 +126,7 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
126 spin_lock(&fs_info->reada_lock); 126 spin_lock(&fs_info->reada_lock);
127 re = radix_tree_lookup(&fs_info->reada_tree, index); 127 re = radix_tree_lookup(&fs_info->reada_tree, index);
128 if (re) 128 if (re)
129 kref_get(&re->refcnt); 129 re->refcnt++;
130 spin_unlock(&fs_info->reada_lock); 130 spin_unlock(&fs_info->reada_lock);
131 131
132 if (!re) 132 if (!re)
@@ -336,7 +336,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
336 spin_lock(&fs_info->reada_lock); 336 spin_lock(&fs_info->reada_lock);
337 re = radix_tree_lookup(&fs_info->reada_tree, index); 337 re = radix_tree_lookup(&fs_info->reada_tree, index);
338 if (re) 338 if (re)
339 kref_get(&re->refcnt); 339 re->refcnt++;
340 spin_unlock(&fs_info->reada_lock); 340 spin_unlock(&fs_info->reada_lock);
341 341
342 if (re) 342 if (re)
@@ -352,7 +352,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
352 re->top = *top; 352 re->top = *top;
353 INIT_LIST_HEAD(&re->extctl); 353 INIT_LIST_HEAD(&re->extctl);
354 spin_lock_init(&re->lock); 354 spin_lock_init(&re->lock);
355 kref_init(&re->refcnt); 355 re->refcnt = 1;
356 356
357 /* 357 /*
358 * map block 358 * map block
@@ -398,7 +398,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
398 if (ret == -EEXIST) { 398 if (ret == -EEXIST) {
399 re_exist = radix_tree_lookup(&fs_info->reada_tree, index); 399 re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
400 BUG_ON(!re_exist); 400 BUG_ON(!re_exist);
401 kref_get(&re_exist->refcnt); 401 re_exist->refcnt++;
402 spin_unlock(&fs_info->reada_lock); 402 spin_unlock(&fs_info->reada_lock);
403 goto error; 403 goto error;
404 } 404 }
@@ -465,10 +465,6 @@ error:
465 return re_exist; 465 return re_exist;
466} 466}
467 467
468static void reada_kref_dummy(struct kref *kr)
469{
470}
471
472static void reada_extent_put(struct btrfs_fs_info *fs_info, 468static void reada_extent_put(struct btrfs_fs_info *fs_info,
473 struct reada_extent *re) 469 struct reada_extent *re)
474{ 470{
@@ -476,7 +472,7 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
476 unsigned long index = re->logical >> PAGE_CACHE_SHIFT; 472 unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
477 473
478 spin_lock(&fs_info->reada_lock); 474 spin_lock(&fs_info->reada_lock);
479 if (!kref_put(&re->refcnt, reada_kref_dummy)) { 475 if (--re->refcnt) {
480 spin_unlock(&fs_info->reada_lock); 476 spin_unlock(&fs_info->reada_lock);
481 return; 477 return;
482 } 478 }
@@ -671,7 +667,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
671 return 0; 667 return 0;
672 } 668 }
673 dev->reada_next = re->logical + re->blocksize; 669 dev->reada_next = re->logical + re->blocksize;
674 kref_get(&re->refcnt); 670 re->refcnt++;
675 671
676 spin_unlock(&fs_info->reada_lock); 672 spin_unlock(&fs_info->reada_lock);
677 673
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4da08652004d..776f0aa128fc 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3270,8 +3270,8 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3270 key.offset = 0; 3270 key.offset = 0;
3271 3271
3272 inode = btrfs_iget(fs_info->sb, &key, root, NULL); 3272 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3273 if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) { 3273 if (IS_ERR(inode) || is_bad_inode(inode)) {
3274 if (inode && !IS_ERR(inode)) 3274 if (!IS_ERR(inode))
3275 iput(inode); 3275 iput(inode);
3276 return -ENOENT; 3276 return -ENOENT;
3277 } 3277 }
@@ -3621,7 +3621,7 @@ next:
3621 3621
3622 ret = find_first_extent_bit(&rc->processed_blocks, 3622 ret = find_first_extent_bit(&rc->processed_blocks,
3623 key.objectid, &start, &end, 3623 key.objectid, &start, &end,
3624 EXTENT_DIRTY); 3624 EXTENT_DIRTY, NULL);
3625 3625
3626 if (ret == 0 && start <= key.objectid) { 3626 if (ret == 0 && start <= key.objectid) {
3627 btrfs_release_path(path); 3627 btrfs_release_path(path);
@@ -3674,7 +3674,8 @@ int prepare_to_relocate(struct reloc_control *rc)
3674 struct btrfs_trans_handle *trans; 3674 struct btrfs_trans_handle *trans;
3675 int ret; 3675 int ret;
3676 3676
3677 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root); 3677 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
3678 BTRFS_BLOCK_RSV_TEMP);
3678 if (!rc->block_rsv) 3679 if (!rc->block_rsv)
3679 return -ENOMEM; 3680 return -ENOMEM;
3680 3681
@@ -4057,7 +4058,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4057 (unsigned long long)rc->block_group->flags); 4058 (unsigned long long)rc->block_group->flags);
4058 4059
4059 btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4060 btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
4060 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); 4061 btrfs_wait_ordered_extents(fs_info->tree_root, 0);
4061 4062
4062 while (1) { 4063 while (1) {
4063 mutex_lock(&fs_info->cleaner_mutex); 4064 mutex_lock(&fs_info->cleaner_mutex);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 10d8e4d88071..eb923d087da7 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -141,8 +141,10 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
141 return -ENOMEM; 141 return -ENOMEM;
142 142
143 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 143 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
144 if (ret < 0) 144 if (ret < 0) {
145 goto out_abort; 145 btrfs_abort_transaction(trans, root, ret);
146 goto out;
147 }
146 148
147 if (ret != 0) { 149 if (ret != 0) {
148 btrfs_print_leaf(root, path->nodes[0]); 150 btrfs_print_leaf(root, path->nodes[0]);
@@ -166,16 +168,23 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
166 btrfs_release_path(path); 168 btrfs_release_path(path);
167 ret = btrfs_search_slot(trans, root, key, path, 169 ret = btrfs_search_slot(trans, root, key, path,
168 -1, 1); 170 -1, 1);
169 if (ret < 0) 171 if (ret < 0) {
170 goto out_abort; 172 btrfs_abort_transaction(trans, root, ret);
173 goto out;
174 }
175
171 ret = btrfs_del_item(trans, root, path); 176 ret = btrfs_del_item(trans, root, path);
172 if (ret < 0) 177 if (ret < 0) {
173 goto out_abort; 178 btrfs_abort_transaction(trans, root, ret);
179 goto out;
180 }
174 btrfs_release_path(path); 181 btrfs_release_path(path);
175 ret = btrfs_insert_empty_item(trans, root, path, 182 ret = btrfs_insert_empty_item(trans, root, path,
176 key, sizeof(*item)); 183 key, sizeof(*item));
177 if (ret < 0) 184 if (ret < 0) {
178 goto out_abort; 185 btrfs_abort_transaction(trans, root, ret);
186 goto out;
187 }
179 l = path->nodes[0]; 188 l = path->nodes[0];
180 slot = path->slots[0]; 189 slot = path->slots[0];
181 ptr = btrfs_item_ptr_offset(l, slot); 190 ptr = btrfs_item_ptr_offset(l, slot);
@@ -192,10 +201,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
192out: 201out:
193 btrfs_free_path(path); 202 btrfs_free_path(path);
194 return ret; 203 return ret;
195
196out_abort:
197 btrfs_abort_transaction(trans, root, ret);
198 goto out;
199} 204}
200 205
201int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, 206int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b223620cd5a6..27892f67e69b 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -352,13 +352,14 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
352 struct extent_buffer *eb; 352 struct extent_buffer *eb;
353 struct btrfs_extent_item *ei; 353 struct btrfs_extent_item *ei;
354 struct scrub_warning swarn; 354 struct scrub_warning swarn;
355 u32 item_size; 355 unsigned long ptr = 0;
356 int ret; 356 u64 extent_item_pos;
357 u64 flags = 0;
357 u64 ref_root; 358 u64 ref_root;
359 u32 item_size;
358 u8 ref_level; 360 u8 ref_level;
359 unsigned long ptr = 0;
360 const int bufsize = 4096; 361 const int bufsize = 4096;
361 u64 extent_item_pos; 362 int ret;
362 363
363 path = btrfs_alloc_path(); 364 path = btrfs_alloc_path();
364 365
@@ -375,7 +376,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
375 if (!path || !swarn.scratch_buf || !swarn.msg_buf) 376 if (!path || !swarn.scratch_buf || !swarn.msg_buf)
376 goto out; 377 goto out;
377 378
378 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key); 379 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
380 &flags);
379 if (ret < 0) 381 if (ret < 0)
380 goto out; 382 goto out;
381 383
@@ -387,7 +389,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
387 item_size = btrfs_item_size_nr(eb, path->slots[0]); 389 item_size = btrfs_item_size_nr(eb, path->slots[0]);
388 btrfs_release_path(path); 390 btrfs_release_path(path);
389 391
390 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 392 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
391 do { 393 do {
392 ret = tree_backref_for_extent(&ptr, eb, ei, item_size, 394 ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
393 &ref_root, &ref_level); 395 &ref_root, &ref_level);
@@ -1029,6 +1031,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1029 spin_lock(&sdev->stat_lock); 1031 spin_lock(&sdev->stat_lock);
1030 sdev->stat.malloc_errors++; 1032 sdev->stat.malloc_errors++;
1031 spin_unlock(&sdev->stat_lock); 1033 spin_unlock(&sdev->stat_lock);
1034 kfree(bbio);
1032 return -ENOMEM; 1035 return -ENOMEM;
1033 } 1036 }
1034 sblock->page_count++; 1037 sblock->page_count++;
@@ -1666,21 +1669,6 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
1666 scrub_block_put(sblock); 1669 scrub_block_put(sblock);
1667 } 1670 }
1668 1671
1669 if (sbio->err) {
1670 /* what is this good for??? */
1671 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
1672 sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
1673 sbio->bio->bi_phys_segments = 0;
1674 sbio->bio->bi_idx = 0;
1675
1676 for (i = 0; i < sbio->page_count; i++) {
1677 struct bio_vec *bi;
1678 bi = &sbio->bio->bi_io_vec[i];
1679 bi->bv_offset = 0;
1680 bi->bv_len = PAGE_SIZE;
1681 }
1682 }
1683
1684 bio_put(sbio->bio); 1672 bio_put(sbio->bio);
1685 sbio->bio = NULL; 1673 sbio->bio = NULL;
1686 spin_lock(&sdev->list_lock); 1674 spin_lock(&sdev->list_lock);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index fb5ffe95f869..c7beb543a4a8 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -107,7 +107,6 @@ struct send_ctx {
107 int cur_inode_new; 107 int cur_inode_new;
108 int cur_inode_new_gen; 108 int cur_inode_new_gen;
109 int cur_inode_deleted; 109 int cur_inode_deleted;
110 int cur_inode_first_ref_orphan;
111 u64 cur_inode_size; 110 u64 cur_inode_size;
112 u64 cur_inode_mode; 111 u64 cur_inode_mode;
113 112
@@ -126,7 +125,15 @@ struct send_ctx {
126 125
127struct name_cache_entry { 126struct name_cache_entry {
128 struct list_head list; 127 struct list_head list;
129 struct list_head use_list; 128 /*
129 * radix_tree has only 32bit entries but we need to handle 64bit inums.
130 * We use the lower 32bit of the 64bit inum to store it in the tree. If
131 * more then one inum would fall into the same entry, we use radix_list
132 * to store the additional entries. radix_list is also used to store
133 * entries where two entries have the same inum but different
134 * generations.
135 */
136 struct list_head radix_list;
130 u64 ino; 137 u64 ino;
131 u64 gen; 138 u64 gen;
132 u64 parent_ino; 139 u64 parent_ino;
@@ -328,6 +335,7 @@ out:
328 return ret; 335 return ret;
329} 336}
330 337
338#if 0
331static void fs_path_remove(struct fs_path *p) 339static void fs_path_remove(struct fs_path *p)
332{ 340{
333 BUG_ON(p->reversed); 341 BUG_ON(p->reversed);
@@ -335,6 +343,7 @@ static void fs_path_remove(struct fs_path *p)
335 p->end--; 343 p->end--;
336 *p->end = 0; 344 *p->end = 0;
337} 345}
346#endif
338 347
339static int fs_path_copy(struct fs_path *p, struct fs_path *from) 348static int fs_path_copy(struct fs_path *p, struct fs_path *from)
340{ 349{
@@ -377,7 +386,7 @@ static struct btrfs_path *alloc_path_for_send(void)
377 return path; 386 return path;
378} 387}
379 388
380static int write_buf(struct send_ctx *sctx, const void *buf, u32 len) 389int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
381{ 390{
382 int ret; 391 int ret;
383 mm_segment_t old_fs; 392 mm_segment_t old_fs;
@@ -387,8 +396,7 @@ static int write_buf(struct send_ctx *sctx, const void *buf, u32 len)
387 set_fs(KERNEL_DS); 396 set_fs(KERNEL_DS);
388 397
389 while (pos < len) { 398 while (pos < len) {
390 ret = vfs_write(sctx->send_filp, (char *)buf + pos, len - pos, 399 ret = vfs_write(filp, (char *)buf + pos, len - pos, off);
391 &sctx->send_off);
392 /* TODO handle that correctly */ 400 /* TODO handle that correctly */
393 /*if (ret == -ERESTARTSYS) { 401 /*if (ret == -ERESTARTSYS) {
394 continue; 402 continue;
@@ -544,7 +552,8 @@ static int send_header(struct send_ctx *sctx)
544 strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); 552 strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
545 hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION); 553 hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
546 554
547 return write_buf(sctx, &hdr, sizeof(hdr)); 555 return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
556 &sctx->send_off);
548} 557}
549 558
550/* 559/*
@@ -581,7 +590,8 @@ static int send_cmd(struct send_ctx *sctx)
581 crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); 590 crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
582 hdr->crc = cpu_to_le32(crc); 591 hdr->crc = cpu_to_le32(crc);
583 592
584 ret = write_buf(sctx, sctx->send_buf, sctx->send_size); 593 ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
594 &sctx->send_off);
585 595
586 sctx->total_send_size += sctx->send_size; 596 sctx->total_send_size += sctx->send_size;
587 sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size; 597 sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
@@ -687,7 +697,8 @@ out:
687 */ 697 */
688static int get_inode_info(struct btrfs_root *root, 698static int get_inode_info(struct btrfs_root *root,
689 u64 ino, u64 *size, u64 *gen, 699 u64 ino, u64 *size, u64 *gen,
690 u64 *mode, u64 *uid, u64 *gid) 700 u64 *mode, u64 *uid, u64 *gid,
701 u64 *rdev)
691{ 702{
692 int ret; 703 int ret;
693 struct btrfs_inode_item *ii; 704 struct btrfs_inode_item *ii;
@@ -721,6 +732,8 @@ static int get_inode_info(struct btrfs_root *root,
721 *uid = btrfs_inode_uid(path->nodes[0], ii); 732 *uid = btrfs_inode_uid(path->nodes[0], ii);
722 if (gid) 733 if (gid)
723 *gid = btrfs_inode_gid(path->nodes[0], ii); 734 *gid = btrfs_inode_gid(path->nodes[0], ii);
735 if (rdev)
736 *rdev = btrfs_inode_rdev(path->nodes[0], ii);
724 737
725out: 738out:
726 btrfs_free_path(path); 739 btrfs_free_path(path);
@@ -852,7 +865,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
852 struct extent_buffer *eb; 865 struct extent_buffer *eb;
853 struct btrfs_item *item; 866 struct btrfs_item *item;
854 struct btrfs_dir_item *di; 867 struct btrfs_dir_item *di;
855 struct btrfs_path *tmp_path = NULL;
856 struct btrfs_key di_key; 868 struct btrfs_key di_key;
857 char *buf = NULL; 869 char *buf = NULL;
858 char *buf2 = NULL; 870 char *buf2 = NULL;
@@ -874,12 +886,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
874 goto out; 886 goto out;
875 } 887 }
876 888
877 tmp_path = alloc_path_for_send();
878 if (!tmp_path) {
879 ret = -ENOMEM;
880 goto out;
881 }
882
883 eb = path->nodes[0]; 889 eb = path->nodes[0];
884 slot = path->slots[0]; 890 slot = path->slots[0];
885 item = btrfs_item_nr(eb, slot); 891 item = btrfs_item_nr(eb, slot);
@@ -941,7 +947,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
941 } 947 }
942 948
943out: 949out:
944 btrfs_free_path(tmp_path);
945 if (buf_virtual) 950 if (buf_virtual)
946 vfree(buf); 951 vfree(buf);
947 else 952 else
@@ -1026,12 +1031,12 @@ struct backref_ctx {
1026 u64 extent_len; 1031 u64 extent_len;
1027 1032
1028 /* Just to check for bugs in backref resolving */ 1033 /* Just to check for bugs in backref resolving */
1029 int found_in_send_root; 1034 int found_itself;
1030}; 1035};
1031 1036
1032static int __clone_root_cmp_bsearch(const void *key, const void *elt) 1037static int __clone_root_cmp_bsearch(const void *key, const void *elt)
1033{ 1038{
1034 u64 root = (u64)key; 1039 u64 root = (u64)(uintptr_t)key;
1035 struct clone_root *cr = (struct clone_root *)elt; 1040 struct clone_root *cr = (struct clone_root *)elt;
1036 1041
1037 if (root < cr->root->objectid) 1042 if (root < cr->root->objectid)
@@ -1055,6 +1060,7 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)
1055 1060
1056/* 1061/*
1057 * Called for every backref that is found for the current extent. 1062 * Called for every backref that is found for the current extent.
1063 * Results are collected in sctx->clone_roots->ino/offset/found_refs
1058 */ 1064 */
1059static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) 1065static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1060{ 1066{
@@ -1064,7 +1070,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1064 u64 i_size; 1070 u64 i_size;
1065 1071
1066 /* First check if the root is in the list of accepted clone sources */ 1072 /* First check if the root is in the list of accepted clone sources */
1067 found = bsearch((void *)root, bctx->sctx->clone_roots, 1073 found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
1068 bctx->sctx->clone_roots_cnt, 1074 bctx->sctx->clone_roots_cnt,
1069 sizeof(struct clone_root), 1075 sizeof(struct clone_root),
1070 __clone_root_cmp_bsearch); 1076 __clone_root_cmp_bsearch);
@@ -1074,14 +1080,15 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1074 if (found->root == bctx->sctx->send_root && 1080 if (found->root == bctx->sctx->send_root &&
1075 ino == bctx->cur_objectid && 1081 ino == bctx->cur_objectid &&
1076 offset == bctx->cur_offset) { 1082 offset == bctx->cur_offset) {
1077 bctx->found_in_send_root = 1; 1083 bctx->found_itself = 1;
1078 } 1084 }
1079 1085
1080 /* 1086 /*
1081 * There are inodes that have extents that lie behind it's i_size. Don't 1087 * There are inodes that have extents that lie behind its i_size. Don't
1082 * accept clones from these extents. 1088 * accept clones from these extents.
1083 */ 1089 */
1084 ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL); 1090 ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL,
1091 NULL);
1085 if (ret < 0) 1092 if (ret < 0)
1086 return ret; 1093 return ret;
1087 1094
@@ -1101,16 +1108,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1101 */ 1108 */
1102 if (ino >= bctx->cur_objectid) 1109 if (ino >= bctx->cur_objectid)
1103 return 0; 1110 return 0;
1104 /*if (ino > ctx->cur_objectid) 1111#if 0
1112 if (ino > bctx->cur_objectid)
1105 return 0; 1113 return 0;
1106 if (offset + ctx->extent_len > ctx->cur_offset) 1114 if (offset + bctx->extent_len > bctx->cur_offset)
1107 return 0;*/ 1115 return 0;
1108 1116#endif
1109 bctx->found++;
1110 found->found_refs++;
1111 found->ino = ino;
1112 found->offset = offset;
1113 return 0;
1114 } 1117 }
1115 1118
1116 bctx->found++; 1119 bctx->found++;
@@ -1130,6 +1133,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1130} 1133}
1131 1134
1132/* 1135/*
1136 * Given an inode, offset and extent item, it finds a good clone for a clone
1137 * instruction. Returns -ENOENT when none could be found. The function makes
1138 * sure that the returned clone is usable at the point where sending is at the
1139 * moment. This means, that no clones are accepted which lie behind the current
1140 * inode+offset.
1141 *
1133 * path must point to the extent item when called. 1142 * path must point to the extent item when called.
1134 */ 1143 */
1135static int find_extent_clone(struct send_ctx *sctx, 1144static int find_extent_clone(struct send_ctx *sctx,
@@ -1141,20 +1150,29 @@ static int find_extent_clone(struct send_ctx *sctx,
1141 int ret; 1150 int ret;
1142 int extent_type; 1151 int extent_type;
1143 u64 logical; 1152 u64 logical;
1153 u64 disk_byte;
1144 u64 num_bytes; 1154 u64 num_bytes;
1145 u64 extent_item_pos; 1155 u64 extent_item_pos;
1156 u64 flags = 0;
1146 struct btrfs_file_extent_item *fi; 1157 struct btrfs_file_extent_item *fi;
1147 struct extent_buffer *eb = path->nodes[0]; 1158 struct extent_buffer *eb = path->nodes[0];
1148 struct backref_ctx backref_ctx; 1159 struct backref_ctx *backref_ctx = NULL;
1149 struct clone_root *cur_clone_root; 1160 struct clone_root *cur_clone_root;
1150 struct btrfs_key found_key; 1161 struct btrfs_key found_key;
1151 struct btrfs_path *tmp_path; 1162 struct btrfs_path *tmp_path;
1163 int compressed;
1152 u32 i; 1164 u32 i;
1153 1165
1154 tmp_path = alloc_path_for_send(); 1166 tmp_path = alloc_path_for_send();
1155 if (!tmp_path) 1167 if (!tmp_path)
1156 return -ENOMEM; 1168 return -ENOMEM;
1157 1169
1170 backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS);
1171 if (!backref_ctx) {
1172 ret = -ENOMEM;
1173 goto out;
1174 }
1175
1158 if (data_offset >= ino_size) { 1176 if (data_offset >= ino_size) {
1159 /* 1177 /*
1160 * There may be extents that lie behind the file's size. 1178 * There may be extents that lie behind the file's size.
@@ -1172,22 +1190,23 @@ static int find_extent_clone(struct send_ctx *sctx,
1172 ret = -ENOENT; 1190 ret = -ENOENT;
1173 goto out; 1191 goto out;
1174 } 1192 }
1193 compressed = btrfs_file_extent_compression(eb, fi);
1175 1194
1176 num_bytes = btrfs_file_extent_num_bytes(eb, fi); 1195 num_bytes = btrfs_file_extent_num_bytes(eb, fi);
1177 logical = btrfs_file_extent_disk_bytenr(eb, fi); 1196 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
1178 if (logical == 0) { 1197 if (disk_byte == 0) {
1179 ret = -ENOENT; 1198 ret = -ENOENT;
1180 goto out; 1199 goto out;
1181 } 1200 }
1182 logical += btrfs_file_extent_offset(eb, fi); 1201 logical = disk_byte + btrfs_file_extent_offset(eb, fi);
1183 1202
1184 ret = extent_from_logical(sctx->send_root->fs_info, 1203 ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path,
1185 logical, tmp_path, &found_key); 1204 &found_key, &flags);
1186 btrfs_release_path(tmp_path); 1205 btrfs_release_path(tmp_path);
1187 1206
1188 if (ret < 0) 1207 if (ret < 0)
1189 goto out; 1208 goto out;
1190 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 1209 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1191 ret = -EIO; 1210 ret = -EIO;
1192 goto out; 1211 goto out;
1193 } 1212 }
@@ -1202,12 +1221,12 @@ static int find_extent_clone(struct send_ctx *sctx,
1202 cur_clone_root->found_refs = 0; 1221 cur_clone_root->found_refs = 0;
1203 } 1222 }
1204 1223
1205 backref_ctx.sctx = sctx; 1224 backref_ctx->sctx = sctx;
1206 backref_ctx.found = 0; 1225 backref_ctx->found = 0;
1207 backref_ctx.cur_objectid = ino; 1226 backref_ctx->cur_objectid = ino;
1208 backref_ctx.cur_offset = data_offset; 1227 backref_ctx->cur_offset = data_offset;
1209 backref_ctx.found_in_send_root = 0; 1228 backref_ctx->found_itself = 0;
1210 backref_ctx.extent_len = num_bytes; 1229 backref_ctx->extent_len = num_bytes;
1211 1230
1212 /* 1231 /*
1213 * The last extent of a file may be too large due to page alignment. 1232 * The last extent of a file may be too large due to page alignment.
@@ -1215,25 +1234,31 @@ static int find_extent_clone(struct send_ctx *sctx,
1215 * __iterate_backrefs work. 1234 * __iterate_backrefs work.
1216 */ 1235 */
1217 if (data_offset + num_bytes >= ino_size) 1236 if (data_offset + num_bytes >= ino_size)
1218 backref_ctx.extent_len = ino_size - data_offset; 1237 backref_ctx->extent_len = ino_size - data_offset;
1219 1238
1220 /* 1239 /*
1221 * Now collect all backrefs. 1240 * Now collect all backrefs.
1222 */ 1241 */
1242 if (compressed == BTRFS_COMPRESS_NONE)
1243 extent_item_pos = logical - found_key.objectid;
1244 else
1245 extent_item_pos = 0;
1246
1223 extent_item_pos = logical - found_key.objectid; 1247 extent_item_pos = logical - found_key.objectid;
1224 ret = iterate_extent_inodes(sctx->send_root->fs_info, 1248 ret = iterate_extent_inodes(sctx->send_root->fs_info,
1225 found_key.objectid, extent_item_pos, 1, 1249 found_key.objectid, extent_item_pos, 1,
1226 __iterate_backrefs, &backref_ctx); 1250 __iterate_backrefs, backref_ctx);
1251
1227 if (ret < 0) 1252 if (ret < 0)
1228 goto out; 1253 goto out;
1229 1254
1230 if (!backref_ctx.found_in_send_root) { 1255 if (!backref_ctx->found_itself) {
1231 /* found a bug in backref code? */ 1256 /* found a bug in backref code? */
1232 ret = -EIO; 1257 ret = -EIO;
1233 printk(KERN_ERR "btrfs: ERROR did not find backref in " 1258 printk(KERN_ERR "btrfs: ERROR did not find backref in "
1234 "send_root. inode=%llu, offset=%llu, " 1259 "send_root. inode=%llu, offset=%llu, "
1235 "logical=%llu\n", 1260 "disk_byte=%llu found extent=%llu\n",
1236 ino, data_offset, logical); 1261 ino, data_offset, disk_byte, found_key.objectid);
1237 goto out; 1262 goto out;
1238 } 1263 }
1239 1264
@@ -1242,7 +1267,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
1242 "num_bytes=%llu, logical=%llu\n", 1267 "num_bytes=%llu, logical=%llu\n",
1243 data_offset, ino, num_bytes, logical); 1268 data_offset, ino, num_bytes, logical);
1244 1269
1245 if (!backref_ctx.found) 1270 if (!backref_ctx->found)
1246 verbose_printk("btrfs: no clones found\n"); 1271 verbose_printk("btrfs: no clones found\n");
1247 1272
1248 cur_clone_root = NULL; 1273 cur_clone_root = NULL;
@@ -1253,7 +1278,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
1253 else if (sctx->clone_roots[i].root == sctx->send_root) 1278 else if (sctx->clone_roots[i].root == sctx->send_root)
1254 /* prefer clones from send_root over others */ 1279 /* prefer clones from send_root over others */
1255 cur_clone_root = sctx->clone_roots + i; 1280 cur_clone_root = sctx->clone_roots + i;
1256 break;
1257 } 1281 }
1258 1282
1259 } 1283 }
@@ -1267,6 +1291,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
1267 1291
1268out: 1292out:
1269 btrfs_free_path(tmp_path); 1293 btrfs_free_path(tmp_path);
1294 kfree(backref_ctx);
1270 return ret; 1295 return ret;
1271} 1296}
1272 1297
@@ -1307,8 +1332,6 @@ static int read_symlink(struct send_ctx *sctx,
1307 len = btrfs_file_extent_inline_len(path->nodes[0], ei); 1332 len = btrfs_file_extent_inline_len(path->nodes[0], ei);
1308 1333
1309 ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); 1334 ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
1310 if (ret < 0)
1311 goto out;
1312 1335
1313out: 1336out:
1314 btrfs_free_path(path); 1337 btrfs_free_path(path);
@@ -1404,7 +1427,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
1404 u64 right_gen; 1427 u64 right_gen;
1405 1428
1406 ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL, 1429 ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
1407 NULL); 1430 NULL, NULL);
1408 if (ret < 0 && ret != -ENOENT) 1431 if (ret < 0 && ret != -ENOENT)
1409 goto out; 1432 goto out;
1410 left_ret = ret; 1433 left_ret = ret;
@@ -1413,16 +1436,16 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
1413 right_ret = -ENOENT; 1436 right_ret = -ENOENT;
1414 } else { 1437 } else {
1415 ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen, 1438 ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
1416 NULL, NULL, NULL); 1439 NULL, NULL, NULL, NULL);
1417 if (ret < 0 && ret != -ENOENT) 1440 if (ret < 0 && ret != -ENOENT)
1418 goto out; 1441 goto out;
1419 right_ret = ret; 1442 right_ret = ret;
1420 } 1443 }
1421 1444
1422 if (!left_ret && !right_ret) { 1445 if (!left_ret && !right_ret) {
1423 if (left_gen == gen && right_gen == gen) 1446 if (left_gen == gen && right_gen == gen) {
1424 ret = inode_state_no_change; 1447 ret = inode_state_no_change;
1425 else if (left_gen == gen) { 1448 } else if (left_gen == gen) {
1426 if (ino < sctx->send_progress) 1449 if (ino < sctx->send_progress)
1427 ret = inode_state_did_create; 1450 ret = inode_state_did_create;
1428 else 1451 else
@@ -1516,6 +1539,10 @@ out:
1516 return ret; 1539 return ret;
1517} 1540}
1518 1541
1542/*
1543 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
1544 * generation of the parent dir and the name of the dir entry.
1545 */
1519static int get_first_ref(struct send_ctx *sctx, 1546static int get_first_ref(struct send_ctx *sctx,
1520 struct btrfs_root *root, u64 ino, 1547 struct btrfs_root *root, u64 ino,
1521 u64 *dir, u64 *dir_gen, struct fs_path *name) 1548 u64 *dir, u64 *dir_gen, struct fs_path *name)
@@ -1557,7 +1584,7 @@ static int get_first_ref(struct send_ctx *sctx,
1557 btrfs_release_path(path); 1584 btrfs_release_path(path);
1558 1585
1559 ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL, 1586 ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL,
1560 NULL); 1587 NULL, NULL);
1561 if (ret < 0) 1588 if (ret < 0)
1562 goto out; 1589 goto out;
1563 1590
@@ -1586,22 +1613,28 @@ static int is_first_ref(struct send_ctx *sctx,
1586 if (ret < 0) 1613 if (ret < 0)
1587 goto out; 1614 goto out;
1588 1615
1589 if (name_len != fs_path_len(tmp_name)) { 1616 if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
1590 ret = 0; 1617 ret = 0;
1591 goto out; 1618 goto out;
1592 } 1619 }
1593 1620
1594 ret = memcmp(tmp_name->start, name, name_len); 1621 ret = !memcmp(tmp_name->start, name, name_len);
1595 if (ret)
1596 ret = 0;
1597 else
1598 ret = 1;
1599 1622
1600out: 1623out:
1601 fs_path_free(sctx, tmp_name); 1624 fs_path_free(sctx, tmp_name);
1602 return ret; 1625 return ret;
1603} 1626}
1604 1627
1628/*
1629 * Used by process_recorded_refs to determine if a new ref would overwrite an
1630 * already existing ref. In case it detects an overwrite, it returns the
1631 * inode/gen in who_ino/who_gen.
1632 * When an overwrite is detected, process_recorded_refs does proper orphanizing
1633 * to make sure later references to the overwritten inode are possible.
1634 * Orphanizing is however only required for the first ref of an inode.
1635 * process_recorded_refs does an additional is_first_ref check to see if
1636 * orphanizing is really required.
1637 */
1605static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, 1638static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
1606 const char *name, int name_len, 1639 const char *name, int name_len,
1607 u64 *who_ino, u64 *who_gen) 1640 u64 *who_ino, u64 *who_gen)
@@ -1626,9 +1659,14 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
1626 goto out; 1659 goto out;
1627 } 1660 }
1628 1661
1662 /*
1663 * Check if the overwritten ref was already processed. If yes, the ref
1664 * was already unlinked/moved, so we can safely assume that we will not
1665 * overwrite anything at this point in time.
1666 */
1629 if (other_inode > sctx->send_progress) { 1667 if (other_inode > sctx->send_progress) {
1630 ret = get_inode_info(sctx->parent_root, other_inode, NULL, 1668 ret = get_inode_info(sctx->parent_root, other_inode, NULL,
1631 who_gen, NULL, NULL, NULL); 1669 who_gen, NULL, NULL, NULL, NULL);
1632 if (ret < 0) 1670 if (ret < 0)
1633 goto out; 1671 goto out;
1634 1672
@@ -1642,6 +1680,13 @@ out:
1642 return ret; 1680 return ret;
1643} 1681}
1644 1682
1683/*
1684 * Checks if the ref was overwritten by an already processed inode. This is
1685 * used by __get_cur_name_and_parent to find out if the ref was orphanized and
1686 * thus the orphan name needs be used.
1687 * process_recorded_refs also uses it to avoid unlinking of refs that were
1688 * overwritten.
1689 */
1645static int did_overwrite_ref(struct send_ctx *sctx, 1690static int did_overwrite_ref(struct send_ctx *sctx,
1646 u64 dir, u64 dir_gen, 1691 u64 dir, u64 dir_gen,
1647 u64 ino, u64 ino_gen, 1692 u64 ino, u64 ino_gen,
@@ -1671,7 +1716,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
1671 } 1716 }
1672 1717
1673 ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL, 1718 ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
1674 NULL); 1719 NULL, NULL);
1675 if (ret < 0) 1720 if (ret < 0)
1676 goto out; 1721 goto out;
1677 1722
@@ -1690,6 +1735,11 @@ out:
1690 return ret; 1735 return ret;
1691} 1736}
1692 1737
1738/*
1739 * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
1740 * that got overwritten. This is used by process_recorded_refs to determine
1741 * if it has to use the path as returned by get_cur_path or the orphan name.
1742 */
1693static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) 1743static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
1694{ 1744{
1695 int ret = 0; 1745 int ret = 0;
@@ -1710,39 +1760,40 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
1710 1760
1711 ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen, 1761 ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
1712 name->start, fs_path_len(name)); 1762 name->start, fs_path_len(name));
1713 if (ret < 0)
1714 goto out;
1715 1763
1716out: 1764out:
1717 fs_path_free(sctx, name); 1765 fs_path_free(sctx, name);
1718 return ret; 1766 return ret;
1719} 1767}
1720 1768
1769/*
1770 * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
1771 * so we need to do some special handling in case we have clashes. This function
1772 * takes care of this with the help of name_cache_entry::radix_list.
1773 * In case of error, nce is kfreed.
1774 */
1721static int name_cache_insert(struct send_ctx *sctx, 1775static int name_cache_insert(struct send_ctx *sctx,
1722 struct name_cache_entry *nce) 1776 struct name_cache_entry *nce)
1723{ 1777{
1724 int ret = 0; 1778 int ret = 0;
1725 struct name_cache_entry **ncea; 1779 struct list_head *nce_head;
1726 1780
1727 ncea = radix_tree_lookup(&sctx->name_cache, nce->ino); 1781 nce_head = radix_tree_lookup(&sctx->name_cache,
1728 if (ncea) { 1782 (unsigned long)nce->ino);
1729 if (!ncea[0]) 1783 if (!nce_head) {
1730 ncea[0] = nce; 1784 nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
1731 else if (!ncea[1]) 1785 if (!nce_head)
1732 ncea[1] = nce;
1733 else
1734 BUG();
1735 } else {
1736 ncea = kmalloc(sizeof(void *) * 2, GFP_NOFS);
1737 if (!ncea)
1738 return -ENOMEM; 1786 return -ENOMEM;
1787 INIT_LIST_HEAD(nce_head);
1739 1788
1740 ncea[0] = nce; 1789 ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
1741 ncea[1] = NULL; 1790 if (ret < 0) {
1742 ret = radix_tree_insert(&sctx->name_cache, nce->ino, ncea); 1791 kfree(nce_head);
1743 if (ret < 0) 1792 kfree(nce);
1744 return ret; 1793 return ret;
1794 }
1745 } 1795 }
1796 list_add_tail(&nce->radix_list, nce_head);
1746 list_add_tail(&nce->list, &sctx->name_cache_list); 1797 list_add_tail(&nce->list, &sctx->name_cache_list);
1747 sctx->name_cache_size++; 1798 sctx->name_cache_size++;
1748 1799
@@ -1752,50 +1803,52 @@ static int name_cache_insert(struct send_ctx *sctx,
1752static void name_cache_delete(struct send_ctx *sctx, 1803static void name_cache_delete(struct send_ctx *sctx,
1753 struct name_cache_entry *nce) 1804 struct name_cache_entry *nce)
1754{ 1805{
1755 struct name_cache_entry **ncea; 1806 struct list_head *nce_head;
1756
1757 ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
1758 BUG_ON(!ncea);
1759
1760 if (ncea[0] == nce)
1761 ncea[0] = NULL;
1762 else if (ncea[1] == nce)
1763 ncea[1] = NULL;
1764 else
1765 BUG();
1766 1807
1767 if (!ncea[0] && !ncea[1]) { 1808 nce_head = radix_tree_lookup(&sctx->name_cache,
1768 radix_tree_delete(&sctx->name_cache, nce->ino); 1809 (unsigned long)nce->ino);
1769 kfree(ncea); 1810 BUG_ON(!nce_head);
1770 }
1771 1811
1812 list_del(&nce->radix_list);
1772 list_del(&nce->list); 1813 list_del(&nce->list);
1773
1774 sctx->name_cache_size--; 1814 sctx->name_cache_size--;
1815
1816 if (list_empty(nce_head)) {
1817 radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
1818 kfree(nce_head);
1819 }
1775} 1820}
1776 1821
1777static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, 1822static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
1778 u64 ino, u64 gen) 1823 u64 ino, u64 gen)
1779{ 1824{
1780 struct name_cache_entry **ncea; 1825 struct list_head *nce_head;
1826 struct name_cache_entry *cur;
1781 1827
1782 ncea = radix_tree_lookup(&sctx->name_cache, ino); 1828 nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
1783 if (!ncea) 1829 if (!nce_head)
1784 return NULL; 1830 return NULL;
1785 1831
1786 if (ncea[0] && ncea[0]->gen == gen) 1832 list_for_each_entry(cur, nce_head, radix_list) {
1787 return ncea[0]; 1833 if (cur->ino == ino && cur->gen == gen)
1788 else if (ncea[1] && ncea[1]->gen == gen) 1834 return cur;
1789 return ncea[1]; 1835 }
1790 return NULL; 1836 return NULL;
1791} 1837}
1792 1838
1839/*
1840 * Removes the entry from the list and adds it back to the end. This marks the
1841 * entry as recently used so that name_cache_clean_unused does not remove it.
1842 */
1793static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce) 1843static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
1794{ 1844{
1795 list_del(&nce->list); 1845 list_del(&nce->list);
1796 list_add_tail(&nce->list, &sctx->name_cache_list); 1846 list_add_tail(&nce->list, &sctx->name_cache_list);
1797} 1847}
1798 1848
1849/*
1850 * Remove some entries from the beginning of name_cache_list.
1851 */
1799static void name_cache_clean_unused(struct send_ctx *sctx) 1852static void name_cache_clean_unused(struct send_ctx *sctx)
1800{ 1853{
1801 struct name_cache_entry *nce; 1854 struct name_cache_entry *nce;
@@ -1814,13 +1867,23 @@ static void name_cache_clean_unused(struct send_ctx *sctx)
1814static void name_cache_free(struct send_ctx *sctx) 1867static void name_cache_free(struct send_ctx *sctx)
1815{ 1868{
1816 struct name_cache_entry *nce; 1869 struct name_cache_entry *nce;
1817 struct name_cache_entry *tmp;
1818 1870
1819 list_for_each_entry_safe(nce, tmp, &sctx->name_cache_list, list) { 1871 while (!list_empty(&sctx->name_cache_list)) {
1872 nce = list_entry(sctx->name_cache_list.next,
1873 struct name_cache_entry, list);
1820 name_cache_delete(sctx, nce); 1874 name_cache_delete(sctx, nce);
1875 kfree(nce);
1821 } 1876 }
1822} 1877}
1823 1878
1879/*
1880 * Used by get_cur_path for each ref up to the root.
1881 * Returns 0 if it succeeded.
1882 * Returns 1 if the inode is not existent or got overwritten. In that case, the
1883 * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
1884 * is returned, parent_ino/parent_gen are not guaranteed to be valid.
1885 * Returns <0 in case of error.
1886 */
1824static int __get_cur_name_and_parent(struct send_ctx *sctx, 1887static int __get_cur_name_and_parent(struct send_ctx *sctx,
1825 u64 ino, u64 gen, 1888 u64 ino, u64 gen,
1826 u64 *parent_ino, 1889 u64 *parent_ino,
@@ -1832,6 +1895,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1832 struct btrfs_path *path = NULL; 1895 struct btrfs_path *path = NULL;
1833 struct name_cache_entry *nce = NULL; 1896 struct name_cache_entry *nce = NULL;
1834 1897
1898 /*
1899 * First check if we already did a call to this function with the same
1900 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
1901 * return the cached result.
1902 */
1835 nce = name_cache_search(sctx, ino, gen); 1903 nce = name_cache_search(sctx, ino, gen);
1836 if (nce) { 1904 if (nce) {
1837 if (ino < sctx->send_progress && nce->need_later_update) { 1905 if (ino < sctx->send_progress && nce->need_later_update) {
@@ -1854,6 +1922,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1854 if (!path) 1922 if (!path)
1855 return -ENOMEM; 1923 return -ENOMEM;
1856 1924
1925 /*
1926 * If the inode is not existent yet, add the orphan name and return 1.
1927 * This should only happen for the parent dir that we determine in
1928 * __record_new_ref
1929 */
1857 ret = is_inode_existent(sctx, ino, gen); 1930 ret = is_inode_existent(sctx, ino, gen);
1858 if (ret < 0) 1931 if (ret < 0)
1859 goto out; 1932 goto out;
@@ -1866,6 +1939,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1866 goto out_cache; 1939 goto out_cache;
1867 } 1940 }
1868 1941
1942 /*
1943 * Depending on whether the inode was already processed or not, use
1944 * send_root or parent_root for ref lookup.
1945 */
1869 if (ino < sctx->send_progress) 1946 if (ino < sctx->send_progress)
1870 ret = get_first_ref(sctx, sctx->send_root, ino, 1947 ret = get_first_ref(sctx, sctx->send_root, ino,
1871 parent_ino, parent_gen, dest); 1948 parent_ino, parent_gen, dest);
@@ -1875,6 +1952,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1875 if (ret < 0) 1952 if (ret < 0)
1876 goto out; 1953 goto out;
1877 1954
1955 /*
1956 * Check if the ref was overwritten by an inode's ref that was processed
1957 * earlier. If yes, treat as orphan and return 1.
1958 */
1878 ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen, 1959 ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
1879 dest->start, dest->end - dest->start); 1960 dest->start, dest->end - dest->start);
1880 if (ret < 0) 1961 if (ret < 0)
@@ -1888,6 +1969,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1888 } 1969 }
1889 1970
1890out_cache: 1971out_cache:
1972 /*
1973 * Store the result of the lookup in the name cache.
1974 */
1891 nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS); 1975 nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS);
1892 if (!nce) { 1976 if (!nce) {
1893 ret = -ENOMEM; 1977 ret = -ENOMEM;
@@ -1901,7 +1985,6 @@ out_cache:
1901 nce->name_len = fs_path_len(dest); 1985 nce->name_len = fs_path_len(dest);
1902 nce->ret = ret; 1986 nce->ret = ret;
1903 strcpy(nce->name, dest->start); 1987 strcpy(nce->name, dest->start);
1904 memset(&nce->use_list, 0, sizeof(nce->use_list));
1905 1988
1906 if (ino < sctx->send_progress) 1989 if (ino < sctx->send_progress)
1907 nce->need_later_update = 0; 1990 nce->need_later_update = 0;
@@ -2107,9 +2190,6 @@ static int send_subvol_begin(struct send_ctx *sctx)
2107 read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen); 2190 read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
2108 btrfs_release_path(path); 2191 btrfs_release_path(path);
2109 2192
2110 if (ret < 0)
2111 goto out;
2112
2113 if (parent_root) { 2193 if (parent_root) {
2114 ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT); 2194 ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
2115 if (ret < 0) 2195 if (ret < 0)
@@ -2276,7 +2356,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
2276 btrfs_inode_mtime(ii)); 2356 btrfs_inode_mtime(ii));
2277 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, 2357 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
2278 btrfs_inode_ctime(ii)); 2358 btrfs_inode_ctime(ii));
2279 /* TODO otime? */ 2359 /* TODO Add otime support when the otime patches get into upstream */
2280 2360
2281 ret = send_cmd(sctx); 2361 ret = send_cmd(sctx);
2282 2362
@@ -2292,39 +2372,39 @@ out:
2292 * a valid path yet because we did not process the refs yet. So, the inode 2372 * a valid path yet because we did not process the refs yet. So, the inode
2293 * is created as orphan. 2373 * is created as orphan.
2294 */ 2374 */
2295static int send_create_inode(struct send_ctx *sctx, struct btrfs_path *path, 2375static int send_create_inode(struct send_ctx *sctx, u64 ino)
2296 struct btrfs_key *key)
2297{ 2376{
2298 int ret = 0; 2377 int ret = 0;
2299 struct extent_buffer *eb = path->nodes[0];
2300 struct btrfs_inode_item *ii;
2301 struct fs_path *p; 2378 struct fs_path *p;
2302 int slot = path->slots[0];
2303 int cmd; 2379 int cmd;
2380 u64 gen;
2304 u64 mode; 2381 u64 mode;
2382 u64 rdev;
2305 2383
2306verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino); 2384verbose_printk("btrfs: send_create_inode %llu\n", ino);
2307 2385
2308 p = fs_path_alloc(sctx); 2386 p = fs_path_alloc(sctx);
2309 if (!p) 2387 if (!p)
2310 return -ENOMEM; 2388 return -ENOMEM;
2311 2389
2312 ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); 2390 ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL,
2313 mode = btrfs_inode_mode(eb, ii); 2391 NULL, &rdev);
2392 if (ret < 0)
2393 goto out;
2314 2394
2315 if (S_ISREG(mode)) 2395 if (S_ISREG(mode)) {
2316 cmd = BTRFS_SEND_C_MKFILE; 2396 cmd = BTRFS_SEND_C_MKFILE;
2317 else if (S_ISDIR(mode)) 2397 } else if (S_ISDIR(mode)) {
2318 cmd = BTRFS_SEND_C_MKDIR; 2398 cmd = BTRFS_SEND_C_MKDIR;
2319 else if (S_ISLNK(mode)) 2399 } else if (S_ISLNK(mode)) {
2320 cmd = BTRFS_SEND_C_SYMLINK; 2400 cmd = BTRFS_SEND_C_SYMLINK;
2321 else if (S_ISCHR(mode) || S_ISBLK(mode)) 2401 } else if (S_ISCHR(mode) || S_ISBLK(mode)) {
2322 cmd = BTRFS_SEND_C_MKNOD; 2402 cmd = BTRFS_SEND_C_MKNOD;
2323 else if (S_ISFIFO(mode)) 2403 } else if (S_ISFIFO(mode)) {
2324 cmd = BTRFS_SEND_C_MKFIFO; 2404 cmd = BTRFS_SEND_C_MKFIFO;
2325 else if (S_ISSOCK(mode)) 2405 } else if (S_ISSOCK(mode)) {
2326 cmd = BTRFS_SEND_C_MKSOCK; 2406 cmd = BTRFS_SEND_C_MKSOCK;
2327 else { 2407 } else {
2328 printk(KERN_WARNING "btrfs: unexpected inode type %o", 2408 printk(KERN_WARNING "btrfs: unexpected inode type %o",
2329 (int)(mode & S_IFMT)); 2409 (int)(mode & S_IFMT));
2330 ret = -ENOTSUPP; 2410 ret = -ENOTSUPP;
@@ -2335,22 +2415,22 @@ verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino);
2335 if (ret < 0) 2415 if (ret < 0)
2336 goto out; 2416 goto out;
2337 2417
2338 ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); 2418 ret = gen_unique_name(sctx, ino, gen, p);
2339 if (ret < 0) 2419 if (ret < 0)
2340 goto out; 2420 goto out;
2341 2421
2342 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2422 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2343 TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, sctx->cur_ino); 2423 TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
2344 2424
2345 if (S_ISLNK(mode)) { 2425 if (S_ISLNK(mode)) {
2346 fs_path_reset(p); 2426 fs_path_reset(p);
2347 ret = read_symlink(sctx, sctx->send_root, sctx->cur_ino, p); 2427 ret = read_symlink(sctx, sctx->send_root, ino, p);
2348 if (ret < 0) 2428 if (ret < 0)
2349 goto out; 2429 goto out;
2350 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); 2430 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
2351 } else if (S_ISCHR(mode) || S_ISBLK(mode) || 2431 } else if (S_ISCHR(mode) || S_ISBLK(mode) ||
2352 S_ISFIFO(mode) || S_ISSOCK(mode)) { 2432 S_ISFIFO(mode) || S_ISSOCK(mode)) {
2353 TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, btrfs_inode_rdev(eb, ii)); 2433 TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, rdev);
2354 } 2434 }
2355 2435
2356 ret = send_cmd(sctx); 2436 ret = send_cmd(sctx);
@@ -2364,6 +2444,92 @@ out:
2364 return ret; 2444 return ret;
2365} 2445}
2366 2446
2447/*
2448 * We need some special handling for inodes that get processed before the parent
2449 * directory got created. See process_recorded_refs for details.
2450 * This function does the check if we already created the dir out of order.
2451 */
2452static int did_create_dir(struct send_ctx *sctx, u64 dir)
2453{
2454 int ret = 0;
2455 struct btrfs_path *path = NULL;
2456 struct btrfs_key key;
2457 struct btrfs_key found_key;
2458 struct btrfs_key di_key;
2459 struct extent_buffer *eb;
2460 struct btrfs_dir_item *di;
2461 int slot;
2462
2463 path = alloc_path_for_send();
2464 if (!path) {
2465 ret = -ENOMEM;
2466 goto out;
2467 }
2468
2469 key.objectid = dir;
2470 key.type = BTRFS_DIR_INDEX_KEY;
2471 key.offset = 0;
2472 while (1) {
2473 ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
2474 1, 0);
2475 if (ret < 0)
2476 goto out;
2477 if (!ret) {
2478 eb = path->nodes[0];
2479 slot = path->slots[0];
2480 btrfs_item_key_to_cpu(eb, &found_key, slot);
2481 }
2482 if (ret || found_key.objectid != key.objectid ||
2483 found_key.type != key.type) {
2484 ret = 0;
2485 goto out;
2486 }
2487
2488 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2489 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2490
2491 if (di_key.objectid < sctx->send_progress) {
2492 ret = 1;
2493 goto out;
2494 }
2495
2496 key.offset = found_key.offset + 1;
2497 btrfs_release_path(path);
2498 }
2499
2500out:
2501 btrfs_free_path(path);
2502 return ret;
2503}
2504
2505/*
2506 * Only creates the inode if it is:
2507 * 1. Not a directory
2508 * 2. Or a directory which was not created already due to out of order
2509 * directories. See did_create_dir and process_recorded_refs for details.
2510 */
2511static int send_create_inode_if_needed(struct send_ctx *sctx)
2512{
2513 int ret;
2514
2515 if (S_ISDIR(sctx->cur_inode_mode)) {
2516 ret = did_create_dir(sctx, sctx->cur_ino);
2517 if (ret < 0)
2518 goto out;
2519 if (ret) {
2520 ret = 0;
2521 goto out;
2522 }
2523 }
2524
2525 ret = send_create_inode(sctx, sctx->cur_ino);
2526 if (ret < 0)
2527 goto out;
2528
2529out:
2530 return ret;
2531}
2532
2367struct recorded_ref { 2533struct recorded_ref {
2368 struct list_head list; 2534 struct list_head list;
2369 char *dir_path; 2535 char *dir_path;
@@ -2416,13 +2582,13 @@ static int record_ref(struct list_head *head, u64 dir,
2416static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head) 2582static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
2417{ 2583{
2418 struct recorded_ref *cur; 2584 struct recorded_ref *cur;
2419 struct recorded_ref *tmp;
2420 2585
2421 list_for_each_entry_safe(cur, tmp, head, list) { 2586 while (!list_empty(head)) {
2587 cur = list_entry(head->next, struct recorded_ref, list);
2422 fs_path_free(sctx, cur->full_path); 2588 fs_path_free(sctx, cur->full_path);
2589 list_del(&cur->list);
2423 kfree(cur); 2590 kfree(cur);
2424 } 2591 }
2425 INIT_LIST_HEAD(head);
2426} 2592}
2427 2593
2428static void free_recorded_refs(struct send_ctx *sctx) 2594static void free_recorded_refs(struct send_ctx *sctx)
@@ -2432,7 +2598,7 @@ static void free_recorded_refs(struct send_ctx *sctx)
2432} 2598}
2433 2599
2434/* 2600/*
2435 * Renames/moves a file/dir to it's orphan name. Used when the first 2601 * Renames/moves a file/dir to its orphan name. Used when the first
2436 * ref of an unprocessed inode gets overwritten and for all non empty 2602 * ref of an unprocessed inode gets overwritten and for all non empty
2437 * directories. 2603 * directories.
2438 */ 2604 */
@@ -2472,6 +2638,12 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
2472 struct btrfs_key loc; 2638 struct btrfs_key loc;
2473 struct btrfs_dir_item *di; 2639 struct btrfs_dir_item *di;
2474 2640
2641 /*
2642 * Don't try to rmdir the top/root subvolume dir.
2643 */
2644 if (dir == BTRFS_FIRST_FREE_OBJECTID)
2645 return 0;
2646
2475 path = alloc_path_for_send(); 2647 path = alloc_path_for_send();
2476 if (!path) 2648 if (!path)
2477 return -ENOMEM; 2649 return -ENOMEM;
@@ -2513,160 +2685,6 @@ out:
2513 return ret; 2685 return ret;
2514} 2686}
2515 2687
2516struct finish_unordered_dir_ctx {
2517 struct send_ctx *sctx;
2518 struct fs_path *cur_path;
2519 struct fs_path *dir_path;
2520 u64 dir_ino;
2521 int need_delete;
2522 int delete_pass;
2523};
2524
2525int __finish_unordered_dir(int num, struct btrfs_key *di_key,
2526 const char *name, int name_len,
2527 const char *data, int data_len,
2528 u8 type, void *ctx)
2529{
2530 int ret = 0;
2531 struct finish_unordered_dir_ctx *fctx = ctx;
2532 struct send_ctx *sctx = fctx->sctx;
2533 u64 di_gen;
2534 u64 di_mode;
2535 int is_orphan = 0;
2536
2537 if (di_key->objectid >= fctx->dir_ino)
2538 goto out;
2539
2540 fs_path_reset(fctx->cur_path);
2541
2542 ret = get_inode_info(sctx->send_root, di_key->objectid,
2543 NULL, &di_gen, &di_mode, NULL, NULL);
2544 if (ret < 0)
2545 goto out;
2546
2547 ret = is_first_ref(sctx, sctx->send_root, di_key->objectid,
2548 fctx->dir_ino, name, name_len);
2549 if (ret < 0)
2550 goto out;
2551 if (ret) {
2552 is_orphan = 1;
2553 ret = gen_unique_name(sctx, di_key->objectid, di_gen,
2554 fctx->cur_path);
2555 } else {
2556 ret = get_cur_path(sctx, di_key->objectid, di_gen,
2557 fctx->cur_path);
2558 }
2559 if (ret < 0)
2560 goto out;
2561
2562 ret = fs_path_add(fctx->dir_path, name, name_len);
2563 if (ret < 0)
2564 goto out;
2565
2566 if (!fctx->delete_pass) {
2567 if (S_ISDIR(di_mode)) {
2568 ret = send_rename(sctx, fctx->cur_path,
2569 fctx->dir_path);
2570 } else {
2571 ret = send_link(sctx, fctx->dir_path,
2572 fctx->cur_path);
2573 if (is_orphan)
2574 fctx->need_delete = 1;
2575 }
2576 } else if (!S_ISDIR(di_mode)) {
2577 ret = send_unlink(sctx, fctx->cur_path);
2578 } else {
2579 ret = 0;
2580 }
2581
2582 fs_path_remove(fctx->dir_path);
2583
2584out:
2585 return ret;
2586}
2587
2588/*
2589 * Go through all dir items and see if we find refs which could not be created
2590 * in the past because the dir did not exist at that time.
2591 */
2592static int finish_outoforder_dir(struct send_ctx *sctx, u64 dir, u64 dir_gen)
2593{
2594 int ret = 0;
2595 struct btrfs_path *path = NULL;
2596 struct btrfs_key key;
2597 struct btrfs_key found_key;
2598 struct extent_buffer *eb;
2599 struct finish_unordered_dir_ctx fctx;
2600 int slot;
2601
2602 path = alloc_path_for_send();
2603 if (!path) {
2604 ret = -ENOMEM;
2605 goto out;
2606 }
2607
2608 memset(&fctx, 0, sizeof(fctx));
2609 fctx.sctx = sctx;
2610 fctx.cur_path = fs_path_alloc(sctx);
2611 fctx.dir_path = fs_path_alloc(sctx);
2612 if (!fctx.cur_path || !fctx.dir_path) {
2613 ret = -ENOMEM;
2614 goto out;
2615 }
2616 fctx.dir_ino = dir;
2617
2618 ret = get_cur_path(sctx, dir, dir_gen, fctx.dir_path);
2619 if (ret < 0)
2620 goto out;
2621
2622 /*
2623 * We do two passes. The first links in the new refs and the second
2624 * deletes orphans if required. Deletion of orphans is not required for
2625 * directory inodes, as we always have only one ref and use rename
2626 * instead of link for those.
2627 */
2628
2629again:
2630 key.objectid = dir;
2631 key.type = BTRFS_DIR_ITEM_KEY;
2632 key.offset = 0;
2633 while (1) {
2634 ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
2635 1, 0);
2636 if (ret < 0)
2637 goto out;
2638 eb = path->nodes[0];
2639 slot = path->slots[0];
2640 btrfs_item_key_to_cpu(eb, &found_key, slot);
2641
2642 if (found_key.objectid != key.objectid ||
2643 found_key.type != key.type) {
2644 btrfs_release_path(path);
2645 break;
2646 }
2647
2648 ret = iterate_dir_item(sctx, sctx->send_root, path,
2649 &found_key, __finish_unordered_dir,
2650 &fctx);
2651 if (ret < 0)
2652 goto out;
2653
2654 key.offset = found_key.offset + 1;
2655 btrfs_release_path(path);
2656 }
2657
2658 if (!fctx.delete_pass && fctx.need_delete) {
2659 fctx.delete_pass = 1;
2660 goto again;
2661 }
2662
2663out:
2664 btrfs_free_path(path);
2665 fs_path_free(sctx, fctx.cur_path);
2666 fs_path_free(sctx, fctx.dir_path);
2667 return ret;
2668}
2669
2670/* 2688/*
2671 * This does all the move/link/unlink/rmdir magic. 2689 * This does all the move/link/unlink/rmdir magic.
2672 */ 2690 */
@@ -2674,6 +2692,7 @@ static int process_recorded_refs(struct send_ctx *sctx)
2674{ 2692{
2675 int ret = 0; 2693 int ret = 0;
2676 struct recorded_ref *cur; 2694 struct recorded_ref *cur;
2695 struct recorded_ref *cur2;
2677 struct ulist *check_dirs = NULL; 2696 struct ulist *check_dirs = NULL;
2678 struct ulist_iterator uit; 2697 struct ulist_iterator uit;
2679 struct ulist_node *un; 2698 struct ulist_node *un;
@@ -2685,6 +2704,12 @@ static int process_recorded_refs(struct send_ctx *sctx)
2685 2704
2686verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); 2705verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2687 2706
2707 /*
2708 * This should never happen as the root dir always has the same ref
2709 * which is always '..'
2710 */
2711 BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
2712
2688 valid_path = fs_path_alloc(sctx); 2713 valid_path = fs_path_alloc(sctx);
2689 if (!valid_path) { 2714 if (!valid_path) {
2690 ret = -ENOMEM; 2715 ret = -ENOMEM;
@@ -2731,6 +2756,46 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2731 2756
2732 list_for_each_entry(cur, &sctx->new_refs, list) { 2757 list_for_each_entry(cur, &sctx->new_refs, list) {
2733 /* 2758 /*
2759 * We may have refs where the parent directory does not exist
2760 * yet. This happens if the parent directories inum is higher
2761 * the the current inum. To handle this case, we create the
2762 * parent directory out of order. But we need to check if this
2763 * did already happen before due to other refs in the same dir.
2764 */
2765 ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
2766 if (ret < 0)
2767 goto out;
2768 if (ret == inode_state_will_create) {
2769 ret = 0;
2770 /*
2771 * First check if any of the current inodes refs did
2772 * already create the dir.
2773 */
2774 list_for_each_entry(cur2, &sctx->new_refs, list) {
2775 if (cur == cur2)
2776 break;
2777 if (cur2->dir == cur->dir) {
2778 ret = 1;
2779 break;
2780 }
2781 }
2782
2783 /*
2784 * If that did not happen, check if a previous inode
2785 * did already create the dir.
2786 */
2787 if (!ret)
2788 ret = did_create_dir(sctx, cur->dir);
2789 if (ret < 0)
2790 goto out;
2791 if (!ret) {
2792 ret = send_create_inode(sctx, cur->dir);
2793 if (ret < 0)
2794 goto out;
2795 }
2796 }
2797
2798 /*
2734 * Check if this new ref would overwrite the first ref of 2799 * Check if this new ref would overwrite the first ref of
2735 * another unprocessed inode. If yes, orphanize the 2800 * another unprocessed inode. If yes, orphanize the
2736 * overwritten inode. If we find an overwritten ref that is 2801 * overwritten inode. If we find an overwritten ref that is
@@ -2764,7 +2829,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2764 * inode, move it and update valid_path. If not, link or move 2829 * inode, move it and update valid_path. If not, link or move
2765 * it depending on the inode mode. 2830 * it depending on the inode mode.
2766 */ 2831 */
2767 if (is_orphan && !sctx->cur_inode_first_ref_orphan) { 2832 if (is_orphan) {
2768 ret = send_rename(sctx, valid_path, cur->full_path); 2833 ret = send_rename(sctx, valid_path, cur->full_path);
2769 if (ret < 0) 2834 if (ret < 0)
2770 goto out; 2835 goto out;
@@ -2827,6 +2892,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2827 if (ret < 0) 2892 if (ret < 0)
2828 goto out; 2893 goto out;
2829 } 2894 }
2895 } else if (S_ISDIR(sctx->cur_inode_mode) &&
2896 !list_empty(&sctx->deleted_refs)) {
2897 /*
2898 * We have a moved dir. Add the old parent to check_dirs
2899 */
2900 cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
2901 list);
2902 ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
2903 GFP_NOFS);
2904 if (ret < 0)
2905 goto out;
2830 } else if (!S_ISDIR(sctx->cur_inode_mode)) { 2906 } else if (!S_ISDIR(sctx->cur_inode_mode)) {
2831 /* 2907 /*
2832 * We have a non dir inode. Go through all deleted refs and 2908 * We have a non dir inode. Go through all deleted refs and
@@ -2840,35 +2916,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2840 if (ret < 0) 2916 if (ret < 0)
2841 goto out; 2917 goto out;
2842 if (!ret) { 2918 if (!ret) {
2843 /* 2919 ret = send_unlink(sctx, cur->full_path);
2844 * In case the inode was moved to a directory 2920 if (ret < 0)
2845 * that was not created yet (see 2921 goto out;
2846 * __record_new_ref), we can not unlink the ref
2847 * as it will be needed later when the parent
2848 * directory is created, so that we can move in
2849 * the inode to the new dir.
2850 */
2851 if (!is_orphan &&
2852 sctx->cur_inode_first_ref_orphan) {
2853 ret = orphanize_inode(sctx,
2854 sctx->cur_ino,
2855 sctx->cur_inode_gen,
2856 cur->full_path);
2857 if (ret < 0)
2858 goto out;
2859 ret = gen_unique_name(sctx,
2860 sctx->cur_ino,
2861 sctx->cur_inode_gen,
2862 valid_path);
2863 if (ret < 0)
2864 goto out;
2865 is_orphan = 1;
2866
2867 } else {
2868 ret = send_unlink(sctx, cur->full_path);
2869 if (ret < 0)
2870 goto out;
2871 }
2872 } 2922 }
2873 ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, 2923 ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
2874 GFP_NOFS); 2924 GFP_NOFS);
@@ -2880,12 +2930,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2880 * If the inode is still orphan, unlink the orphan. This may 2930 * If the inode is still orphan, unlink the orphan. This may
2881 * happen when a previous inode did overwrite the first ref 2931 * happen when a previous inode did overwrite the first ref
2882 * of this inode and no new refs were added for the current 2932 * of this inode and no new refs were added for the current
2883 * inode. 2933 * inode. Unlinking does not mean that the inode is deleted in
2884 * We can however not delete the orphan in case the inode relies 2934 * all cases. There may still be links to this inode in other
2885 * in a directory that was not created yet (see 2935 * places.
2886 * __record_new_ref)
2887 */ 2936 */
2888 if (is_orphan && !sctx->cur_inode_first_ref_orphan) { 2937 if (is_orphan) {
2889 ret = send_unlink(sctx, valid_path); 2938 ret = send_unlink(sctx, valid_path);
2890 if (ret < 0) 2939 if (ret < 0)
2891 goto out; 2940 goto out;
@@ -2900,6 +2949,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2900 */ 2949 */
2901 ULIST_ITER_INIT(&uit); 2950 ULIST_ITER_INIT(&uit);
2902 while ((un = ulist_next(check_dirs, &uit))) { 2951 while ((un = ulist_next(check_dirs, &uit))) {
2952 /*
2953 * In case we had refs into dirs that were not processed yet,
2954 * we don't need to do the utime and rmdir logic for these dirs.
2955 * The dir will be processed later.
2956 */
2903 if (un->val > sctx->cur_ino) 2957 if (un->val > sctx->cur_ino)
2904 continue; 2958 continue;
2905 2959
@@ -2929,25 +2983,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2929 } 2983 }
2930 } 2984 }
2931 2985
2932 /*
2933 * Current inode is now at it's new position, so we must increase
2934 * send_progress
2935 */
2936 sctx->send_progress = sctx->cur_ino + 1;
2937
2938 /*
2939 * We may have a directory here that has pending refs which could not
2940 * be created before (because the dir did not exist before, see
2941 * __record_new_ref). finish_outoforder_dir will link/move the pending
2942 * refs.
2943 */
2944 if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_new) {
2945 ret = finish_outoforder_dir(sctx, sctx->cur_ino,
2946 sctx->cur_inode_gen);
2947 if (ret < 0)
2948 goto out;
2949 }
2950
2951 ret = 0; 2986 ret = 0;
2952 2987
2953out: 2988out:
@@ -2971,34 +3006,9 @@ static int __record_new_ref(int num, u64 dir, int index,
2971 return -ENOMEM; 3006 return -ENOMEM;
2972 3007
2973 ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL, 3008 ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL,
2974 NULL); 3009 NULL, NULL);
2975 if (ret < 0)
2976 goto out;
2977
2978 /*
2979 * The parent may be non-existent at this point in time. This happens
2980 * if the ino of the parent dir is higher then the current ino. In this
2981 * case, we can not process this ref until the parent dir is finally
2982 * created. If we reach the parent dir later, process_recorded_refs
2983 * will go through all dir items and process the refs that could not be
2984 * processed before. In case this is the first ref, we set
2985 * cur_inode_first_ref_orphan to 1 to inform process_recorded_refs to
2986 * keep an orphan of the inode so that it later can be used for
2987 * link/move
2988 */
2989 ret = is_inode_existent(sctx, dir, gen);
2990 if (ret < 0) 3010 if (ret < 0)
2991 goto out; 3011 goto out;
2992 if (!ret) {
2993 ret = is_first_ref(sctx, sctx->send_root, sctx->cur_ino, dir,
2994 name->start, fs_path_len(name));
2995 if (ret < 0)
2996 goto out;
2997 if (ret)
2998 sctx->cur_inode_first_ref_orphan = 1;
2999 ret = 0;
3000 goto out;
3001 }
3002 3012
3003 ret = get_cur_path(sctx, dir, gen, p); 3013 ret = get_cur_path(sctx, dir, gen, p);
3004 if (ret < 0) 3014 if (ret < 0)
@@ -3029,7 +3039,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
3029 return -ENOMEM; 3039 return -ENOMEM;
3030 3040
3031 ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL, 3041 ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
3032 NULL); 3042 NULL, NULL);
3033 if (ret < 0) 3043 if (ret < 0)
3034 goto out; 3044 goto out;
3035 3045
@@ -3206,33 +3216,28 @@ static int process_all_refs(struct send_ctx *sctx,
3206 key.offset = 0; 3216 key.offset = 0;
3207 while (1) { 3217 while (1) {
3208 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 3218 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
3209 if (ret < 0) { 3219 if (ret < 0)
3210 btrfs_release_path(path);
3211 goto out; 3220 goto out;
3212 } 3221 if (ret)
3213 if (ret) {
3214 btrfs_release_path(path);
3215 break; 3222 break;
3216 }
3217 3223
3218 eb = path->nodes[0]; 3224 eb = path->nodes[0];
3219 slot = path->slots[0]; 3225 slot = path->slots[0];
3220 btrfs_item_key_to_cpu(eb, &found_key, slot); 3226 btrfs_item_key_to_cpu(eb, &found_key, slot);
3221 3227
3222 if (found_key.objectid != key.objectid || 3228 if (found_key.objectid != key.objectid ||
3223 found_key.type != key.type) { 3229 found_key.type != key.type)
3224 btrfs_release_path(path);
3225 break; 3230 break;
3226 }
3227 3231
3228 ret = iterate_inode_ref(sctx, sctx->parent_root, path, 3232 ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb,
3229 &found_key, 0, cb, sctx); 3233 sctx);
3230 btrfs_release_path(path); 3234 btrfs_release_path(path);
3231 if (ret < 0) 3235 if (ret < 0)
3232 goto out; 3236 goto out;
3233 3237
3234 key.offset = found_key.offset + 1; 3238 key.offset = found_key.offset + 1;
3235 } 3239 }
3240 btrfs_release_path(path);
3236 3241
3237 ret = process_recorded_refs(sctx); 3242 ret = process_recorded_refs(sctx);
3238 3243
@@ -3555,7 +3560,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
3555 int ret = 0; 3560 int ret = 0;
3556 struct fs_path *p; 3561 struct fs_path *p;
3557 loff_t pos = offset; 3562 loff_t pos = offset;
3558 int readed = 0; 3563 int num_read = 0;
3559 mm_segment_t old_fs; 3564 mm_segment_t old_fs;
3560 3565
3561 p = fs_path_alloc(sctx); 3566 p = fs_path_alloc(sctx);
@@ -3580,8 +3585,8 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
3580 ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos); 3585 ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos);
3581 if (ret < 0) 3586 if (ret < 0)
3582 goto out; 3587 goto out;
3583 readed = ret; 3588 num_read = ret;
3584 if (!readed) 3589 if (!num_read)
3585 goto out; 3590 goto out;
3586 3591
3587 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); 3592 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
@@ -3594,7 +3599,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
3594 3599
3595 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 3600 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
3596 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 3601 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
3597 TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, readed); 3602 TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
3598 3603
3599 ret = send_cmd(sctx); 3604 ret = send_cmd(sctx);
3600 3605
@@ -3604,7 +3609,7 @@ out:
3604 set_fs(old_fs); 3609 set_fs(old_fs);
3605 if (ret < 0) 3610 if (ret < 0)
3606 return ret; 3611 return ret;
3607 return readed; 3612 return num_read;
3608} 3613}
3609 3614
3610/* 3615/*
@@ -3615,7 +3620,6 @@ static int send_clone(struct send_ctx *sctx,
3615 struct clone_root *clone_root) 3620 struct clone_root *clone_root)
3616{ 3621{
3617 int ret = 0; 3622 int ret = 0;
3618 struct btrfs_root *clone_root2 = clone_root->root;
3619 struct fs_path *p; 3623 struct fs_path *p;
3620 u64 gen; 3624 u64 gen;
3621 3625
@@ -3640,22 +3644,23 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3640 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len); 3644 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
3641 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 3645 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
3642 3646
3643 if (clone_root2 == sctx->send_root) { 3647 if (clone_root->root == sctx->send_root) {
3644 ret = get_inode_info(sctx->send_root, clone_root->ino, NULL, 3648 ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
3645 &gen, NULL, NULL, NULL); 3649 &gen, NULL, NULL, NULL, NULL);
3646 if (ret < 0) 3650 if (ret < 0)
3647 goto out; 3651 goto out;
3648 ret = get_cur_path(sctx, clone_root->ino, gen, p); 3652 ret = get_cur_path(sctx, clone_root->ino, gen, p);
3649 } else { 3653 } else {
3650 ret = get_inode_path(sctx, clone_root2, clone_root->ino, p); 3654 ret = get_inode_path(sctx, clone_root->root,
3655 clone_root->ino, p);
3651 } 3656 }
3652 if (ret < 0) 3657 if (ret < 0)
3653 goto out; 3658 goto out;
3654 3659
3655 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 3660 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
3656 clone_root2->root_item.uuid); 3661 clone_root->root->root_item.uuid);
3657 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, 3662 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
3658 clone_root2->root_item.ctransid); 3663 clone_root->root->root_item.ctransid);
3659 TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); 3664 TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
3660 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, 3665 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
3661 clone_root->offset); 3666 clone_root->offset);
@@ -3684,10 +3689,17 @@ static int send_write_or_clone(struct send_ctx *sctx,
3684 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3689 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3685 struct btrfs_file_extent_item); 3690 struct btrfs_file_extent_item);
3686 type = btrfs_file_extent_type(path->nodes[0], ei); 3691 type = btrfs_file_extent_type(path->nodes[0], ei);
3687 if (type == BTRFS_FILE_EXTENT_INLINE) 3692 if (type == BTRFS_FILE_EXTENT_INLINE) {
3688 len = btrfs_file_extent_inline_len(path->nodes[0], ei); 3693 len = btrfs_file_extent_inline_len(path->nodes[0], ei);
3689 else 3694 /*
3695 * it is possible the inline item won't cover the whole page,
3696 * but there may be items after this page. Make
3697 * sure to send the whole thing
3698 */
3699 len = PAGE_CACHE_ALIGN(len);
3700 } else {
3690 len = btrfs_file_extent_num_bytes(path->nodes[0], ei); 3701 len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
3702 }
3691 3703
3692 if (offset + len > sctx->cur_inode_size) 3704 if (offset + len > sctx->cur_inode_size)
3693 len = sctx->cur_inode_size - offset; 3705 len = sctx->cur_inode_size - offset;
@@ -3735,6 +3747,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
3735 u64 left_offset_fixed; 3747 u64 left_offset_fixed;
3736 u64 left_len; 3748 u64 left_len;
3737 u64 right_len; 3749 u64 right_len;
3750 u64 left_gen;
3751 u64 right_gen;
3738 u8 left_type; 3752 u8 left_type;
3739 u8 right_type; 3753 u8 right_type;
3740 3754
@@ -3744,17 +3758,17 @@ static int is_extent_unchanged(struct send_ctx *sctx,
3744 3758
3745 eb = left_path->nodes[0]; 3759 eb = left_path->nodes[0];
3746 slot = left_path->slots[0]; 3760 slot = left_path->slots[0];
3747
3748 ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 3761 ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
3749 left_type = btrfs_file_extent_type(eb, ei); 3762 left_type = btrfs_file_extent_type(eb, ei);
3750 left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
3751 left_len = btrfs_file_extent_num_bytes(eb, ei);
3752 left_offset = btrfs_file_extent_offset(eb, ei);
3753 3763
3754 if (left_type != BTRFS_FILE_EXTENT_REG) { 3764 if (left_type != BTRFS_FILE_EXTENT_REG) {
3755 ret = 0; 3765 ret = 0;
3756 goto out; 3766 goto out;
3757 } 3767 }
3768 left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
3769 left_len = btrfs_file_extent_num_bytes(eb, ei);
3770 left_offset = btrfs_file_extent_offset(eb, ei);
3771 left_gen = btrfs_file_extent_generation(eb, ei);
3758 3772
3759 /* 3773 /*
3760 * Following comments will refer to these graphics. L is the left 3774 * Following comments will refer to these graphics. L is the left
@@ -3810,6 +3824,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
3810 right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); 3824 right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
3811 right_len = btrfs_file_extent_num_bytes(eb, ei); 3825 right_len = btrfs_file_extent_num_bytes(eb, ei);
3812 right_offset = btrfs_file_extent_offset(eb, ei); 3826 right_offset = btrfs_file_extent_offset(eb, ei);
3827 right_gen = btrfs_file_extent_generation(eb, ei);
3813 3828
3814 if (right_type != BTRFS_FILE_EXTENT_REG) { 3829 if (right_type != BTRFS_FILE_EXTENT_REG) {
3815 ret = 0; 3830 ret = 0;
@@ -3820,7 +3835,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
3820 * Are we at extent 8? If yes, we know the extent is changed. 3835 * Are we at extent 8? If yes, we know the extent is changed.
3821 * This may only happen on the first iteration. 3836 * This may only happen on the first iteration.
3822 */ 3837 */
3823 if (found_key.offset + right_len < ekey->offset) { 3838 if (found_key.offset + right_len <= ekey->offset) {
3824 ret = 0; 3839 ret = 0;
3825 goto out; 3840 goto out;
3826 } 3841 }
@@ -3837,8 +3852,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
3837 /* 3852 /*
3838 * Check if we have the same extent. 3853 * Check if we have the same extent.
3839 */ 3854 */
3840 if (left_disknr + left_offset_fixed != 3855 if (left_disknr != right_disknr ||
3841 right_disknr + right_offset) { 3856 left_offset_fixed != right_offset ||
3857 left_gen != right_gen) {
3842 ret = 0; 3858 ret = 0;
3843 goto out; 3859 goto out;
3844 } 3860 }
@@ -3977,6 +3993,15 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
3977 goto out; 3993 goto out;
3978 3994
3979 ret = process_recorded_refs(sctx); 3995 ret = process_recorded_refs(sctx);
3996 if (ret < 0)
3997 goto out;
3998
3999 /*
4000 * We have processed the refs and thus need to advance send_progress.
4001 * Now, calls to get_cur_xxx will take the updated refs of the current
4002 * inode into account.
4003 */
4004 sctx->send_progress = sctx->cur_ino + 1;
3980 4005
3981out: 4006out:
3982 return ret; 4007 return ret;
@@ -4004,7 +4029,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
4004 goto out; 4029 goto out;
4005 4030
4006 ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL, 4031 ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
4007 &left_mode, &left_uid, &left_gid); 4032 &left_mode, &left_uid, &left_gid, NULL);
4008 if (ret < 0) 4033 if (ret < 0)
4009 goto out; 4034 goto out;
4010 4035
@@ -4015,7 +4040,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
4015 } else { 4040 } else {
4016 ret = get_inode_info(sctx->parent_root, sctx->cur_ino, 4041 ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
4017 NULL, NULL, &right_mode, &right_uid, 4042 NULL, NULL, &right_mode, &right_uid,
4018 &right_gid); 4043 &right_gid, NULL);
4019 if (ret < 0) 4044 if (ret < 0)
4020 goto out; 4045 goto out;
4021 4046
@@ -4074,7 +4099,12 @@ static int changed_inode(struct send_ctx *sctx,
4074 4099
4075 sctx->cur_ino = key->objectid; 4100 sctx->cur_ino = key->objectid;
4076 sctx->cur_inode_new_gen = 0; 4101 sctx->cur_inode_new_gen = 0;
4077 sctx->cur_inode_first_ref_orphan = 0; 4102
4103 /*
4104 * Set send_progress to current inode. This will tell all get_cur_xxx
4105 * functions that the current inode's refs are not updated yet. Later,
4106 * when process_recorded_refs is finished, it is set to cur_ino + 1.
4107 */
4078 sctx->send_progress = sctx->cur_ino; 4108 sctx->send_progress = sctx->cur_ino;
4079 4109
4080 if (result == BTRFS_COMPARE_TREE_NEW || 4110 if (result == BTRFS_COMPARE_TREE_NEW ||
@@ -4098,7 +4128,14 @@ static int changed_inode(struct send_ctx *sctx,
4098 4128
4099 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], 4129 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
4100 right_ii); 4130 right_ii);
4101 if (left_gen != right_gen) 4131
4132 /*
4133 * The cur_ino = root dir case is special here. We can't treat
4134 * the inode as deleted+reused because it would generate a
4135 * stream that tries to delete/mkdir the root dir.
4136 */
4137 if (left_gen != right_gen &&
4138 sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
4102 sctx->cur_inode_new_gen = 1; 4139 sctx->cur_inode_new_gen = 1;
4103 } 4140 }
4104 4141
@@ -4111,8 +4148,7 @@ static int changed_inode(struct send_ctx *sctx,
4111 sctx->cur_inode_mode = btrfs_inode_mode( 4148 sctx->cur_inode_mode = btrfs_inode_mode(
4112 sctx->left_path->nodes[0], left_ii); 4149 sctx->left_path->nodes[0], left_ii);
4113 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) 4150 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
4114 ret = send_create_inode(sctx, sctx->left_path, 4151 ret = send_create_inode_if_needed(sctx);
4115 sctx->cmp_key);
4116 } else if (result == BTRFS_COMPARE_TREE_DELETED) { 4152 } else if (result == BTRFS_COMPARE_TREE_DELETED) {
4117 sctx->cur_inode_gen = right_gen; 4153 sctx->cur_inode_gen = right_gen;
4118 sctx->cur_inode_new = 0; 4154 sctx->cur_inode_new = 0;
@@ -4122,7 +4158,17 @@ static int changed_inode(struct send_ctx *sctx,
4122 sctx->cur_inode_mode = btrfs_inode_mode( 4158 sctx->cur_inode_mode = btrfs_inode_mode(
4123 sctx->right_path->nodes[0], right_ii); 4159 sctx->right_path->nodes[0], right_ii);
4124 } else if (result == BTRFS_COMPARE_TREE_CHANGED) { 4160 } else if (result == BTRFS_COMPARE_TREE_CHANGED) {
4161 /*
4162 * We need to do some special handling in case the inode was
4163 * reported as changed with a changed generation number. This
4164 * means that the original inode was deleted and new inode
4165 * reused the same inum. So we have to treat the old inode as
4166 * deleted and the new one as new.
4167 */
4125 if (sctx->cur_inode_new_gen) { 4168 if (sctx->cur_inode_new_gen) {
4169 /*
4170 * First, process the inode as if it was deleted.
4171 */
4126 sctx->cur_inode_gen = right_gen; 4172 sctx->cur_inode_gen = right_gen;
4127 sctx->cur_inode_new = 0; 4173 sctx->cur_inode_new = 0;
4128 sctx->cur_inode_deleted = 1; 4174 sctx->cur_inode_deleted = 1;
@@ -4135,6 +4181,9 @@ static int changed_inode(struct send_ctx *sctx,
4135 if (ret < 0) 4181 if (ret < 0)
4136 goto out; 4182 goto out;
4137 4183
4184 /*
4185 * Now process the inode as if it was new.
4186 */
4138 sctx->cur_inode_gen = left_gen; 4187 sctx->cur_inode_gen = left_gen;
4139 sctx->cur_inode_new = 1; 4188 sctx->cur_inode_new = 1;
4140 sctx->cur_inode_deleted = 0; 4189 sctx->cur_inode_deleted = 0;
@@ -4142,14 +4191,23 @@ static int changed_inode(struct send_ctx *sctx,
4142 sctx->left_path->nodes[0], left_ii); 4191 sctx->left_path->nodes[0], left_ii);
4143 sctx->cur_inode_mode = btrfs_inode_mode( 4192 sctx->cur_inode_mode = btrfs_inode_mode(
4144 sctx->left_path->nodes[0], left_ii); 4193 sctx->left_path->nodes[0], left_ii);
4145 ret = send_create_inode(sctx, sctx->left_path, 4194 ret = send_create_inode_if_needed(sctx);
4146 sctx->cmp_key);
4147 if (ret < 0) 4195 if (ret < 0)
4148 goto out; 4196 goto out;
4149 4197
4150 ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); 4198 ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
4151 if (ret < 0) 4199 if (ret < 0)
4152 goto out; 4200 goto out;
4201 /*
4202 * Advance send_progress now as we did not get into
4203 * process_recorded_refs_if_needed in the new_gen case.
4204 */
4205 sctx->send_progress = sctx->cur_ino + 1;
4206
4207 /*
4208 * Now process all extents and xattrs of the inode as if
4209 * they were all new.
4210 */
4153 ret = process_all_extents(sctx); 4211 ret = process_all_extents(sctx);
4154 if (ret < 0) 4212 if (ret < 0)
4155 goto out; 4213 goto out;
@@ -4172,6 +4230,16 @@ out:
4172 return ret; 4230 return ret;
4173} 4231}
4174 4232
4233/*
4234 * We have to process new refs before deleted refs, but compare_trees gives us
4235 * the new and deleted refs mixed. To fix this, we record the new/deleted refs
4236 * first and later process them in process_recorded_refs.
4237 * For the cur_inode_new_gen case, we skip recording completely because
4238 * changed_inode did already initiate processing of refs. The reason for this is
4239 * that in this case, compare_tree actually compares the refs of 2 different
4240 * inodes. To fix this, process_all_refs is used in changed_inode to handle all
4241 * refs of the right tree as deleted and all refs of the left tree as new.
4242 */
4175static int changed_ref(struct send_ctx *sctx, 4243static int changed_ref(struct send_ctx *sctx,
4176 enum btrfs_compare_tree_result result) 4244 enum btrfs_compare_tree_result result)
4177{ 4245{
@@ -4192,6 +4260,11 @@ static int changed_ref(struct send_ctx *sctx,
4192 return ret; 4260 return ret;
4193} 4261}
4194 4262
4263/*
4264 * Process new/deleted/changed xattrs. We skip processing in the
4265 * cur_inode_new_gen case because changed_inode did already initiate processing
4266 * of xattrs. The reason is the same as in changed_ref
4267 */
4195static int changed_xattr(struct send_ctx *sctx, 4268static int changed_xattr(struct send_ctx *sctx,
4196 enum btrfs_compare_tree_result result) 4269 enum btrfs_compare_tree_result result)
4197{ 4270{
@@ -4211,6 +4284,11 @@ static int changed_xattr(struct send_ctx *sctx,
4211 return ret; 4284 return ret;
4212} 4285}
4213 4286
4287/*
4288 * Process new/deleted/changed extents. We skip processing in the
4289 * cur_inode_new_gen case because changed_inode did already initiate processing
4290 * of extents. The reason is the same as in changed_ref
4291 */
4214static int changed_extent(struct send_ctx *sctx, 4292static int changed_extent(struct send_ctx *sctx,
4215 enum btrfs_compare_tree_result result) 4293 enum btrfs_compare_tree_result result)
4216{ 4294{
@@ -4227,7 +4305,10 @@ static int changed_extent(struct send_ctx *sctx,
4227 return ret; 4305 return ret;
4228} 4306}
4229 4307
4230 4308/*
4309 * Updates compare related fields in sctx and simply forwards to the actual
4310 * changed_xxx functions.
4311 */
4231static int changed_cb(struct btrfs_root *left_root, 4312static int changed_cb(struct btrfs_root *left_root,
4232 struct btrfs_root *right_root, 4313 struct btrfs_root *right_root,
4233 struct btrfs_path *left_path, 4314 struct btrfs_path *left_path,
@@ -4247,6 +4328,11 @@ static int changed_cb(struct btrfs_root *left_root,
4247 if (ret < 0) 4328 if (ret < 0)
4248 goto out; 4329 goto out;
4249 4330
4331 /* Ignore non-FS objects */
4332 if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
4333 key->objectid == BTRFS_FREE_SPACE_OBJECTID)
4334 goto out;
4335
4250 if (key->type == BTRFS_INODE_ITEM_KEY) 4336 if (key->type == BTRFS_INODE_ITEM_KEY)
4251 ret = changed_inode(sctx, result); 4337 ret = changed_inode(sctx, result);
4252 else if (key->type == BTRFS_INODE_REF_KEY) 4338 else if (key->type == BTRFS_INODE_REF_KEY)
@@ -4299,7 +4385,8 @@ join_trans:
4299 } 4385 }
4300 4386
4301 /* 4387 /*
4302 * Make sure the tree has not changed 4388 * Make sure the tree has not changed after re-joining. We detect this
4389 * by comparing start_ctransid and ctransid. They should always match.
4303 */ 4390 */
4304 spin_lock(&send_root->root_times_lock); 4391 spin_lock(&send_root->root_times_lock);
4305 ctransid = btrfs_root_ctransid(&send_root->root_item); 4392 ctransid = btrfs_root_ctransid(&send_root->root_item);
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 9934e948e57f..1bf4f32fd4ef 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -130,4 +130,5 @@ enum {
130 130
131#ifdef __KERNEL__ 131#ifdef __KERNEL__
132long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); 132long btrfs_ioctl_send(struct file *mnt_file, void __user *arg);
133int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off);
133#endif 134#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 83d6f9f9c220..915ac14c2064 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -243,12 +243,18 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
243 struct btrfs_root *root, const char *function, 243 struct btrfs_root *root, const char *function,
244 unsigned int line, int errno) 244 unsigned int line, int errno)
245{ 245{
246 WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted"); 246 WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted\n");
247 trans->aborted = errno; 247 trans->aborted = errno;
248 /* Nothing used. The other threads that have joined this 248 /* Nothing used. The other threads that have joined this
249 * transaction may be able to continue. */ 249 * transaction may be able to continue. */
250 if (!trans->blocks_used) { 250 if (!trans->blocks_used) {
251 btrfs_printk(root->fs_info, "Aborting unused transaction.\n"); 251 char nbuf[16];
252 const char *errstr;
253
254 errstr = btrfs_decode_error(root->fs_info, errno, nbuf);
255 btrfs_printk(root->fs_info,
256 "%s:%d: Aborting unused transaction(%s).\n",
257 function, line, errstr);
252 return; 258 return;
253 } 259 }
254 trans->transaction->aborted = errno; 260 trans->transaction->aborted = errno;
@@ -407,7 +413,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
407 btrfs_set_opt(info->mount_opt, NODATASUM); 413 btrfs_set_opt(info->mount_opt, NODATASUM);
408 break; 414 break;
409 case Opt_nodatacow: 415 case Opt_nodatacow:
410 printk(KERN_INFO "btrfs: setting nodatacow\n"); 416 if (!btrfs_test_opt(root, COMPRESS) ||
417 !btrfs_test_opt(root, FORCE_COMPRESS)) {
418 printk(KERN_INFO "btrfs: setting nodatacow, compression disabled\n");
419 } else {
420 printk(KERN_INFO "btrfs: setting nodatacow\n");
421 }
422 info->compress_type = BTRFS_COMPRESS_NONE;
423 btrfs_clear_opt(info->mount_opt, COMPRESS);
424 btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
411 btrfs_set_opt(info->mount_opt, NODATACOW); 425 btrfs_set_opt(info->mount_opt, NODATACOW);
412 btrfs_set_opt(info->mount_opt, NODATASUM); 426 btrfs_set_opt(info->mount_opt, NODATASUM);
413 break; 427 break;
@@ -422,10 +436,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
422 compress_type = "zlib"; 436 compress_type = "zlib";
423 info->compress_type = BTRFS_COMPRESS_ZLIB; 437 info->compress_type = BTRFS_COMPRESS_ZLIB;
424 btrfs_set_opt(info->mount_opt, COMPRESS); 438 btrfs_set_opt(info->mount_opt, COMPRESS);
439 btrfs_clear_opt(info->mount_opt, NODATACOW);
440 btrfs_clear_opt(info->mount_opt, NODATASUM);
425 } else if (strcmp(args[0].from, "lzo") == 0) { 441 } else if (strcmp(args[0].from, "lzo") == 0) {
426 compress_type = "lzo"; 442 compress_type = "lzo";
427 info->compress_type = BTRFS_COMPRESS_LZO; 443 info->compress_type = BTRFS_COMPRESS_LZO;
428 btrfs_set_opt(info->mount_opt, COMPRESS); 444 btrfs_set_opt(info->mount_opt, COMPRESS);
445 btrfs_clear_opt(info->mount_opt, NODATACOW);
446 btrfs_clear_opt(info->mount_opt, NODATASUM);
429 btrfs_set_fs_incompat(info, COMPRESS_LZO); 447 btrfs_set_fs_incompat(info, COMPRESS_LZO);
430 } else if (strncmp(args[0].from, "no", 2) == 0) { 448 } else if (strncmp(args[0].from, "no", 2) == 0) {
431 compress_type = "no"; 449 compress_type = "no";
@@ -543,11 +561,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
543 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); 561 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
544 break; 562 break;
545 case Opt_defrag: 563 case Opt_defrag:
546 printk(KERN_INFO "btrfs: enabling auto defrag"); 564 printk(KERN_INFO "btrfs: enabling auto defrag\n");
547 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); 565 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
548 break; 566 break;
549 case Opt_recovery: 567 case Opt_recovery:
550 printk(KERN_INFO "btrfs: enabling auto recovery"); 568 printk(KERN_INFO "btrfs: enabling auto recovery\n");
551 btrfs_set_opt(info->mount_opt, RECOVERY); 569 btrfs_set_opt(info->mount_opt, RECOVERY);
552 break; 570 break;
553 case Opt_skip_balance: 571 case Opt_skip_balance:
@@ -846,18 +864,15 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
846 return 0; 864 return 0;
847 } 865 }
848 866
849 btrfs_wait_ordered_extents(root, 0, 0); 867 btrfs_wait_ordered_extents(root, 0);
850
851 spin_lock(&fs_info->trans_lock);
852 if (!fs_info->running_transaction) {
853 spin_unlock(&fs_info->trans_lock);
854 return 0;
855 }
856 spin_unlock(&fs_info->trans_lock);
857 868
858 trans = btrfs_join_transaction(root); 869 trans = btrfs_attach_transaction(root);
859 if (IS_ERR(trans)) 870 if (IS_ERR(trans)) {
871 /* no transaction, don't bother */
872 if (PTR_ERR(trans) == -ENOENT)
873 return 0;
860 return PTR_ERR(trans); 874 return PTR_ERR(trans);
875 }
861 return btrfs_commit_transaction(trans, root); 876 return btrfs_commit_transaction(trans, root);
862} 877}
863 878
@@ -1508,17 +1523,21 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
1508 1523
1509static int btrfs_freeze(struct super_block *sb) 1524static int btrfs_freeze(struct super_block *sb)
1510{ 1525{
1511 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 1526 struct btrfs_trans_handle *trans;
1512 mutex_lock(&fs_info->transaction_kthread_mutex); 1527 struct btrfs_root *root = btrfs_sb(sb)->tree_root;
1513 mutex_lock(&fs_info->cleaner_mutex); 1528
1514 return 0; 1529 trans = btrfs_attach_transaction(root);
1530 if (IS_ERR(trans)) {
1531 /* no transaction, don't bother */
1532 if (PTR_ERR(trans) == -ENOENT)
1533 return 0;
1534 return PTR_ERR(trans);
1535 }
1536 return btrfs_commit_transaction(trans, root);
1515} 1537}
1516 1538
1517static int btrfs_unfreeze(struct super_block *sb) 1539static int btrfs_unfreeze(struct super_block *sb)
1518{ 1540{
1519 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1520 mutex_unlock(&fs_info->cleaner_mutex);
1521 mutex_unlock(&fs_info->transaction_kthread_mutex);
1522 return 0; 1541 return 0;
1523} 1542}
1524 1543
@@ -1595,7 +1614,7 @@ static int btrfs_interface_init(void)
1595static void btrfs_interface_exit(void) 1614static void btrfs_interface_exit(void)
1596{ 1615{
1597 if (misc_deregister(&btrfs_misc) < 0) 1616 if (misc_deregister(&btrfs_misc) < 0)
1598 printk(KERN_INFO "misc_deregister failed for control device"); 1617 printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
1599} 1618}
1600 1619
1601static int __init init_btrfs_fs(void) 1620static int __init init_btrfs_fs(void)
@@ -1620,10 +1639,14 @@ static int __init init_btrfs_fs(void)
1620 if (err) 1639 if (err)
1621 goto free_extent_io; 1640 goto free_extent_io;
1622 1641
1623 err = btrfs_delayed_inode_init(); 1642 err = ordered_data_init();
1624 if (err) 1643 if (err)
1625 goto free_extent_map; 1644 goto free_extent_map;
1626 1645
1646 err = btrfs_delayed_inode_init();
1647 if (err)
1648 goto free_ordered_data;
1649
1627 err = btrfs_interface_init(); 1650 err = btrfs_interface_init();
1628 if (err) 1651 if (err)
1629 goto free_delayed_inode; 1652 goto free_delayed_inode;
@@ -1641,6 +1664,8 @@ unregister_ioctl:
1641 btrfs_interface_exit(); 1664 btrfs_interface_exit();
1642free_delayed_inode: 1665free_delayed_inode:
1643 btrfs_delayed_inode_exit(); 1666 btrfs_delayed_inode_exit();
1667free_ordered_data:
1668 ordered_data_exit();
1644free_extent_map: 1669free_extent_map:
1645 extent_map_exit(); 1670 extent_map_exit();
1646free_extent_io: 1671free_extent_io:
@@ -1657,6 +1682,7 @@ static void __exit exit_btrfs_fs(void)
1657{ 1682{
1658 btrfs_destroy_cachep(); 1683 btrfs_destroy_cachep();
1659 btrfs_delayed_inode_exit(); 1684 btrfs_delayed_inode_exit();
1685 ordered_data_exit();
1660 extent_map_exit(); 1686 extent_map_exit();
1661 extent_io_exit(); 1687 extent_io_exit();
1662 btrfs_interface_exit(); 1688 btrfs_interface_exit();
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 27c26004e050..77db875b5116 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -53,7 +53,7 @@ static noinline void switch_commit_root(struct btrfs_root *root)
53/* 53/*
54 * either allocate a new transaction or hop into the existing one 54 * either allocate a new transaction or hop into the existing one
55 */ 55 */
56static noinline int join_transaction(struct btrfs_root *root, int nofail) 56static noinline int join_transaction(struct btrfs_root *root, int type)
57{ 57{
58 struct btrfs_transaction *cur_trans; 58 struct btrfs_transaction *cur_trans;
59 struct btrfs_fs_info *fs_info = root->fs_info; 59 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -67,7 +67,13 @@ loop:
67 } 67 }
68 68
69 if (fs_info->trans_no_join) { 69 if (fs_info->trans_no_join) {
70 if (!nofail) { 70 /*
71 * If we are JOIN_NOLOCK we're already committing a current
72 * transaction, we just need a handle to deal with something
73 * when committing the transaction, such as inode cache and
74 * space cache. It is a special case.
75 */
76 if (type != TRANS_JOIN_NOLOCK) {
71 spin_unlock(&fs_info->trans_lock); 77 spin_unlock(&fs_info->trans_lock);
72 return -EBUSY; 78 return -EBUSY;
73 } 79 }
@@ -87,6 +93,13 @@ loop:
87 } 93 }
88 spin_unlock(&fs_info->trans_lock); 94 spin_unlock(&fs_info->trans_lock);
89 95
96 /*
97 * If we are ATTACH, we just want to catch the current transaction,
98 * and commit it. If there is no transaction, just return ENOENT.
99 */
100 if (type == TRANS_ATTACH)
101 return -ENOENT;
102
90 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 103 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
91 if (!cur_trans) 104 if (!cur_trans)
92 return -ENOMEM; 105 return -ENOMEM;
@@ -267,13 +280,6 @@ static void wait_current_trans(struct btrfs_root *root)
267 } 280 }
268} 281}
269 282
270enum btrfs_trans_type {
271 TRANS_START,
272 TRANS_JOIN,
273 TRANS_USERSPACE,
274 TRANS_JOIN_NOLOCK,
275};
276
277static int may_wait_transaction(struct btrfs_root *root, int type) 283static int may_wait_transaction(struct btrfs_root *root, int type)
278{ 284{
279 if (root->fs_info->log_root_recovering) 285 if (root->fs_info->log_root_recovering)
@@ -290,7 +296,8 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
290} 296}
291 297
292static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 298static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
293 u64 num_items, int type) 299 u64 num_items, int type,
300 int noflush)
294{ 301{
295 struct btrfs_trans_handle *h; 302 struct btrfs_trans_handle *h;
296 struct btrfs_transaction *cur_trans; 303 struct btrfs_transaction *cur_trans;
@@ -324,9 +331,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
324 } 331 }
325 332
326 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 333 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
327 ret = btrfs_block_rsv_add(root, 334 if (noflush)
328 &root->fs_info->trans_block_rsv, 335 ret = btrfs_block_rsv_add_noflush(root,
329 num_bytes); 336 &root->fs_info->trans_block_rsv,
337 num_bytes);
338 else
339 ret = btrfs_block_rsv_add(root,
340 &root->fs_info->trans_block_rsv,
341 num_bytes);
330 if (ret) 342 if (ret)
331 return ERR_PTR(ret); 343 return ERR_PTR(ret);
332 } 344 }
@@ -335,19 +347,34 @@ again:
335 if (!h) 347 if (!h)
336 return ERR_PTR(-ENOMEM); 348 return ERR_PTR(-ENOMEM);
337 349
338 sb_start_intwrite(root->fs_info->sb); 350 /*
351 * If we are JOIN_NOLOCK we're already committing a transaction and
352 * waiting on this guy, so we don't need to do the sb_start_intwrite
353 * because we're already holding a ref. We need this because we could
354 * have raced in and did an fsync() on a file which can kick a commit
355 * and then we deadlock with somebody doing a freeze.
356 *
357 * If we are ATTACH, it means we just want to catch the current
358 * transaction and commit it, so we needn't do sb_start_intwrite().
359 */
360 if (type < TRANS_JOIN_NOLOCK)
361 sb_start_intwrite(root->fs_info->sb);
339 362
340 if (may_wait_transaction(root, type)) 363 if (may_wait_transaction(root, type))
341 wait_current_trans(root); 364 wait_current_trans(root);
342 365
343 do { 366 do {
344 ret = join_transaction(root, type == TRANS_JOIN_NOLOCK); 367 ret = join_transaction(root, type);
345 if (ret == -EBUSY) 368 if (ret == -EBUSY)
346 wait_current_trans(root); 369 wait_current_trans(root);
347 } while (ret == -EBUSY); 370 } while (ret == -EBUSY);
348 371
349 if (ret < 0) { 372 if (ret < 0) {
350 sb_end_intwrite(root->fs_info->sb); 373 /* We must get the transaction if we are JOIN_NOLOCK. */
374 BUG_ON(type == TRANS_JOIN_NOLOCK);
375
376 if (type < TRANS_JOIN_NOLOCK)
377 sb_end_intwrite(root->fs_info->sb);
351 kmem_cache_free(btrfs_trans_handle_cachep, h); 378 kmem_cache_free(btrfs_trans_handle_cachep, h);
352 return ERR_PTR(ret); 379 return ERR_PTR(ret);
353 } 380 }
@@ -367,7 +394,9 @@ again:
367 h->aborted = 0; 394 h->aborted = 0;
368 h->qgroup_reserved = qgroup_reserved; 395 h->qgroup_reserved = qgroup_reserved;
369 h->delayed_ref_elem.seq = 0; 396 h->delayed_ref_elem.seq = 0;
397 h->type = type;
370 INIT_LIST_HEAD(&h->qgroup_ref_list); 398 INIT_LIST_HEAD(&h->qgroup_ref_list);
399 INIT_LIST_HEAD(&h->new_bgs);
371 400
372 smp_mb(); 401 smp_mb();
373 if (cur_trans->blocked && may_wait_transaction(root, type)) { 402 if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -393,21 +422,33 @@ got_it:
393struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 422struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
394 int num_items) 423 int num_items)
395{ 424{
396 return start_transaction(root, num_items, TRANS_START); 425 return start_transaction(root, num_items, TRANS_START, 0);
426}
427
428struct btrfs_trans_handle *btrfs_start_transaction_noflush(
429 struct btrfs_root *root, int num_items)
430{
431 return start_transaction(root, num_items, TRANS_START, 1);
397} 432}
433
398struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) 434struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
399{ 435{
400 return start_transaction(root, 0, TRANS_JOIN); 436 return start_transaction(root, 0, TRANS_JOIN, 0);
401} 437}
402 438
403struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) 439struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
404{ 440{
405 return start_transaction(root, 0, TRANS_JOIN_NOLOCK); 441 return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0);
406} 442}
407 443
408struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) 444struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
409{ 445{
410 return start_transaction(root, 0, TRANS_USERSPACE); 446 return start_transaction(root, 0, TRANS_USERSPACE, 0);
447}
448
449struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
450{
451 return start_transaction(root, 0, TRANS_ATTACH, 0);
411} 452}
412 453
413/* wait for a transaction commit to be fully complete */ 454/* wait for a transaction commit to be fully complete */
@@ -506,11 +547,12 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
506} 547}
507 548
508static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 549static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
509 struct btrfs_root *root, int throttle, int lock) 550 struct btrfs_root *root, int throttle)
510{ 551{
511 struct btrfs_transaction *cur_trans = trans->transaction; 552 struct btrfs_transaction *cur_trans = trans->transaction;
512 struct btrfs_fs_info *info = root->fs_info; 553 struct btrfs_fs_info *info = root->fs_info;
513 int count = 0; 554 int count = 0;
555 int lock = (trans->type != TRANS_JOIN_NOLOCK);
514 int err = 0; 556 int err = 0;
515 557
516 if (--trans->use_count) { 558 if (--trans->use_count) {
@@ -536,6 +578,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
536 trans->qgroup_reserved = 0; 578 trans->qgroup_reserved = 0;
537 } 579 }
538 580
581 if (!list_empty(&trans->new_bgs))
582 btrfs_create_pending_block_groups(trans, root);
583
539 while (count < 2) { 584 while (count < 2) {
540 unsigned long cur = trans->delayed_ref_updates; 585 unsigned long cur = trans->delayed_ref_updates;
541 trans->delayed_ref_updates = 0; 586 trans->delayed_ref_updates = 0;
@@ -551,7 +596,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
551 btrfs_trans_release_metadata(trans, root); 596 btrfs_trans_release_metadata(trans, root);
552 trans->block_rsv = NULL; 597 trans->block_rsv = NULL;
553 598
554 sb_end_intwrite(root->fs_info->sb); 599 if (!list_empty(&trans->new_bgs))
600 btrfs_create_pending_block_groups(trans, root);
555 601
556 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 602 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
557 should_end_transaction(trans, root)) { 603 should_end_transaction(trans, root)) {
@@ -573,6 +619,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
573 } 619 }
574 } 620 }
575 621
622 if (trans->type < TRANS_JOIN_NOLOCK)
623 sb_end_intwrite(root->fs_info->sb);
624
576 WARN_ON(cur_trans != info->running_transaction); 625 WARN_ON(cur_trans != info->running_transaction);
577 WARN_ON(atomic_read(&cur_trans->num_writers) < 1); 626 WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
578 atomic_dec(&cur_trans->num_writers); 627 atomic_dec(&cur_trans->num_writers);
@@ -604,7 +653,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
604{ 653{
605 int ret; 654 int ret;
606 655
607 ret = __btrfs_end_transaction(trans, root, 0, 1); 656 ret = __btrfs_end_transaction(trans, root, 0);
608 if (ret) 657 if (ret)
609 return ret; 658 return ret;
610 return 0; 659 return 0;
@@ -615,18 +664,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
615{ 664{
616 int ret; 665 int ret;
617 666
618 ret = __btrfs_end_transaction(trans, root, 1, 1); 667 ret = __btrfs_end_transaction(trans, root, 1);
619 if (ret)
620 return ret;
621 return 0;
622}
623
624int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
625 struct btrfs_root *root)
626{
627 int ret;
628
629 ret = __btrfs_end_transaction(trans, root, 0, 0);
630 if (ret) 668 if (ret)
631 return ret; 669 return ret;
632 return 0; 670 return 0;
@@ -635,7 +673,7 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
635int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, 673int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
636 struct btrfs_root *root) 674 struct btrfs_root *root)
637{ 675{
638 return __btrfs_end_transaction(trans, root, 1, 1); 676 return __btrfs_end_transaction(trans, root, 1);
639} 677}
640 678
641/* 679/*
@@ -649,13 +687,15 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
649 int err = 0; 687 int err = 0;
650 int werr = 0; 688 int werr = 0;
651 struct address_space *mapping = root->fs_info->btree_inode->i_mapping; 689 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
690 struct extent_state *cached_state = NULL;
652 u64 start = 0; 691 u64 start = 0;
653 u64 end; 692 u64 end;
654 693
655 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 694 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
656 mark)) { 695 mark, &cached_state)) {
657 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark, 696 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
658 GFP_NOFS); 697 mark, &cached_state, GFP_NOFS);
698 cached_state = NULL;
659 err = filemap_fdatawrite_range(mapping, start, end); 699 err = filemap_fdatawrite_range(mapping, start, end);
660 if (err) 700 if (err)
661 werr = err; 701 werr = err;
@@ -679,12 +719,14 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
679 int err = 0; 719 int err = 0;
680 int werr = 0; 720 int werr = 0;
681 struct address_space *mapping = root->fs_info->btree_inode->i_mapping; 721 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
722 struct extent_state *cached_state = NULL;
682 u64 start = 0; 723 u64 start = 0;
683 u64 end; 724 u64 end;
684 725
685 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 726 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
686 EXTENT_NEED_WAIT)) { 727 EXTENT_NEED_WAIT, &cached_state)) {
687 clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS); 728 clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
729 0, 0, &cached_state, GFP_NOFS);
688 err = filemap_fdatawait_range(mapping, start, end); 730 err = filemap_fdatawait_range(mapping, start, end);
689 if (err) 731 if (err)
690 werr = err; 732 werr = err;
@@ -955,6 +997,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
955 struct btrfs_root *parent_root; 997 struct btrfs_root *parent_root;
956 struct btrfs_block_rsv *rsv; 998 struct btrfs_block_rsv *rsv;
957 struct inode *parent_inode; 999 struct inode *parent_inode;
1000 struct btrfs_path *path;
1001 struct btrfs_dir_item *dir_item;
958 struct dentry *parent; 1002 struct dentry *parent;
959 struct dentry *dentry; 1003 struct dentry *dentry;
960 struct extent_buffer *tmp; 1004 struct extent_buffer *tmp;
@@ -967,18 +1011,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
967 u64 root_flags; 1011 u64 root_flags;
968 uuid_le new_uuid; 1012 uuid_le new_uuid;
969 1013
970 rsv = trans->block_rsv; 1014 path = btrfs_alloc_path();
1015 if (!path) {
1016 ret = pending->error = -ENOMEM;
1017 goto path_alloc_fail;
1018 }
971 1019
972 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 1020 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
973 if (!new_root_item) { 1021 if (!new_root_item) {
974 ret = pending->error = -ENOMEM; 1022 ret = pending->error = -ENOMEM;
975 goto fail; 1023 goto root_item_alloc_fail;
976 } 1024 }
977 1025
978 ret = btrfs_find_free_objectid(tree_root, &objectid); 1026 ret = btrfs_find_free_objectid(tree_root, &objectid);
979 if (ret) { 1027 if (ret) {
980 pending->error = ret; 1028 pending->error = ret;
981 goto fail; 1029 goto no_free_objectid;
982 } 1030 }
983 1031
984 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 1032 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
@@ -988,22 +1036,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
988 to_reserve); 1036 to_reserve);
989 if (ret) { 1037 if (ret) {
990 pending->error = ret; 1038 pending->error = ret;
991 goto fail; 1039 goto no_free_objectid;
992 } 1040 }
993 } 1041 }
994 1042
995 ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid, 1043 ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid,
996 objectid, pending->inherit); 1044 objectid, pending->inherit);
997 kfree(pending->inherit);
998 if (ret) { 1045 if (ret) {
999 pending->error = ret; 1046 pending->error = ret;
1000 goto fail; 1047 goto no_free_objectid;
1001 } 1048 }
1002 1049
1003 key.objectid = objectid; 1050 key.objectid = objectid;
1004 key.offset = (u64)-1; 1051 key.offset = (u64)-1;
1005 key.type = BTRFS_ROOT_ITEM_KEY; 1052 key.type = BTRFS_ROOT_ITEM_KEY;
1006 1053
1054 rsv = trans->block_rsv;
1007 trans->block_rsv = &pending->block_rsv; 1055 trans->block_rsv = &pending->block_rsv;
1008 1056
1009 dentry = pending->dentry; 1057 dentry = pending->dentry;
@@ -1017,24 +1065,21 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1017 */ 1065 */
1018 ret = btrfs_set_inode_index(parent_inode, &index); 1066 ret = btrfs_set_inode_index(parent_inode, &index);
1019 BUG_ON(ret); /* -ENOMEM */ 1067 BUG_ON(ret); /* -ENOMEM */
1020 ret = btrfs_insert_dir_item(trans, parent_root, 1068
1021 dentry->d_name.name, dentry->d_name.len, 1069 /* check if there is a file/dir which has the same name. */
1022 parent_inode, &key, 1070 dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
1023 BTRFS_FT_DIR, index); 1071 btrfs_ino(parent_inode),
1024 if (ret == -EEXIST) { 1072 dentry->d_name.name,
1073 dentry->d_name.len, 0);
1074 if (dir_item != NULL && !IS_ERR(dir_item)) {
1025 pending->error = -EEXIST; 1075 pending->error = -EEXIST;
1026 dput(parent);
1027 goto fail; 1076 goto fail;
1028 } else if (ret) { 1077 } else if (IS_ERR(dir_item)) {
1029 goto abort_trans_dput; 1078 ret = PTR_ERR(dir_item);
1079 btrfs_abort_transaction(trans, root, ret);
1080 goto fail;
1030 } 1081 }
1031 1082 btrfs_release_path(path);
1032 btrfs_i_size_write(parent_inode, parent_inode->i_size +
1033 dentry->d_name.len * 2);
1034 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
1035 ret = btrfs_update_inode(trans, parent_root, parent_inode);
1036 if (ret)
1037 goto abort_trans_dput;
1038 1083
1039 /* 1084 /*
1040 * pull in the delayed directory update 1085 * pull in the delayed directory update
@@ -1043,8 +1088,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1043 * snapshot 1088 * snapshot
1044 */ 1089 */
1045 ret = btrfs_run_delayed_items(trans, root); 1090 ret = btrfs_run_delayed_items(trans, root);
1046 if (ret) { /* Transaction aborted */ 1091 if (ret) { /* Transaction aborted */
1047 dput(parent); 1092 btrfs_abort_transaction(trans, root, ret);
1048 goto fail; 1093 goto fail;
1049 } 1094 }
1050 1095
@@ -1079,7 +1124,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1079 if (ret) { 1124 if (ret) {
1080 btrfs_tree_unlock(old); 1125 btrfs_tree_unlock(old);
1081 free_extent_buffer(old); 1126 free_extent_buffer(old);
1082 goto abort_trans_dput; 1127 btrfs_abort_transaction(trans, root, ret);
1128 goto fail;
1083 } 1129 }
1084 1130
1085 btrfs_set_lock_blocking(old); 1131 btrfs_set_lock_blocking(old);
@@ -1088,8 +1134,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1088 /* clean up in any case */ 1134 /* clean up in any case */
1089 btrfs_tree_unlock(old); 1135 btrfs_tree_unlock(old);
1090 free_extent_buffer(old); 1136 free_extent_buffer(old);
1091 if (ret) 1137 if (ret) {
1092 goto abort_trans_dput; 1138 btrfs_abort_transaction(trans, root, ret);
1139 goto fail;
1140 }
1093 1141
1094 /* see comments in should_cow_block() */ 1142 /* see comments in should_cow_block() */
1095 root->force_cow = 1; 1143 root->force_cow = 1;
@@ -1101,8 +1149,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1101 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); 1149 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
1102 btrfs_tree_unlock(tmp); 1150 btrfs_tree_unlock(tmp);
1103 free_extent_buffer(tmp); 1151 free_extent_buffer(tmp);
1104 if (ret) 1152 if (ret) {
1105 goto abort_trans_dput; 1153 btrfs_abort_transaction(trans, root, ret);
1154 goto fail;
1155 }
1106 1156
1107 /* 1157 /*
1108 * insert root back/forward references 1158 * insert root back/forward references
@@ -1111,32 +1161,58 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1111 parent_root->root_key.objectid, 1161 parent_root->root_key.objectid,
1112 btrfs_ino(parent_inode), index, 1162 btrfs_ino(parent_inode), index,
1113 dentry->d_name.name, dentry->d_name.len); 1163 dentry->d_name.name, dentry->d_name.len);
1114 dput(parent); 1164 if (ret) {
1115 if (ret) 1165 btrfs_abort_transaction(trans, root, ret);
1116 goto fail; 1166 goto fail;
1167 }
1117 1168
1118 key.offset = (u64)-1; 1169 key.offset = (u64)-1;
1119 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); 1170 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
1120 if (IS_ERR(pending->snap)) { 1171 if (IS_ERR(pending->snap)) {
1121 ret = PTR_ERR(pending->snap); 1172 ret = PTR_ERR(pending->snap);
1122 goto abort_trans; 1173 btrfs_abort_transaction(trans, root, ret);
1174 goto fail;
1123 } 1175 }
1124 1176
1125 ret = btrfs_reloc_post_snapshot(trans, pending); 1177 ret = btrfs_reloc_post_snapshot(trans, pending);
1178 if (ret) {
1179 btrfs_abort_transaction(trans, root, ret);
1180 goto fail;
1181 }
1182
1183 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1184 if (ret) {
1185 btrfs_abort_transaction(trans, root, ret);
1186 goto fail;
1187 }
1188
1189 ret = btrfs_insert_dir_item(trans, parent_root,
1190 dentry->d_name.name, dentry->d_name.len,
1191 parent_inode, &key,
1192 BTRFS_FT_DIR, index);
 1193 /* We have checked the name at the beginning, so it is impossible. */
1194 BUG_ON(ret == -EEXIST);
1195 if (ret) {
1196 btrfs_abort_transaction(trans, root, ret);
1197 goto fail;
1198 }
1199
1200 btrfs_i_size_write(parent_inode, parent_inode->i_size +
1201 dentry->d_name.len * 2);
1202 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
1203 ret = btrfs_update_inode(trans, parent_root, parent_inode);
1126 if (ret) 1204 if (ret)
1127 goto abort_trans; 1205 btrfs_abort_transaction(trans, root, ret);
1128 ret = 0;
1129fail: 1206fail:
1130 kfree(new_root_item); 1207 dput(parent);
1131 trans->block_rsv = rsv; 1208 trans->block_rsv = rsv;
1209no_free_objectid:
1210 kfree(new_root_item);
1211root_item_alloc_fail:
1212 btrfs_free_path(path);
1213path_alloc_fail:
1132 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); 1214 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
1133 return ret; 1215 return ret;
1134
1135abort_trans_dput:
1136 dput(parent);
1137abort_trans:
1138 btrfs_abort_transaction(trans, root, ret);
1139 goto fail;
1140} 1216}
1141 1217
1142/* 1218/*
@@ -1229,6 +1305,16 @@ static void do_async_commit(struct work_struct *work)
1229 struct btrfs_async_commit *ac = 1305 struct btrfs_async_commit *ac =
1230 container_of(work, struct btrfs_async_commit, work.work); 1306 container_of(work, struct btrfs_async_commit, work.work);
1231 1307
1308 /*
1309 * We've got freeze protection passed with the transaction.
1310 * Tell lockdep about it.
1311 */
1312 rwsem_acquire_read(
1313 &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1314 0, 1, _THIS_IP_);
1315
1316 current->journal_info = ac->newtrans;
1317
1232 btrfs_commit_transaction(ac->newtrans, ac->root); 1318 btrfs_commit_transaction(ac->newtrans, ac->root);
1233 kfree(ac); 1319 kfree(ac);
1234} 1320}
@@ -1258,6 +1344,14 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1258 atomic_inc(&cur_trans->use_count); 1344 atomic_inc(&cur_trans->use_count);
1259 1345
1260 btrfs_end_transaction(trans, root); 1346 btrfs_end_transaction(trans, root);
1347
1348 /*
1349 * Tell lockdep we've released the freeze rwsem, since the
1350 * async commit thread will be the one to unlock it.
1351 */
1352 rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1353 1, _THIS_IP_);
1354
1261 schedule_delayed_work(&ac->work, 0); 1355 schedule_delayed_work(&ac->work, 0);
1262 1356
1263 /* wait for transaction to start and unblock */ 1357 /* wait for transaction to start and unblock */
@@ -1348,6 +1442,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1348 */ 1442 */
1349 cur_trans->delayed_refs.flushing = 1; 1443 cur_trans->delayed_refs.flushing = 1;
1350 1444
1445 if (!list_empty(&trans->new_bgs))
1446 btrfs_create_pending_block_groups(trans, root);
1447
1351 ret = btrfs_run_delayed_refs(trans, root, 0); 1448 ret = btrfs_run_delayed_refs(trans, root, 0);
1352 if (ret) 1449 if (ret)
1353 goto cleanup_transaction; 1450 goto cleanup_transaction;
@@ -1403,7 +1500,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1403 1500
1404 if (flush_on_commit || snap_pending) { 1501 if (flush_on_commit || snap_pending) {
1405 btrfs_start_delalloc_inodes(root, 1); 1502 btrfs_start_delalloc_inodes(root, 1);
1406 btrfs_wait_ordered_extents(root, 0, 1); 1503 btrfs_wait_ordered_extents(root, 1);
1407 } 1504 }
1408 1505
1409 ret = btrfs_run_delayed_items(trans, root); 1506 ret = btrfs_run_delayed_items(trans, root);
@@ -1456,13 +1553,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1456 */ 1553 */
1457 mutex_lock(&root->fs_info->reloc_mutex); 1554 mutex_lock(&root->fs_info->reloc_mutex);
1458 1555
1459 ret = btrfs_run_delayed_items(trans, root); 1556 /*
1557 * We needn't worry about the delayed items because we will
1558 * deal with them in create_pending_snapshot(), which is the
1559 * core function of the snapshot creation.
1560 */
1561 ret = create_pending_snapshots(trans, root->fs_info);
1460 if (ret) { 1562 if (ret) {
1461 mutex_unlock(&root->fs_info->reloc_mutex); 1563 mutex_unlock(&root->fs_info->reloc_mutex);
1462 goto cleanup_transaction; 1564 goto cleanup_transaction;
1463 } 1565 }
1464 1566
1465 ret = create_pending_snapshots(trans, root->fs_info); 1567 /*
1568 * We insert the dir indexes of the snapshots and update the inode
1569 * of the snapshots' parents after the snapshot creation, so there
1570 * are some delayed items which are not dealt with. Now deal with
1571 * them.
1572 *
1573 * We needn't worry that this operation will corrupt the snapshots,
 1574 * because all the trees which are snapshotted will be forced to COW
1575 * the nodes and leaves.
1576 */
1577 ret = btrfs_run_delayed_items(trans, root);
1466 if (ret) { 1578 if (ret) {
1467 mutex_unlock(&root->fs_info->reloc_mutex); 1579 mutex_unlock(&root->fs_info->reloc_mutex);
1468 goto cleanup_transaction; 1580 goto cleanup_transaction;
@@ -1584,7 +1696,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1584 put_transaction(cur_trans); 1696 put_transaction(cur_trans);
1585 put_transaction(cur_trans); 1697 put_transaction(cur_trans);
1586 1698
1587 sb_end_intwrite(root->fs_info->sb); 1699 if (trans->type < TRANS_JOIN_NOLOCK)
1700 sb_end_intwrite(root->fs_info->sb);
1588 1701
1589 trace_btrfs_transaction_commit(root); 1702 trace_btrfs_transaction_commit(root);
1590 1703
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e8b8416c688b..80961947a6b2 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -47,6 +47,14 @@ struct btrfs_transaction {
47 int aborted; 47 int aborted;
48}; 48};
49 49
50enum btrfs_trans_type {
51 TRANS_START,
52 TRANS_JOIN,
53 TRANS_USERSPACE,
54 TRANS_JOIN_NOLOCK,
55 TRANS_ATTACH,
56};
57
50struct btrfs_trans_handle { 58struct btrfs_trans_handle {
51 u64 transid; 59 u64 transid;
52 u64 bytes_reserved; 60 u64 bytes_reserved;
@@ -58,8 +66,9 @@ struct btrfs_trans_handle {
58 struct btrfs_transaction *transaction; 66 struct btrfs_transaction *transaction;
59 struct btrfs_block_rsv *block_rsv; 67 struct btrfs_block_rsv *block_rsv;
60 struct btrfs_block_rsv *orig_rsv; 68 struct btrfs_block_rsv *orig_rsv;
61 int aborted; 69 short aborted;
62 int adding_csums; 70 short adding_csums;
71 enum btrfs_trans_type type;
63 /* 72 /*
64 * this root is only needed to validate that the root passed to 73 * this root is only needed to validate that the root passed to
65 * start_transaction is the same as the one passed to end_transaction. 74 * start_transaction is the same as the one passed to end_transaction.
@@ -68,6 +77,7 @@ struct btrfs_trans_handle {
68 struct btrfs_root *root; 77 struct btrfs_root *root;
69 struct seq_list delayed_ref_elem; 78 struct seq_list delayed_ref_elem;
70 struct list_head qgroup_ref_list; 79 struct list_head qgroup_ref_list;
80 struct list_head new_bgs;
71}; 81};
72 82
73struct btrfs_pending_snapshot { 83struct btrfs_pending_snapshot {
@@ -88,16 +98,18 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
88{ 98{
89 BTRFS_I(inode)->last_trans = trans->transaction->transid; 99 BTRFS_I(inode)->last_trans = trans->transaction->transid;
90 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 100 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
101 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
91} 102}
92 103
93int btrfs_end_transaction(struct btrfs_trans_handle *trans, 104int btrfs_end_transaction(struct btrfs_trans_handle *trans,
94 struct btrfs_root *root); 105 struct btrfs_root *root);
95int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
96 struct btrfs_root *root);
97struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 106struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
98 int num_items); 107 int num_items);
108struct btrfs_trans_handle *btrfs_start_transaction_noflush(
109 struct btrfs_root *root, int num_items);
99struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); 110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
100struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); 111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
112struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
101struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); 113struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
102int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); 114int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
103int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 115int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c86670f4f285..81e407d9677a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,13 +18,16 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/list_sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "transaction.h" 23#include "transaction.h"
23#include "disk-io.h" 24#include "disk-io.h"
24#include "locking.h" 25#include "locking.h"
25#include "print-tree.h" 26#include "print-tree.h"
27#include "backref.h"
26#include "compat.h" 28#include "compat.h"
27#include "tree-log.h" 29#include "tree-log.h"
30#include "hash.h"
28 31
29/* magic values for the inode_only field in btrfs_log_inode: 32/* magic values for the inode_only field in btrfs_log_inode:
30 * 33 *
@@ -146,7 +149,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
146 root->log_multiple_pids = true; 149 root->log_multiple_pids = true;
147 } 150 }
148 151
149 root->log_batch++; 152 atomic_inc(&root->log_batch);
150 atomic_inc(&root->log_writers); 153 atomic_inc(&root->log_writers);
151 mutex_unlock(&root->log_mutex); 154 mutex_unlock(&root->log_mutex);
152 return 0; 155 return 0;
@@ -165,7 +168,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
165 err = ret; 168 err = ret;
166 } 169 }
167 mutex_unlock(&root->fs_info->tree_log_mutex); 170 mutex_unlock(&root->fs_info->tree_log_mutex);
168 root->log_batch++; 171 atomic_inc(&root->log_batch);
169 atomic_inc(&root->log_writers); 172 atomic_inc(&root->log_writers);
170 mutex_unlock(&root->log_mutex); 173 mutex_unlock(&root->log_mutex);
171 return err; 174 return err;
@@ -484,7 +487,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
484 int found_type; 487 int found_type;
485 u64 mask = root->sectorsize - 1; 488 u64 mask = root->sectorsize - 1;
486 u64 extent_end; 489 u64 extent_end;
487 u64 alloc_hint;
488 u64 start = key->offset; 490 u64 start = key->offset;
489 u64 saved_nbytes; 491 u64 saved_nbytes;
490 struct btrfs_file_extent_item *item; 492 struct btrfs_file_extent_item *item;
@@ -550,8 +552,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
550 552
551 saved_nbytes = inode_get_bytes(inode); 553 saved_nbytes = inode_get_bytes(inode);
552 /* drop any overlapping extents */ 554 /* drop any overlapping extents */
553 ret = btrfs_drop_extents(trans, inode, start, extent_end, 555 ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
554 &alloc_hint, 1);
555 BUG_ON(ret); 556 BUG_ON(ret);
556 557
557 if (found_type == BTRFS_FILE_EXTENT_REG || 558 if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -744,6 +745,7 @@ out:
744 */ 745 */
745static noinline int backref_in_log(struct btrfs_root *log, 746static noinline int backref_in_log(struct btrfs_root *log,
746 struct btrfs_key *key, 747 struct btrfs_key *key,
748 u64 ref_objectid,
747 char *name, int namelen) 749 char *name, int namelen)
748{ 750{
749 struct btrfs_path *path; 751 struct btrfs_path *path;
@@ -764,8 +766,17 @@ static noinline int backref_in_log(struct btrfs_root *log,
764 if (ret != 0) 766 if (ret != 0)
765 goto out; 767 goto out;
766 768
767 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
768 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 769 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
770
771 if (key->type == BTRFS_INODE_EXTREF_KEY) {
772 if (btrfs_find_name_in_ext_backref(path, ref_objectid,
773 name, namelen, NULL))
774 match = 1;
775
776 goto out;
777 }
778
779 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
769 ptr_end = ptr + item_size; 780 ptr_end = ptr + item_size;
770 while (ptr < ptr_end) { 781 while (ptr < ptr_end) {
771 ref = (struct btrfs_inode_ref *)ptr; 782 ref = (struct btrfs_inode_ref *)ptr;
@@ -786,91 +797,42 @@ out:
786 return match; 797 return match;
787} 798}
788 799
789 800static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
790/*
791 * replay one inode back reference item found in the log tree.
792 * eb, slot and key refer to the buffer and key found in the log tree.
793 * root is the destination we are replaying into, and path is for temp
794 * use by this function. (it should be released on return).
795 */
796static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
797 struct btrfs_root *root, 801 struct btrfs_root *root,
798 struct btrfs_root *log,
799 struct btrfs_path *path, 802 struct btrfs_path *path,
800 struct extent_buffer *eb, int slot, 803 struct btrfs_root *log_root,
801 struct btrfs_key *key) 804 struct inode *dir, struct inode *inode,
805 struct extent_buffer *eb,
806 u64 inode_objectid, u64 parent_objectid,
807 u64 ref_index, char *name, int namelen,
808 int *search_done)
802{ 809{
803 struct btrfs_inode_ref *ref;
804 struct btrfs_dir_item *di;
805 struct inode *dir;
806 struct inode *inode;
807 unsigned long ref_ptr;
808 unsigned long ref_end;
809 char *name;
810 int namelen;
811 int ret; 810 int ret;
812 int search_done = 0; 811 char *victim_name;
813 812 int victim_name_len;
814 /* 813 struct extent_buffer *leaf;
815 * it is possible that we didn't log all the parent directories 814 struct btrfs_dir_item *di;
816 * for a given inode. If we don't find the dir, just don't 815 struct btrfs_key search_key;
817 * copy the back ref in. The link count fixup code will take 816 struct btrfs_inode_extref *extref;
818 * care of the rest
819 */
820 dir = read_one_inode(root, key->offset);
821 if (!dir)
822 return -ENOENT;
823
824 inode = read_one_inode(root, key->objectid);
825 if (!inode) {
826 iput(dir);
827 return -EIO;
828 }
829
830 ref_ptr = btrfs_item_ptr_offset(eb, slot);
831 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
832 817
833again: 818again:
834 ref = (struct btrfs_inode_ref *)ref_ptr; 819 /* Search old style refs */
835 820 search_key.objectid = inode_objectid;
836 namelen = btrfs_inode_ref_name_len(eb, ref); 821 search_key.type = BTRFS_INODE_REF_KEY;
837 name = kmalloc(namelen, GFP_NOFS); 822 search_key.offset = parent_objectid;
838 BUG_ON(!name); 823 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
839
840 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
841
842 /* if we already have a perfect match, we're done */
843 if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
844 btrfs_inode_ref_index(eb, ref),
845 name, namelen)) {
846 goto out;
847 }
848
849 /*
850 * look for a conflicting back reference in the metadata.
851 * if we find one we have to unlink that name of the file
852 * before we add our new link. Later on, we overwrite any
853 * existing back reference, and we don't want to create
854 * dangling pointers in the directory.
855 */
856
857 if (search_done)
858 goto insert;
859
860 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
861 if (ret == 0) { 824 if (ret == 0) {
862 char *victim_name;
863 int victim_name_len;
864 struct btrfs_inode_ref *victim_ref; 825 struct btrfs_inode_ref *victim_ref;
865 unsigned long ptr; 826 unsigned long ptr;
866 unsigned long ptr_end; 827 unsigned long ptr_end;
867 struct extent_buffer *leaf = path->nodes[0]; 828
829 leaf = path->nodes[0];
868 830
869 /* are we trying to overwrite a back ref for the root directory 831 /* are we trying to overwrite a back ref for the root directory
870 * if so, just jump out, we're done 832 * if so, just jump out, we're done
871 */ 833 */
872 if (key->objectid == key->offset) 834 if (search_key.objectid == search_key.offset)
873 goto out_nowrite; 835 return 1;
874 836
875 /* check all the names in this back reference to see 837 /* check all the names in this back reference to see
876 * if they are in the log. if so, we allow them to stay 838 * if they are in the log. if so, we allow them to stay
@@ -889,7 +851,9 @@ again:
889 (unsigned long)(victim_ref + 1), 851 (unsigned long)(victim_ref + 1),
890 victim_name_len); 852 victim_name_len);
891 853
892 if (!backref_in_log(log, key, victim_name, 854 if (!backref_in_log(log_root, &search_key,
855 parent_objectid,
856 victim_name,
893 victim_name_len)) { 857 victim_name_len)) {
894 btrfs_inc_nlink(inode); 858 btrfs_inc_nlink(inode);
895 btrfs_release_path(path); 859 btrfs_release_path(path);
@@ -897,9 +861,14 @@ again:
897 ret = btrfs_unlink_inode(trans, root, dir, 861 ret = btrfs_unlink_inode(trans, root, dir,
898 inode, victim_name, 862 inode, victim_name,
899 victim_name_len); 863 victim_name_len);
864 BUG_ON(ret);
900 btrfs_run_delayed_items(trans, root); 865 btrfs_run_delayed_items(trans, root);
866 kfree(victim_name);
867 *search_done = 1;
868 goto again;
901 } 869 }
902 kfree(victim_name); 870 kfree(victim_name);
871
903 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 872 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
904 } 873 }
905 BUG_ON(ret); 874 BUG_ON(ret);
@@ -908,14 +877,78 @@ again:
908 * NOTE: we have searched root tree and checked the 877 * NOTE: we have searched root tree and checked the
 909 * corresponding ref, it does not need to check again. 878 * corresponding ref, it does not need to check again.
910 */ 879 */
911 search_done = 1; 880 *search_done = 1;
881 }
882 btrfs_release_path(path);
883
884 /* Same search but for extended refs */
885 extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
886 inode_objectid, parent_objectid, 0,
887 0);
888 if (!IS_ERR_OR_NULL(extref)) {
889 u32 item_size;
890 u32 cur_offset = 0;
891 unsigned long base;
892 struct inode *victim_parent;
893
894 leaf = path->nodes[0];
895
896 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
897 base = btrfs_item_ptr_offset(leaf, path->slots[0]);
898
899 while (cur_offset < item_size) {
900 extref = (struct btrfs_inode_extref *)base + cur_offset;
901
902 victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
903
904 if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
905 goto next;
906
907 victim_name = kmalloc(victim_name_len, GFP_NOFS);
908 read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
909 victim_name_len);
910
911 search_key.objectid = inode_objectid;
912 search_key.type = BTRFS_INODE_EXTREF_KEY;
913 search_key.offset = btrfs_extref_hash(parent_objectid,
914 victim_name,
915 victim_name_len);
916 ret = 0;
917 if (!backref_in_log(log_root, &search_key,
918 parent_objectid, victim_name,
919 victim_name_len)) {
920 ret = -ENOENT;
921 victim_parent = read_one_inode(root,
922 parent_objectid);
923 if (victim_parent) {
924 btrfs_inc_nlink(inode);
925 btrfs_release_path(path);
926
927 ret = btrfs_unlink_inode(trans, root,
928 victim_parent,
929 inode,
930 victim_name,
931 victim_name_len);
932 btrfs_run_delayed_items(trans, root);
933 }
934 BUG_ON(ret);
935 iput(victim_parent);
936 kfree(victim_name);
937 *search_done = 1;
938 goto again;
939 }
940 kfree(victim_name);
941 BUG_ON(ret);
942next:
943 cur_offset += victim_name_len + sizeof(*extref);
944 }
945 *search_done = 1;
912 } 946 }
913 btrfs_release_path(path); 947 btrfs_release_path(path);
914 948
915 /* look for a conflicting sequence number */ 949 /* look for a conflicting sequence number */
916 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), 950 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
917 btrfs_inode_ref_index(eb, ref), 951 ref_index, name, namelen, 0);
918 name, namelen, 0);
919 if (di && !IS_ERR(di)) { 952 if (di && !IS_ERR(di)) {
920 ret = drop_one_dir_item(trans, root, path, dir, di); 953 ret = drop_one_dir_item(trans, root, path, dir, di);
921 BUG_ON(ret); 954 BUG_ON(ret);
@@ -931,25 +964,173 @@ again:
931 } 964 }
932 btrfs_release_path(path); 965 btrfs_release_path(path);
933 966
934insert: 967 return 0;
935 /* insert our name */ 968}
936 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
937 btrfs_inode_ref_index(eb, ref));
938 BUG_ON(ret);
939 969
940 btrfs_update_inode(trans, root, inode); 970static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
971 u32 *namelen, char **name, u64 *index,
972 u64 *parent_objectid)
973{
974 struct btrfs_inode_extref *extref;
941 975
942out: 976 extref = (struct btrfs_inode_extref *)ref_ptr;
943 ref_ptr = (unsigned long)(ref + 1) + namelen; 977
944 kfree(name); 978 *namelen = btrfs_inode_extref_name_len(eb, extref);
945 if (ref_ptr < ref_end) 979 *name = kmalloc(*namelen, GFP_NOFS);
946 goto again; 980 if (*name == NULL)
981 return -ENOMEM;
982
983 read_extent_buffer(eb, *name, (unsigned long)&extref->name,
984 *namelen);
985
986 *index = btrfs_inode_extref_index(eb, extref);
987 if (parent_objectid)
988 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
989
990 return 0;
991}
992
993static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
994 u32 *namelen, char **name, u64 *index)
995{
996 struct btrfs_inode_ref *ref;
997
998 ref = (struct btrfs_inode_ref *)ref_ptr;
999
1000 *namelen = btrfs_inode_ref_name_len(eb, ref);
1001 *name = kmalloc(*namelen, GFP_NOFS);
1002 if (*name == NULL)
1003 return -ENOMEM;
1004
1005 read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1006
1007 *index = btrfs_inode_ref_index(eb, ref);
1008
1009 return 0;
1010}
1011
1012/*
1013 * replay one inode back reference item found in the log tree.
1014 * eb, slot and key refer to the buffer and key found in the log tree.
1015 * root is the destination we are replaying into, and path is for temp
1016 * use by this function. (it should be released on return).
1017 */
1018static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1019 struct btrfs_root *root,
1020 struct btrfs_root *log,
1021 struct btrfs_path *path,
1022 struct extent_buffer *eb, int slot,
1023 struct btrfs_key *key)
1024{
1025 struct inode *dir;
1026 struct inode *inode;
1027 unsigned long ref_ptr;
1028 unsigned long ref_end;
1029 char *name;
1030 int namelen;
1031 int ret;
1032 int search_done = 0;
1033 int log_ref_ver = 0;
1034 u64 parent_objectid;
1035 u64 inode_objectid;
1036 u64 ref_index = 0;
1037 int ref_struct_size;
1038
1039 ref_ptr = btrfs_item_ptr_offset(eb, slot);
1040 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1041
1042 if (key->type == BTRFS_INODE_EXTREF_KEY) {
1043 struct btrfs_inode_extref *r;
1044
1045 ref_struct_size = sizeof(struct btrfs_inode_extref);
1046 log_ref_ver = 1;
1047 r = (struct btrfs_inode_extref *)ref_ptr;
1048 parent_objectid = btrfs_inode_extref_parent(eb, r);
1049 } else {
1050 ref_struct_size = sizeof(struct btrfs_inode_ref);
1051 parent_objectid = key->offset;
1052 }
1053 inode_objectid = key->objectid;
1054
1055 /*
1056 * it is possible that we didn't log all the parent directories
1057 * for a given inode. If we don't find the dir, just don't
1058 * copy the back ref in. The link count fixup code will take
1059 * care of the rest
1060 */
1061 dir = read_one_inode(root, parent_objectid);
1062 if (!dir)
1063 return -ENOENT;
1064
1065 inode = read_one_inode(root, inode_objectid);
1066 if (!inode) {
1067 iput(dir);
1068 return -EIO;
1069 }
1070
1071 while (ref_ptr < ref_end) {
1072 if (log_ref_ver) {
1073 ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1074 &ref_index, &parent_objectid);
1075 /*
1076 * parent object can change from one array
1077 * item to another.
1078 */
1079 if (!dir)
1080 dir = read_one_inode(root, parent_objectid);
1081 if (!dir)
1082 return -ENOENT;
1083 } else {
1084 ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1085 &ref_index);
1086 }
1087 if (ret)
1088 return ret;
1089
1090 /* if we already have a perfect match, we're done */
1091 if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
1092 ref_index, name, namelen)) {
1093 /*
1094 * look for a conflicting back reference in the
1095 * metadata. if we find one we have to unlink that name
1096 * of the file before we add our new link. Later on, we
1097 * overwrite any existing back reference, and we don't
1098 * want to create dangling pointers in the directory.
1099 */
1100
1101 if (!search_done) {
1102 ret = __add_inode_ref(trans, root, path, log,
1103 dir, inode, eb,
1104 inode_objectid,
1105 parent_objectid,
1106 ref_index, name, namelen,
1107 &search_done);
1108 if (ret == 1)
1109 goto out;
1110 BUG_ON(ret);
1111 }
1112
1113 /* insert our name */
1114 ret = btrfs_add_link(trans, dir, inode, name, namelen,
1115 0, ref_index);
1116 BUG_ON(ret);
1117
1118 btrfs_update_inode(trans, root, inode);
1119 }
1120
1121 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1122 kfree(name);
1123 if (log_ref_ver) {
1124 iput(dir);
1125 dir = NULL;
1126 }
1127 }
947 1128
948 /* finally write the back reference in the inode */ 1129 /* finally write the back reference in the inode */
949 ret = overwrite_item(trans, root, path, eb, slot, key); 1130 ret = overwrite_item(trans, root, path, eb, slot, key);
950 BUG_ON(ret); 1131 BUG_ON(ret);
951 1132
952out_nowrite: 1133out:
953 btrfs_release_path(path); 1134 btrfs_release_path(path);
954 iput(dir); 1135 iput(dir);
955 iput(inode); 1136 iput(inode);
@@ -966,25 +1147,55 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans,
966 return ret; 1147 return ret;
967} 1148}
968 1149
1150static int count_inode_extrefs(struct btrfs_root *root,
1151 struct inode *inode, struct btrfs_path *path)
1152{
1153 int ret = 0;
1154 int name_len;
1155 unsigned int nlink = 0;
1156 u32 item_size;
1157 u32 cur_offset = 0;
1158 u64 inode_objectid = btrfs_ino(inode);
1159 u64 offset = 0;
1160 unsigned long ptr;
1161 struct btrfs_inode_extref *extref;
1162 struct extent_buffer *leaf;
969 1163
970/* 1164 while (1) {
971 * There are a few corners where the link count of the file can't 1165 ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
972 * be properly maintained during replay. So, instead of adding 1166 &extref, &offset);
973 * lots of complexity to the log code, we just scan the backrefs 1167 if (ret)
974 * for any file that has been through replay. 1168 break;
975 * 1169
976 * The scan will update the link count on the inode to reflect the 1170 leaf = path->nodes[0];
977 * number of back refs found. If it goes down to zero, the iput 1171 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
978 * will free the inode. 1172 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
979 */ 1173
980static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 1174 while (cur_offset < item_size) {
981 struct btrfs_root *root, 1175 extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
982 struct inode *inode) 1176 name_len = btrfs_inode_extref_name_len(leaf, extref);
1177
1178 nlink++;
1179
1180 cur_offset += name_len + sizeof(*extref);
1181 }
1182
1183 offset++;
1184 btrfs_release_path(path);
1185 }
1186 btrfs_release_path(path);
1187
1188 if (ret < 0)
1189 return ret;
1190 return nlink;
1191}
1192
1193static int count_inode_refs(struct btrfs_root *root,
1194 struct inode *inode, struct btrfs_path *path)
983{ 1195{
984 struct btrfs_path *path;
985 int ret; 1196 int ret;
986 struct btrfs_key key; 1197 struct btrfs_key key;
987 u64 nlink = 0; 1198 unsigned int nlink = 0;
988 unsigned long ptr; 1199 unsigned long ptr;
989 unsigned long ptr_end; 1200 unsigned long ptr_end;
990 int name_len; 1201 int name_len;
@@ -994,10 +1205,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
994 key.type = BTRFS_INODE_REF_KEY; 1205 key.type = BTRFS_INODE_REF_KEY;
995 key.offset = (u64)-1; 1206 key.offset = (u64)-1;
996 1207
997 path = btrfs_alloc_path();
998 if (!path)
999 return -ENOMEM;
1000
1001 while (1) { 1208 while (1) {
1002 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1209 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1003 if (ret < 0) 1210 if (ret < 0)
@@ -1031,6 +1238,50 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1031 btrfs_release_path(path); 1238 btrfs_release_path(path);
1032 } 1239 }
1033 btrfs_release_path(path); 1240 btrfs_release_path(path);
1241
1242 return nlink;
1243}
1244
1245/*
1246 * There are a few corners where the link count of the file can't
1247 * be properly maintained during replay. So, instead of adding
1248 * lots of complexity to the log code, we just scan the backrefs
1249 * for any file that has been through replay.
1250 *
1251 * The scan will update the link count on the inode to reflect the
1252 * number of back refs found. If it goes down to zero, the iput
1253 * will free the inode.
1254 */
1255static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1256 struct btrfs_root *root,
1257 struct inode *inode)
1258{
1259 struct btrfs_path *path;
1260 int ret;
1261 u64 nlink = 0;
1262 u64 ino = btrfs_ino(inode);
1263
1264 path = btrfs_alloc_path();
1265 if (!path)
1266 return -ENOMEM;
1267
1268 ret = count_inode_refs(root, inode, path);
1269 if (ret < 0)
1270 goto out;
1271
1272 nlink = ret;
1273
1274 ret = count_inode_extrefs(root, inode, path);
1275 if (ret == -ENOENT)
1276 ret = 0;
1277
1278 if (ret < 0)
1279 goto out;
1280
1281 nlink += ret;
1282
1283 ret = 0;
1284
1034 if (nlink != inode->i_nlink) { 1285 if (nlink != inode->i_nlink) {
1035 set_nlink(inode, nlink); 1286 set_nlink(inode, nlink);
1036 btrfs_update_inode(trans, root, inode); 1287 btrfs_update_inode(trans, root, inode);
@@ -1046,9 +1297,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1046 ret = insert_orphan_item(trans, root, ino); 1297 ret = insert_orphan_item(trans, root, ino);
1047 BUG_ON(ret); 1298 BUG_ON(ret);
1048 } 1299 }
1049 btrfs_free_path(path);
1050 1300
1051 return 0; 1301out:
1302 btrfs_free_path(path);
1303 return ret;
1052} 1304}
1053 1305
1054static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1306static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
@@ -1695,6 +1947,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1695 ret = add_inode_ref(wc->trans, root, log, path, 1947 ret = add_inode_ref(wc->trans, root, log, path,
1696 eb, i, &key); 1948 eb, i, &key);
1697 BUG_ON(ret && ret != -ENOENT); 1949 BUG_ON(ret && ret != -ENOENT);
1950 } else if (key.type == BTRFS_INODE_EXTREF_KEY) {
1951 ret = add_inode_ref(wc->trans, root, log, path,
1952 eb, i, &key);
1953 BUG_ON(ret && ret != -ENOENT);
1698 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 1954 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
1699 ret = replay_one_extent(wc->trans, root, path, 1955 ret = replay_one_extent(wc->trans, root, path,
1700 eb, i, &key); 1956 eb, i, &key);
@@ -2037,7 +2293,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2037 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2293 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2038 wait_log_commit(trans, root, root->log_transid - 1); 2294 wait_log_commit(trans, root, root->log_transid - 1);
2039 while (1) { 2295 while (1) {
2040 unsigned long batch = root->log_batch; 2296 int batch = atomic_read(&root->log_batch);
2041 /* when we're on an ssd, just kick the log commit out */ 2297 /* when we're on an ssd, just kick the log commit out */
2042 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { 2298 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2043 mutex_unlock(&root->log_mutex); 2299 mutex_unlock(&root->log_mutex);
@@ -2045,7 +2301,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2045 mutex_lock(&root->log_mutex); 2301 mutex_lock(&root->log_mutex);
2046 } 2302 }
2047 wait_for_writer(trans, root); 2303 wait_for_writer(trans, root);
2048 if (batch == root->log_batch) 2304 if (batch == atomic_read(&root->log_batch))
2049 break; 2305 break;
2050 } 2306 }
2051 2307
@@ -2074,7 +2330,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2074 2330
2075 btrfs_set_root_node(&log->root_item, log->node); 2331 btrfs_set_root_node(&log->root_item, log->node);
2076 2332
2077 root->log_batch = 0;
2078 root->log_transid++; 2333 root->log_transid++;
2079 log->log_transid = root->log_transid; 2334 log->log_transid = root->log_transid;
2080 root->log_start_pid = 0; 2335 root->log_start_pid = 0;
@@ -2087,7 +2342,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2087 mutex_unlock(&root->log_mutex); 2342 mutex_unlock(&root->log_mutex);
2088 2343
2089 mutex_lock(&log_root_tree->log_mutex); 2344 mutex_lock(&log_root_tree->log_mutex);
2090 log_root_tree->log_batch++; 2345 atomic_inc(&log_root_tree->log_batch);
2091 atomic_inc(&log_root_tree->log_writers); 2346 atomic_inc(&log_root_tree->log_writers);
2092 mutex_unlock(&log_root_tree->log_mutex); 2347 mutex_unlock(&log_root_tree->log_mutex);
2093 2348
@@ -2157,7 +2412,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2157 btrfs_set_super_log_root_level(root->fs_info->super_for_commit, 2412 btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2158 btrfs_header_level(log_root_tree->node)); 2413 btrfs_header_level(log_root_tree->node));
2159 2414
2160 log_root_tree->log_batch = 0;
2161 log_root_tree->log_transid++; 2415 log_root_tree->log_transid++;
2162 smp_mb(); 2416 smp_mb();
2163 2417
@@ -2171,9 +2425,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2171 * in and cause problems either. 2425 * in and cause problems either.
2172 */ 2426 */
2173 btrfs_scrub_pause_super(root); 2427 btrfs_scrub_pause_super(root);
2174 write_ctree_super(trans, root->fs_info->tree_root, 1); 2428 ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
2175 btrfs_scrub_continue_super(root); 2429 btrfs_scrub_continue_super(root);
2176 ret = 0; 2430 if (ret) {
2431 btrfs_abort_transaction(trans, root, ret);
2432 goto out_wake_log_root;
2433 }
2177 2434
2178 mutex_lock(&root->log_mutex); 2435 mutex_lock(&root->log_mutex);
2179 if (root->last_log_commit < log_transid) 2436 if (root->last_log_commit < log_transid)
@@ -2209,7 +2466,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
2209 2466
2210 while (1) { 2467 while (1) {
2211 ret = find_first_extent_bit(&log->dirty_log_pages, 2468 ret = find_first_extent_bit(&log->dirty_log_pages,
2212 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); 2469 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
2470 NULL);
2213 if (ret) 2471 if (ret)
2214 break; 2472 break;
2215 2473
@@ -2646,6 +2904,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2646 int ret; 2904 int ret;
2647 struct btrfs_key key; 2905 struct btrfs_key key;
2648 struct btrfs_key found_key; 2906 struct btrfs_key found_key;
2907 int start_slot;
2649 2908
2650 key.objectid = objectid; 2909 key.objectid = objectid;
2651 key.type = max_key_type; 2910 key.type = max_key_type;
@@ -2667,8 +2926,18 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2667 if (found_key.objectid != objectid) 2926 if (found_key.objectid != objectid)
2668 break; 2927 break;
2669 2928
2670 ret = btrfs_del_item(trans, log, path); 2929 found_key.offset = 0;
2671 if (ret) 2930 found_key.type = 0;
2931 ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
2932 &start_slot);
2933
2934 ret = btrfs_del_items(trans, log, path, start_slot,
2935 path->slots[0] - start_slot + 1);
2936 /*
2937 * If start slot isn't 0 then we don't need to re-search, we've
2938 * found the last guy with the objectid in this tree.
2939 */
2940 if (ret || start_slot != 0)
2672 break; 2941 break;
2673 btrfs_release_path(path); 2942 btrfs_release_path(path);
2674 } 2943 }
@@ -2678,14 +2947,64 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2678 return ret; 2947 return ret;
2679} 2948}
2680 2949
2950static void fill_inode_item(struct btrfs_trans_handle *trans,
2951 struct extent_buffer *leaf,
2952 struct btrfs_inode_item *item,
2953 struct inode *inode, int log_inode_only)
2954{
2955 btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
2956 btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
2957 btrfs_set_inode_mode(leaf, item, inode->i_mode);
2958 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2959
2960 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2961 inode->i_atime.tv_sec);
2962 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2963 inode->i_atime.tv_nsec);
2964
2965 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2966 inode->i_mtime.tv_sec);
2967 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2968 inode->i_mtime.tv_nsec);
2969
2970 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2971 inode->i_ctime.tv_sec);
2972 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2973 inode->i_ctime.tv_nsec);
2974
2975 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2976
2977 btrfs_set_inode_sequence(leaf, item, inode->i_version);
2978 btrfs_set_inode_transid(leaf, item, trans->transid);
2979 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2980 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2981 btrfs_set_inode_block_group(leaf, item, 0);
2982
2983 if (log_inode_only) {
2984 /* set the generation to zero so the recover code
2985 * can tell the difference between an logging
2986 * just to say 'this inode exists' and a logging
2987 * to say 'update this inode with these values'
2988 */
2989 btrfs_set_inode_generation(leaf, item, 0);
2990 btrfs_set_inode_size(leaf, item, 0);
2991 } else {
2992 btrfs_set_inode_generation(leaf, item,
2993 BTRFS_I(inode)->generation);
2994 btrfs_set_inode_size(leaf, item, inode->i_size);
2995 }
2996
2997}
2998
2681static noinline int copy_items(struct btrfs_trans_handle *trans, 2999static noinline int copy_items(struct btrfs_trans_handle *trans,
2682 struct btrfs_root *log, 3000 struct inode *inode,
2683 struct btrfs_path *dst_path, 3001 struct btrfs_path *dst_path,
2684 struct extent_buffer *src, 3002 struct extent_buffer *src,
2685 int start_slot, int nr, int inode_only) 3003 int start_slot, int nr, int inode_only)
2686{ 3004{
2687 unsigned long src_offset; 3005 unsigned long src_offset;
2688 unsigned long dst_offset; 3006 unsigned long dst_offset;
3007 struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
2689 struct btrfs_file_extent_item *extent; 3008 struct btrfs_file_extent_item *extent;
2690 struct btrfs_inode_item *inode_item; 3009 struct btrfs_inode_item *inode_item;
2691 int ret; 3010 int ret;
@@ -2694,6 +3013,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2694 char *ins_data; 3013 char *ins_data;
2695 int i; 3014 int i;
2696 struct list_head ordered_sums; 3015 struct list_head ordered_sums;
3016 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
2697 3017
2698 INIT_LIST_HEAD(&ordered_sums); 3018 INIT_LIST_HEAD(&ordered_sums);
2699 3019
@@ -2722,29 +3042,23 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2722 3042
2723 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3043 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
2724 3044
2725 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3045 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2726 src_offset, ins_sizes[i]);
2727
2728 if (inode_only == LOG_INODE_EXISTS &&
2729 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2730 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3046 inode_item = btrfs_item_ptr(dst_path->nodes[0],
2731 dst_path->slots[0], 3047 dst_path->slots[0],
2732 struct btrfs_inode_item); 3048 struct btrfs_inode_item);
2733 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); 3049 fill_inode_item(trans, dst_path->nodes[0], inode_item,
2734 3050 inode, inode_only == LOG_INODE_EXISTS);
2735 /* set the generation to zero so the recover code 3051 } else {
2736 * can tell the difference between an logging 3052 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
2737 * just to say 'this inode exists' and a logging 3053 src_offset, ins_sizes[i]);
2738 * to say 'update this inode with these values'
2739 */
2740 btrfs_set_inode_generation(dst_path->nodes[0],
2741 inode_item, 0);
2742 } 3054 }
3055
2743 /* take a reference on file data extents so that truncates 3056 /* take a reference on file data extents so that truncates
2744 * or deletes of this inode don't have to relog the inode 3057 * or deletes of this inode don't have to relog the inode
2745 * again 3058 * again
2746 */ 3059 */
2747 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { 3060 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
3061 !skip_csum) {
2748 int found_type; 3062 int found_type;
2749 extent = btrfs_item_ptr(src, start_slot + i, 3063 extent = btrfs_item_ptr(src, start_slot + i,
2750 struct btrfs_file_extent_item); 3064 struct btrfs_file_extent_item);
@@ -2753,8 +3067,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2753 continue; 3067 continue;
2754 3068
2755 found_type = btrfs_file_extent_type(src, extent); 3069 found_type = btrfs_file_extent_type(src, extent);
2756 if (found_type == BTRFS_FILE_EXTENT_REG || 3070 if (found_type == BTRFS_FILE_EXTENT_REG) {
2757 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
2758 u64 ds, dl, cs, cl; 3071 u64 ds, dl, cs, cl;
2759 ds = btrfs_file_extent_disk_bytenr(src, 3072 ds = btrfs_file_extent_disk_bytenr(src,
2760 extent); 3073 extent);
@@ -2803,6 +3116,239 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2803 return ret; 3116 return ret;
2804} 3117}
2805 3118
3119static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3120{
3121 struct extent_map *em1, *em2;
3122
3123 em1 = list_entry(a, struct extent_map, list);
3124 em2 = list_entry(b, struct extent_map, list);
3125
3126 if (em1->start < em2->start)
3127 return -1;
3128 else if (em1->start > em2->start)
3129 return 1;
3130 return 0;
3131}
3132
3133struct log_args {
3134 struct extent_buffer *src;
3135 u64 next_offset;
3136 int start_slot;
3137 int nr;
3138};
3139
3140static int log_one_extent(struct btrfs_trans_handle *trans,
3141 struct inode *inode, struct btrfs_root *root,
3142 struct extent_map *em, struct btrfs_path *path,
3143 struct btrfs_path *dst_path, struct log_args *args)
3144{
3145 struct btrfs_root *log = root->log_root;
3146 struct btrfs_file_extent_item *fi;
3147 struct btrfs_key key;
3148 u64 start = em->mod_start;
3149 u64 search_start = start;
3150 u64 len = em->mod_len;
3151 u64 num_bytes;
3152 int nritems;
3153 int ret;
3154
3155 if (BTRFS_I(inode)->logged_trans == trans->transid) {
3156 ret = __btrfs_drop_extents(trans, log, inode, dst_path, start,
3157 start + len, NULL, 0);
3158 if (ret)
3159 return ret;
3160 }
3161
3162 while (len) {
3163 if (args->nr)
3164 goto next_slot;
3165again:
3166 key.objectid = btrfs_ino(inode);
3167 key.type = BTRFS_EXTENT_DATA_KEY;
3168 key.offset = search_start;
3169
3170 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3171 if (ret < 0)
3172 return ret;
3173
3174 if (ret) {
3175 /*
3176 * A rare case were we can have an em for a section of a
3177 * larger extent so we need to make sure that this em
3178 * falls within the extent we've found. If not we just
3179 * bail and go back to ye-olde way of doing things but
3180 * it happens often enough in testing that we need to do
3181 * this dance to make sure.
3182 */
3183 do {
3184 if (path->slots[0] == 0) {
3185 btrfs_release_path(path);
3186 if (search_start == 0)
3187 return -ENOENT;
3188 search_start--;
3189 goto again;
3190 }
3191
3192 path->slots[0]--;
3193 btrfs_item_key_to_cpu(path->nodes[0], &key,
3194 path->slots[0]);
3195 if (key.objectid != btrfs_ino(inode) ||
3196 key.type != BTRFS_EXTENT_DATA_KEY) {
3197 btrfs_release_path(path);
3198 return -ENOENT;
3199 }
3200 } while (key.offset > start);
3201
3202 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
3203 struct btrfs_file_extent_item);
3204 num_bytes = btrfs_file_extent_num_bytes(path->nodes[0],
3205 fi);
3206 if (key.offset + num_bytes <= start) {
3207 btrfs_release_path(path);
3208 return -ENOENT;
3209 }
3210 }
3211 args->src = path->nodes[0];
3212next_slot:
3213 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3214 fi = btrfs_item_ptr(args->src, path->slots[0],
3215 struct btrfs_file_extent_item);
3216 if (args->nr &&
3217 args->start_slot + args->nr == path->slots[0]) {
3218 args->nr++;
3219 } else if (args->nr) {
3220 ret = copy_items(trans, inode, dst_path, args->src,
3221 args->start_slot, args->nr,
3222 LOG_INODE_ALL);
3223 if (ret)
3224 return ret;
3225 args->nr = 1;
3226 args->start_slot = path->slots[0];
3227 } else if (!args->nr) {
3228 args->nr = 1;
3229 args->start_slot = path->slots[0];
3230 }
3231 nritems = btrfs_header_nritems(path->nodes[0]);
3232 path->slots[0]++;
3233 num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
3234 if (len < num_bytes) {
3235 /* I _think_ this is ok, envision we write to a
3236 * preallocated space that is adjacent to a previously
3237 * written preallocated space that gets merged when we
3238 * mark this preallocated space written. If we do not
3239 * have the adjacent extent in cache then when we copy
3240 * this extent it could end up being larger than our EM
3241 * thinks it is, which is a-ok, so just set len to 0.
3242 */
3243 len = 0;
3244 } else {
3245 len -= num_bytes;
3246 }
3247 start = key.offset + num_bytes;
3248 args->next_offset = start;
3249 search_start = start;
3250
3251 if (path->slots[0] < nritems) {
3252 if (len)
3253 goto next_slot;
3254 break;
3255 }
3256
3257 if (args->nr) {
3258 ret = copy_items(trans, inode, dst_path, args->src,
3259 args->start_slot, args->nr,
3260 LOG_INODE_ALL);
3261 if (ret)
3262 return ret;
3263 args->nr = 0;
3264 btrfs_release_path(path);
3265 }
3266 }
3267
3268 return 0;
3269}
3270
3271static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3272 struct btrfs_root *root,
3273 struct inode *inode,
3274 struct btrfs_path *path,
3275 struct btrfs_path *dst_path)
3276{
3277 struct log_args args;
3278 struct extent_map *em, *n;
3279 struct list_head extents;
3280 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3281 u64 test_gen;
3282 int ret = 0;
3283
3284 INIT_LIST_HEAD(&extents);
3285
3286 memset(&args, 0, sizeof(args));
3287
3288 write_lock(&tree->lock);
3289 test_gen = root->fs_info->last_trans_committed;
3290
3291 list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
3292 list_del_init(&em->list);
3293 if (em->generation <= test_gen)
3294 continue;
3295 /* Need a ref to keep it from getting evicted from cache */
3296 atomic_inc(&em->refs);
3297 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
3298 list_add_tail(&em->list, &extents);
3299 }
3300
3301 list_sort(NULL, &extents, extent_cmp);
3302
3303 while (!list_empty(&extents)) {
3304 em = list_entry(extents.next, struct extent_map, list);
3305
3306 list_del_init(&em->list);
3307 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
3308
3309 /*
3310 * If we had an error we just need to delete everybody from our
3311 * private list.
3312 */
3313 if (ret) {
3314 free_extent_map(em);
3315 continue;
3316 }
3317
3318 write_unlock(&tree->lock);
3319
3320 /*
3321 * If the previous EM and the last extent we left off on aren't
3322 * sequential then we need to copy the items we have and redo
3323 * our search
3324 */
3325 if (args.nr && em->mod_start != args.next_offset) {
3326 ret = copy_items(trans, inode, dst_path, args.src,
3327 args.start_slot, args.nr,
3328 LOG_INODE_ALL);
3329 if (ret) {
3330 free_extent_map(em);
3331 write_lock(&tree->lock);
3332 continue;
3333 }
3334 btrfs_release_path(path);
3335 args.nr = 0;
3336 }
3337
3338 ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
3339 free_extent_map(em);
3340 write_lock(&tree->lock);
3341 }
3342 WARN_ON(!list_empty(&extents));
3343 write_unlock(&tree->lock);
3344
3345 if (!ret && args.nr)
3346 ret = copy_items(trans, inode, dst_path, args.src,
3347 args.start_slot, args.nr, LOG_INODE_ALL);
3348 btrfs_release_path(path);
3349 return ret;
3350}
3351
2806/* log a single inode in the tree log. 3352/* log a single inode in the tree log.
2807 * At least one parent directory for this inode must exist in the tree 3353 * At least one parent directory for this inode must exist in the tree
2808 * or be logged already. 3354 * or be logged already.
@@ -2832,6 +3378,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2832 int nritems; 3378 int nritems;
2833 int ins_start_slot = 0; 3379 int ins_start_slot = 0;
2834 int ins_nr; 3380 int ins_nr;
3381 bool fast_search = false;
2835 u64 ino = btrfs_ino(inode); 3382 u64 ino = btrfs_ino(inode);
2836 3383
2837 log = root->log_root; 3384 log = root->log_root;
@@ -2851,21 +3398,23 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2851 3398
2852 max_key.objectid = ino; 3399 max_key.objectid = ino;
2853 3400
2854 /* today the code can only do partial logging of directories */
2855 if (!S_ISDIR(inode->i_mode))
2856 inode_only = LOG_INODE_ALL;
2857 3401
3402 /* today the code can only do partial logging of directories */
2858 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 3403 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2859 max_key.type = BTRFS_XATTR_ITEM_KEY; 3404 max_key.type = BTRFS_XATTR_ITEM_KEY;
2860 else 3405 else
2861 max_key.type = (u8)-1; 3406 max_key.type = (u8)-1;
2862 max_key.offset = (u64)-1; 3407 max_key.offset = (u64)-1;
2863 3408
2864 ret = btrfs_commit_inode_delayed_items(trans, inode); 3409 /* Only run delayed items if we are a dir or a new file */
2865 if (ret) { 3410 if (S_ISDIR(inode->i_mode) ||
2866 btrfs_free_path(path); 3411 BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
2867 btrfs_free_path(dst_path); 3412 ret = btrfs_commit_inode_delayed_items(trans, inode);
2868 return ret; 3413 if (ret) {
3414 btrfs_free_path(path);
3415 btrfs_free_path(dst_path);
3416 return ret;
3417 }
2869 } 3418 }
2870 3419
2871 mutex_lock(&BTRFS_I(inode)->log_mutex); 3420 mutex_lock(&BTRFS_I(inode)->log_mutex);
@@ -2881,7 +3430,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2881 max_key_type = BTRFS_XATTR_ITEM_KEY; 3430 max_key_type = BTRFS_XATTR_ITEM_KEY;
2882 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 3431 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
2883 } else { 3432 } else {
2884 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 3433 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3434 &BTRFS_I(inode)->runtime_flags)) {
3435 ret = btrfs_truncate_inode_items(trans, log,
3436 inode, 0, 0);
3437 } else {
3438 fast_search = true;
3439 max_key.type = BTRFS_XATTR_ITEM_KEY;
3440 ret = drop_objectid_items(trans, log, path, ino,
3441 BTRFS_XATTR_ITEM_KEY);
3442 }
2885 } 3443 }
2886 if (ret) { 3444 if (ret) {
2887 err = ret; 3445 err = ret;
@@ -2912,7 +3470,7 @@ again:
2912 goto next_slot; 3470 goto next_slot;
2913 } 3471 }
2914 3472
2915 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 3473 ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
2916 ins_nr, inode_only); 3474 ins_nr, inode_only);
2917 if (ret) { 3475 if (ret) {
2918 err = ret; 3476 err = ret;
@@ -2930,7 +3488,7 @@ next_slot:
2930 goto again; 3488 goto again;
2931 } 3489 }
2932 if (ins_nr) { 3490 if (ins_nr) {
2933 ret = copy_items(trans, log, dst_path, src, 3491 ret = copy_items(trans, inode, dst_path, src,
2934 ins_start_slot, 3492 ins_start_slot,
2935 ins_nr, inode_only); 3493 ins_nr, inode_only);
2936 if (ret) { 3494 if (ret) {
@@ -2951,8 +3509,7 @@ next_slot:
2951 break; 3509 break;
2952 } 3510 }
2953 if (ins_nr) { 3511 if (ins_nr) {
2954 ret = copy_items(trans, log, dst_path, src, 3512 ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
2955 ins_start_slot,
2956 ins_nr, inode_only); 3513 ins_nr, inode_only);
2957 if (ret) { 3514 if (ret) {
2958 err = ret; 3515 err = ret;
@@ -2960,7 +3517,24 @@ next_slot:
2960 } 3517 }
2961 ins_nr = 0; 3518 ins_nr = 0;
2962 } 3519 }
2963 WARN_ON(ins_nr); 3520
3521 if (fast_search) {
3522 btrfs_release_path(path);
3523 btrfs_release_path(dst_path);
3524 ret = btrfs_log_changed_extents(trans, root, inode, path,
3525 dst_path);
3526 if (ret) {
3527 err = ret;
3528 goto out_unlock;
3529 }
3530 } else {
3531 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3532 struct extent_map *em, *n;
3533
3534 list_for_each_entry_safe(em, n, &tree->modified_extents, list)
3535 list_del_init(&em->list);
3536 }
3537
2964 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 3538 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2965 btrfs_release_path(path); 3539 btrfs_release_path(path);
2966 btrfs_release_path(dst_path); 3540 btrfs_release_path(dst_path);
@@ -2971,6 +3545,7 @@ next_slot:
2971 } 3545 }
2972 } 3546 }
2973 BTRFS_I(inode)->logged_trans = trans->transid; 3547 BTRFS_I(inode)->logged_trans = trans->transid;
3548 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
2974out_unlock: 3549out_unlock:
2975 mutex_unlock(&BTRFS_I(inode)->log_mutex); 3550 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2976 3551
@@ -3138,7 +3713,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
3138end_trans: 3713end_trans:
3139 dput(old_parent); 3714 dput(old_parent);
3140 if (ret < 0) { 3715 if (ret < 0) {
3141 BUG_ON(ret != -ENOSPC); 3716 WARN_ON(ret != -ENOSPC);
3142 root->fs_info->last_trans_log_full_commit = trans->transid; 3717 root->fs_info->last_trans_log_full_commit = trans->transid;
3143 ret = 1; 3718 ret = 1;
3144 } 3719 }
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index ab942f46b3dd..99be4c138db6 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -143,14 +143,13 @@ EXPORT_SYMBOL(ulist_free);
143 * In case of allocation failure -ENOMEM is returned and the ulist stays 143 * In case of allocation failure -ENOMEM is returned and the ulist stays
144 * unaltered. 144 * unaltered.
145 */ 145 */
146int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, 146int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask)
147 gfp_t gfp_mask)
148{ 147{
149 return ulist_add_merge(ulist, val, aux, NULL, gfp_mask); 148 return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
150} 149}
151 150
152int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, 151int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
153 unsigned long *old_aux, gfp_t gfp_mask) 152 u64 *old_aux, gfp_t gfp_mask)
154{ 153{
155 int i; 154 int i;
156 155
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 21bdc8ec8130..21a1963439c3 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -33,7 +33,7 @@ struct ulist_iterator {
33 */ 33 */
34struct ulist_node { 34struct ulist_node {
35 u64 val; /* value to store */ 35 u64 val; /* value to store */
36 unsigned long aux; /* auxiliary value saved along with the val */ 36 u64 aux; /* auxiliary value saved along with the val */
37}; 37};
38 38
39struct ulist { 39struct ulist {
@@ -65,10 +65,9 @@ void ulist_fini(struct ulist *ulist);
65void ulist_reinit(struct ulist *ulist); 65void ulist_reinit(struct ulist *ulist);
66struct ulist *ulist_alloc(gfp_t gfp_mask); 66struct ulist *ulist_alloc(gfp_t gfp_mask);
67void ulist_free(struct ulist *ulist); 67void ulist_free(struct ulist *ulist);
68int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, 68int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
69 gfp_t gfp_mask); 69int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
70int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, 70 u64 *old_aux, gfp_t gfp_mask);
71 unsigned long *old_aux, gfp_t gfp_mask);
72struct ulist_node *ulist_next(struct ulist *ulist, 71struct ulist_node *ulist_next(struct ulist *ulist,
73 struct ulist_iterator *uiter); 72 struct ulist_iterator *uiter);
74 73
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 88b969aeeb71..029b903a4ae3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -639,7 +639,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
639 639
640 bdev = blkdev_get_by_path(device->name->str, flags, holder); 640 bdev = blkdev_get_by_path(device->name->str, flags, holder);
641 if (IS_ERR(bdev)) { 641 if (IS_ERR(bdev)) {
642 printk(KERN_INFO "open %s failed\n", device->name->str); 642 printk(KERN_INFO "btrfs: open %s failed\n", device->name->str);
643 goto error; 643 goto error;
644 } 644 }
645 filemap_write_and_wait(bdev->bd_inode->i_mapping); 645 filemap_write_and_wait(bdev->bd_inode->i_mapping);
@@ -1475,6 +1475,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1475 free_fs_devices(cur_devices); 1475 free_fs_devices(cur_devices);
1476 } 1476 }
1477 1477
1478 root->fs_info->num_tolerated_disk_barrier_failures =
1479 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
1480
1478 /* 1481 /*
1479 * at this point, the device is zero sized. We want to 1482 * at this point, the device is zero sized. We want to
1480 * remove it from the devices list and zero out the old super 1483 * remove it from the devices list and zero out the old super
@@ -1775,15 +1778,21 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1775 1778
1776 if (seeding_dev) { 1779 if (seeding_dev) {
1777 ret = init_first_rw_device(trans, root, device); 1780 ret = init_first_rw_device(trans, root, device);
1778 if (ret) 1781 if (ret) {
1782 btrfs_abort_transaction(trans, root, ret);
1779 goto error_trans; 1783 goto error_trans;
1784 }
1780 ret = btrfs_finish_sprout(trans, root); 1785 ret = btrfs_finish_sprout(trans, root);
1781 if (ret) 1786 if (ret) {
1787 btrfs_abort_transaction(trans, root, ret);
1782 goto error_trans; 1788 goto error_trans;
1789 }
1783 } else { 1790 } else {
1784 ret = btrfs_add_device(trans, root, device); 1791 ret = btrfs_add_device(trans, root, device);
1785 if (ret) 1792 if (ret) {
1793 btrfs_abort_transaction(trans, root, ret);
1786 goto error_trans; 1794 goto error_trans;
1795 }
1787 } 1796 }
1788 1797
1789 /* 1798 /*
@@ -1793,6 +1802,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1793 btrfs_clear_space_info_full(root->fs_info); 1802 btrfs_clear_space_info_full(root->fs_info);
1794 1803
1795 unlock_chunks(root); 1804 unlock_chunks(root);
1805 root->fs_info->num_tolerated_disk_barrier_failures =
1806 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
1796 ret = btrfs_commit_transaction(trans, root); 1807 ret = btrfs_commit_transaction(trans, root);
1797 1808
1798 if (seeding_dev) { 1809 if (seeding_dev) {
@@ -1814,7 +1825,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1814 1825
1815error_trans: 1826error_trans:
1816 unlock_chunks(root); 1827 unlock_chunks(root);
1817 btrfs_abort_transaction(trans, root, ret);
1818 btrfs_end_transaction(trans, root); 1828 btrfs_end_transaction(trans, root);
1819 rcu_string_free(device->name); 1829 rcu_string_free(device->name);
1820 kfree(device); 1830 kfree(device);
@@ -2804,6 +2814,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2804 } 2814 }
2805 } 2815 }
2806 2816
2817 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
2818 int num_tolerated_disk_barrier_failures;
2819 u64 target = bctl->sys.target;
2820
2821 num_tolerated_disk_barrier_failures =
2822 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2823 if (num_tolerated_disk_barrier_failures > 0 &&
2824 (target &
2825 (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
2826 BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
2827 num_tolerated_disk_barrier_failures = 0;
2828 else if (num_tolerated_disk_barrier_failures > 1 &&
2829 (target &
2830 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
2831 num_tolerated_disk_barrier_failures = 1;
2832
2833 fs_info->num_tolerated_disk_barrier_failures =
2834 num_tolerated_disk_barrier_failures;
2835 }
2836
2807 ret = insert_balance_item(fs_info->tree_root, bctl); 2837 ret = insert_balance_item(fs_info->tree_root, bctl);
2808 if (ret && ret != -EEXIST) 2838 if (ret && ret != -EEXIST)
2809 goto out; 2839 goto out;
@@ -2836,6 +2866,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2836 __cancel_balance(fs_info); 2866 __cancel_balance(fs_info);
2837 } 2867 }
2838 2868
2869 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
2870 fs_info->num_tolerated_disk_barrier_failures =
2871 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2872 }
2873
2839 wake_up(&fs_info->balance_wait_q); 2874 wake_up(&fs_info->balance_wait_q);
2840 2875
2841 return ret; 2876 return ret;
@@ -3608,12 +3643,16 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3608 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 3643 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
3609 &sys_chunk_size, &sys_stripe_size, 3644 &sys_chunk_size, &sys_stripe_size,
3610 sys_chunk_offset, alloc_profile); 3645 sys_chunk_offset, alloc_profile);
3611 if (ret) 3646 if (ret) {
3612 goto abort; 3647 btrfs_abort_transaction(trans, root, ret);
3648 goto out;
3649 }
3613 3650
3614 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 3651 ret = btrfs_add_device(trans, fs_info->chunk_root, device);
3615 if (ret) 3652 if (ret) {
3616 goto abort; 3653 btrfs_abort_transaction(trans, root, ret);
3654 goto out;
3655 }
3617 3656
3618 /* 3657 /*
3619 * Modifying chunk tree needs allocating new blocks from both 3658 * Modifying chunk tree needs allocating new blocks from both
@@ -3623,19 +3662,19 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3623 */ 3662 */
3624 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 3663 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3625 chunk_size, stripe_size); 3664 chunk_size, stripe_size);
3626 if (ret) 3665 if (ret) {
3627 goto abort; 3666 btrfs_abort_transaction(trans, root, ret);
3667 goto out;
3668 }
3628 3669
3629 ret = __finish_chunk_alloc(trans, extent_root, sys_map, 3670 ret = __finish_chunk_alloc(trans, extent_root, sys_map,
3630 sys_chunk_offset, sys_chunk_size, 3671 sys_chunk_offset, sys_chunk_size,
3631 sys_stripe_size); 3672 sys_stripe_size);
3632 if (ret) 3673 if (ret)
3633 goto abort; 3674 btrfs_abort_transaction(trans, root, ret);
3634 3675
3635 return 0; 3676out:
3636 3677
3637abort:
3638 btrfs_abort_transaction(trans, root, ret);
3639 return ret; 3678 return ret;
3640} 3679}
3641 3680
@@ -3760,7 +3799,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3760 read_unlock(&em_tree->lock); 3799 read_unlock(&em_tree->lock);
3761 3800
3762 if (!em) { 3801 if (!em) {
3763 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 3802 printk(KERN_CRIT "btrfs: unable to find logical %llu len %llu\n",
3764 (unsigned long long)logical, 3803 (unsigned long long)logical,
3765 (unsigned long long)*length); 3804 (unsigned long long)*length);
3766 BUG(); 3805 BUG();
@@ -4217,7 +4256,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4217 4256
4218 total_devs = bbio->num_stripes; 4257 total_devs = bbio->num_stripes;
4219 if (map_length < length) { 4258 if (map_length < length) {
4220 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 4259 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
4221 "len %llu\n", (unsigned long long)logical, 4260 "len %llu\n", (unsigned long long)logical,
4222 (unsigned long long)length, 4261 (unsigned long long)length,
4223 (unsigned long long)map_length); 4262 (unsigned long long)map_length);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 92c20654cc55..9acb846c3e7f 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -97,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws,
97 *total_in = 0; 97 *total_in = 0;
98 98
99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
100 printk(KERN_WARNING "deflateInit failed\n"); 100 printk(KERN_WARNING "btrfs: deflateInit failed\n");
101 ret = -1; 101 ret = -1;
102 goto out; 102 goto out;
103 } 103 }
@@ -125,7 +125,7 @@ static int zlib_compress_pages(struct list_head *ws,
125 while (workspace->def_strm.total_in < len) { 125 while (workspace->def_strm.total_in < len) {
126 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); 126 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
127 if (ret != Z_OK) { 127 if (ret != Z_OK) {
128 printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", 128 printk(KERN_DEBUG "btrfs: deflate in loop returned %d\n",
129 ret); 129 ret);
130 zlib_deflateEnd(&workspace->def_strm); 130 zlib_deflateEnd(&workspace->def_strm);
131 ret = -1; 131 ret = -1;
@@ -252,7 +252,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
252 } 252 }
253 253
254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
255 printk(KERN_WARNING "inflateInit failed\n"); 255 printk(KERN_WARNING "btrfs: inflateInit failed\n");
256 return -1; 256 return -1;
257 } 257 }
258 while (workspace->inf_strm.total_in < srclen) { 258 while (workspace->inf_strm.total_in < srclen) {
@@ -336,7 +336,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
336 } 336 }
337 337
338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
339 printk(KERN_WARNING "inflateInit failed\n"); 339 printk(KERN_WARNING "btrfs: inflateInit failed\n");
340 return -1; 340 return -1;
341 } 341 }
342 342
diff --git a/fs/buffer.c b/fs/buffer.c
index 58e2e7b77372..b5f044283edb 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2312,12 +2312,6 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2312 loff_t size; 2312 loff_t size;
2313 int ret; 2313 int ret;
2314 2314
2315 /*
2316 * Update file times before taking page lock. We may end up failing the
2317 * fault so this update may be superfluous but who really cares...
2318 */
2319 file_update_time(vma->vm_file);
2320
2321 lock_page(page); 2315 lock_page(page);
2322 size = i_size_read(inode); 2316 size = i_size_read(inode);
2323 if ((page->mapping != inode->i_mapping) || 2317 if ((page->mapping != inode->i_mapping) ||
@@ -2355,6 +2349,13 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2355 struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb; 2349 struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
2356 2350
2357 sb_start_pagefault(sb); 2351 sb_start_pagefault(sb);
2352
2353 /*
2354 * Update file times before taking page lock. We may end up failing the
2355 * fault so this update may be superfluous but who really cares...
2356 */
2357 file_update_time(vma->vm_file);
2358
2358 ret = __block_page_mkwrite(vma, vmf, get_block); 2359 ret = __block_page_mkwrite(vma, vmf, get_block);
2359 sb_end_pagefault(sb); 2360 sb_end_pagefault(sb);
2360 return block_page_mkwrite_return(ret); 2361 return block_page_mkwrite_return(ret);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 452e71a1b753..6690269f5dde 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -205,7 +205,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
205 dout("readpage inode %p file %p page %p index %lu\n", 205 dout("readpage inode %p file %p page %p index %lu\n",
206 inode, filp, page, page->index); 206 inode, filp, page, page->index);
207 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 207 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
208 page->index << PAGE_CACHE_SHIFT, &len, 208 (u64) page_offset(page), &len,
209 ci->i_truncate_seq, ci->i_truncate_size, 209 ci->i_truncate_seq, ci->i_truncate_size,
210 &page, 1, 0); 210 &page, 1, 0);
211 if (err == -ENOENT) 211 if (err == -ENOENT)
@@ -286,7 +286,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
286 int nr_pages = 0; 286 int nr_pages = 0;
287 int ret; 287 int ret;
288 288
289 off = page->index << PAGE_CACHE_SHIFT; 289 off = (u64) page_offset(page);
290 290
291 /* count pages */ 291 /* count pages */
292 next_index = page->index; 292 next_index = page->index;
@@ -308,8 +308,8 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
308 NULL, 0, 308 NULL, 0,
309 ci->i_truncate_seq, ci->i_truncate_size, 309 ci->i_truncate_seq, ci->i_truncate_size,
310 NULL, false, 1, 0); 310 NULL, false, 1, 0);
311 if (!req) 311 if (IS_ERR(req))
312 return -ENOMEM; 312 return PTR_ERR(req);
313 313
314 /* build page vector */ 314 /* build page vector */
315 nr_pages = len >> PAGE_CACHE_SHIFT; 315 nr_pages = len >> PAGE_CACHE_SHIFT;
@@ -426,7 +426,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
426 struct ceph_inode_info *ci; 426 struct ceph_inode_info *ci;
427 struct ceph_fs_client *fsc; 427 struct ceph_fs_client *fsc;
428 struct ceph_osd_client *osdc; 428 struct ceph_osd_client *osdc;
429 loff_t page_off = page->index << PAGE_CACHE_SHIFT; 429 loff_t page_off = page_offset(page);
430 int len = PAGE_CACHE_SIZE; 430 int len = PAGE_CACHE_SIZE;
431 loff_t i_size; 431 loff_t i_size;
432 int err = 0; 432 int err = 0;
@@ -817,8 +817,7 @@ get_more_pages:
817 /* ok */ 817 /* ok */
818 if (locked_pages == 0) { 818 if (locked_pages == 0) {
819 /* prepare async write request */ 819 /* prepare async write request */
820 offset = (unsigned long long)page->index 820 offset = (u64) page_offset(page);
821 << PAGE_CACHE_SHIFT;
822 len = wsize; 821 len = wsize;
823 req = ceph_osdc_new_request(&fsc->client->osdc, 822 req = ceph_osdc_new_request(&fsc->client->osdc,
824 &ci->i_layout, 823 &ci->i_layout,
@@ -832,8 +831,8 @@ get_more_pages:
832 ci->i_truncate_size, 831 ci->i_truncate_size,
833 &inode->i_mtime, true, 1, 0); 832 &inode->i_mtime, true, 1, 0);
834 833
835 if (!req) { 834 if (IS_ERR(req)) {
836 rc = -ENOMEM; 835 rc = PTR_ERR(req);
837 unlock_page(page); 836 unlock_page(page);
838 break; 837 break;
839 } 838 }
@@ -1180,7 +1179,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1180 struct inode *inode = vma->vm_file->f_dentry->d_inode; 1179 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1181 struct page *page = vmf->page; 1180 struct page *page = vmf->page;
1182 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 1181 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1183 loff_t off = page->index << PAGE_CACHE_SHIFT; 1182 loff_t off = page_offset(page);
1184 loff_t size, len; 1183 loff_t size, len;
1185 int ret; 1184 int ret;
1186 1185
@@ -1225,6 +1224,7 @@ out:
1225static struct vm_operations_struct ceph_vmops = { 1224static struct vm_operations_struct ceph_vmops = {
1226 .fault = filemap_fault, 1225 .fault = filemap_fault,
1227 .page_mkwrite = ceph_page_mkwrite, 1226 .page_mkwrite = ceph_page_mkwrite,
1227 .remap_pages = generic_file_remap_pages,
1228}; 1228};
1229 1229
1230int ceph_mmap(struct file *file, struct vm_area_struct *vma) 1230int ceph_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1235,6 +1235,5 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma)
1235 return -ENOEXEC; 1235 return -ENOEXEC;
1236 file_accessed(file); 1236 file_accessed(file);
1237 vma->vm_ops = &ceph_vmops; 1237 vma->vm_ops = &ceph_vmops;
1238 vma->vm_flags |= VM_CAN_NONLINEAR;
1239 return 0; 1238 return 0;
1240} 1239}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 620daad201db..3251e9cc6401 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1005,7 +1005,7 @@ static void __queue_cap_release(struct ceph_mds_session *session,
1005 1005
1006 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); 1006 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1007 head = msg->front.iov_base; 1007 head = msg->front.iov_base;
1008 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1); 1008 le32_add_cpu(&head->num, 1);
1009 item = msg->front.iov_base + msg->front.iov_len; 1009 item = msg->front.iov_base + msg->front.iov_len;
1010 item->ino = cpu_to_le64(ino); 1010 item->ino = cpu_to_le64(ino);
1011 item->cap_id = cpu_to_le64(cap_id); 1011 item->cap_id = cpu_to_le64(cap_id);
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 8e1b60e557b6..02ce90972d81 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -99,7 +99,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
99 * FIXME: we should try harder by querying the mds for the ino. 99 * FIXME: we should try harder by querying the mds for the ino.
100 */ 100 */
101static struct dentry *__fh_to_dentry(struct super_block *sb, 101static struct dentry *__fh_to_dentry(struct super_block *sb,
102 struct ceph_nfs_fh *fh) 102 struct ceph_nfs_fh *fh, int fh_len)
103{ 103{
104 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; 104 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
105 struct inode *inode; 105 struct inode *inode;
@@ -107,6 +107,9 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
107 struct ceph_vino vino; 107 struct ceph_vino vino;
108 int err; 108 int err;
109 109
110 if (fh_len < sizeof(*fh) / 4)
111 return ERR_PTR(-ESTALE);
112
110 dout("__fh_to_dentry %llx\n", fh->ino); 113 dout("__fh_to_dentry %llx\n", fh->ino);
111 vino.ino = fh->ino; 114 vino.ino = fh->ino;
112 vino.snap = CEPH_NOSNAP; 115 vino.snap = CEPH_NOSNAP;
@@ -150,7 +153,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
150 * convert connectable fh to dentry 153 * convert connectable fh to dentry
151 */ 154 */
152static struct dentry *__cfh_to_dentry(struct super_block *sb, 155static struct dentry *__cfh_to_dentry(struct super_block *sb,
153 struct ceph_nfs_confh *cfh) 156 struct ceph_nfs_confh *cfh, int fh_len)
154{ 157{
155 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; 158 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
156 struct inode *inode; 159 struct inode *inode;
@@ -158,6 +161,9 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
158 struct ceph_vino vino; 161 struct ceph_vino vino;
159 int err; 162 int err;
160 163
164 if (fh_len < sizeof(*cfh) / 4)
165 return ERR_PTR(-ESTALE);
166
161 dout("__cfh_to_dentry %llx (%llx/%x)\n", 167 dout("__cfh_to_dentry %llx (%llx/%x)\n",
162 cfh->ino, cfh->parent_ino, cfh->parent_name_hash); 168 cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
163 169
@@ -207,9 +213,11 @@ static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
207 int fh_len, int fh_type) 213 int fh_len, int fh_type)
208{ 214{
209 if (fh_type == 1) 215 if (fh_type == 1)
210 return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw); 216 return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw,
217 fh_len);
211 else 218 else
212 return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw); 219 return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw,
220 fh_len);
213} 221}
214 222
215/* 223/*
@@ -230,6 +238,8 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
230 238
231 if (fh_type == 1) 239 if (fh_type == 1)
232 return ERR_PTR(-ESTALE); 240 return ERR_PTR(-ESTALE);
241 if (fh_len < sizeof(*cfh) / 4)
242 return ERR_PTR(-ESTALE);
233 243
234 pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino, 244 pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
235 cfh->parent_name_hash); 245 cfh->parent_name_hash);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ecebbc09bfc7..5840d2aaed15 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -536,8 +536,8 @@ more:
536 do_sync, 536 do_sync,
537 ci->i_truncate_seq, ci->i_truncate_size, 537 ci->i_truncate_seq, ci->i_truncate_size,
538 &mtime, false, 2, page_align); 538 &mtime, false, 2, page_align);
539 if (!req) 539 if (IS_ERR(req))
540 return -ENOMEM; 540 return PTR_ERR(req);
541 541
542 if (file->f_flags & O_DIRECT) { 542 if (file->f_flags & O_DIRECT) {
543 pages = ceph_get_direct_page_vector(data, num_pages, false); 543 pages = ceph_get_direct_page_vector(data, num_pages, false);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 4b5762ef7c2b..ba95eea201bf 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1104,7 +1104,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1104 pr_err("fill_trace bad get_inode " 1104 pr_err("fill_trace bad get_inode "
1105 "%llx.%llx\n", vino.ino, vino.snap); 1105 "%llx.%llx\n", vino.ino, vino.snap);
1106 err = PTR_ERR(in); 1106 err = PTR_ERR(in);
1107 d_delete(dn); 1107 d_drop(dn);
1108 goto done; 1108 goto done;
1109 } 1109 }
1110 dn = splice_dentry(dn, in, &have_lease, true); 1110 dn = splice_dentry(dn, in, &have_lease, true);
@@ -1277,7 +1277,7 @@ retry_lookup:
1277 in = ceph_get_inode(parent->d_sb, vino); 1277 in = ceph_get_inode(parent->d_sb, vino);
1278 if (IS_ERR(in)) { 1278 if (IS_ERR(in)) {
1279 dout("new_inode badness\n"); 1279 dout("new_inode badness\n");
1280 d_delete(dn); 1280 d_drop(dn);
1281 dput(dn); 1281 dput(dn);
1282 err = PTR_ERR(in); 1282 err = PTR_ERR(in);
1283 goto out; 1283 goto out;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 1396ceb46797..36549a46e311 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -187,14 +187,18 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
187 u64 tmp; 187 u64 tmp;
188 struct ceph_object_layout ol; 188 struct ceph_object_layout ol;
189 struct ceph_pg pgid; 189 struct ceph_pg pgid;
190 int r;
190 191
191 /* copy and validate */ 192 /* copy and validate */
192 if (copy_from_user(&dl, arg, sizeof(dl))) 193 if (copy_from_user(&dl, arg, sizeof(dl)))
193 return -EFAULT; 194 return -EFAULT;
194 195
195 down_read(&osdc->map_sem); 196 down_read(&osdc->map_sem);
196 ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, 197 r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
197 &dl.object_no, &dl.object_offset, &olen); 198 &dl.object_no, &dl.object_offset,
199 &olen);
200 if (r < 0)
201 return -EIO;
198 dl.file_offset -= dl.object_offset; 202 dl.file_offset -= dl.object_offset;
199 dl.object_size = ceph_file_layout_object_size(ci->i_layout); 203 dl.object_size = ceph_file_layout_object_size(ci->i_layout);
200 dl.block_size = ceph_file_layout_su(ci->i_layout); 204 dl.block_size = ceph_file_layout_su(ci->i_layout);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a5a735422aa7..1bcf712655d9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2625,7 +2625,8 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2625 ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", 2625 ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
2626 session_state_name(s->s_state)); 2626 session_state_name(s->s_state));
2627 2627
2628 if (memcmp(ceph_mdsmap_get_addr(oldmap, i), 2628 if (i >= newmap->m_max_mds ||
2629 memcmp(ceph_mdsmap_get_addr(oldmap, i),
2629 ceph_mdsmap_get_addr(newmap, i), 2630 ceph_mdsmap_get_addr(newmap, i),
2630 sizeof(struct ceph_entity_addr))) { 2631 sizeof(struct ceph_entity_addr))) {
2631 if (s->s_state == CEPH_MDS_SESSION_OPENING) { 2632 if (s->s_state == CEPH_MDS_SESSION_OPENING) {
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index b982239f38f9..2eb43f211325 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -307,7 +307,10 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
307{ 307{
308 struct ceph_mount_options *fsopt; 308 struct ceph_mount_options *fsopt;
309 const char *dev_name_end; 309 const char *dev_name_end;
310 int err = -ENOMEM; 310 int err;
311
312 if (!dev_name || !*dev_name)
313 return -EINVAL;
311 314
312 fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL); 315 fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
313 if (!fsopt) 316 if (!fsopt)
@@ -328,21 +331,33 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
328 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; 331 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
329 fsopt->congestion_kb = default_congestion_kb(); 332 fsopt->congestion_kb = default_congestion_kb();
330 333
331 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ 334 /*
335 * Distinguish the server list from the path in "dev_name".
336 * Internally we do not include the leading '/' in the path.
337 *
338 * "dev_name" will look like:
339 * <server_spec>[,<server_spec>...]:[<path>]
340 * where
341 * <server_spec> is <ip>[:<port>]
342 * <path> is optional, but if present must begin with '/'
343 */
344 dev_name_end = strchr(dev_name, '/');
345 if (dev_name_end) {
346 /* skip over leading '/' for path */
347 *path = dev_name_end + 1;
348 } else {
349 /* path is empty */
350 dev_name_end = dev_name + strlen(dev_name);
351 *path = dev_name_end;
352 }
332 err = -EINVAL; 353 err = -EINVAL;
333 if (!dev_name) 354 dev_name_end--; /* back up to ':' separator */
334 goto out; 355 if (*dev_name_end != ':') {
335 *path = strstr(dev_name, ":/"); 356 pr_err("device name is missing path (no : separator in %s)\n",
336 if (*path == NULL) {
337 pr_err("device name is missing path (no :/ in %s)\n",
338 dev_name); 357 dev_name);
339 goto out; 358 goto out;
340 } 359 }
341 dev_name_end = *path;
342 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); 360 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
343
344 /* path on server */
345 *path += 2;
346 dout("server path '%s'\n", *path); 361 dout("server path '%s'\n", *path);
347 362
348 *popt = ceph_parse_options(options, dev_name, dev_name_end, 363 *popt = ceph_parse_options(options, dev_name, dev_name_end,
@@ -603,6 +618,11 @@ bad_cap:
603 618
604static void destroy_caches(void) 619static void destroy_caches(void)
605{ 620{
621 /*
622 * Make sure all delayed rcu free inodes are flushed before we
623 * destroy cache.
624 */
625 rcu_barrier();
606 kmem_cache_destroy(ceph_inode_cachep); 626 kmem_cache_destroy(ceph_inode_cachep);
607 kmem_cache_destroy(ceph_cap_cachep); 627 kmem_cache_destroy(ceph_cap_cachep);
608 kmem_cache_destroy(ceph_dentry_cachep); 628 kmem_cache_destroy(ceph_dentry_cachep);
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index a08306a8bec9..2075ddfffa73 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -9,13 +9,14 @@ config CIFS
9 select CRYPTO_ARC4 9 select CRYPTO_ARC4
10 select CRYPTO_ECB 10 select CRYPTO_ECB
11 select CRYPTO_DES 11 select CRYPTO_DES
12 select CRYPTO_SHA256
12 help 13 help
13 This is the client VFS module for the Common Internet File System 14 This is the client VFS module for the Common Internet File System
14 (CIFS) protocol which is the successor to the Server Message Block 15 (CIFS) protocol which is the successor to the Server Message Block
15 (SMB) protocol, the native file sharing mechanism for most early 16 (SMB) protocol, the native file sharing mechanism for most early
16 PC operating systems. The CIFS protocol is fully supported by 17 PC operating systems. The CIFS protocol is fully supported by
17 file servers such as Windows 2000 (including Windows 2003, NT 4 18 file servers such as Windows 2000 (including Windows 2003, Windows 2008,
18 and Windows XP) as well by Samba (which provides excellent CIFS 19 NT 4 and Windows XP) as well by Samba (which provides excellent CIFS
19 server support for Linux and many other operating systems). Limited 20 server support for Linux and many other operating systems). Limited
20 support for OS/2 and Windows ME and similar servers is provided as 21 support for OS/2 and Windows ME and similar servers is provided as
21 well. 22 well.
@@ -114,6 +115,13 @@ config CIFS_POSIX
114 (such as Samba 3.10 and later) which can negotiate 115 (such as Samba 3.10 and later) which can negotiate
115 CIFS POSIX ACL support. If unsure, say N. 116 CIFS POSIX ACL support. If unsure, say N.
116 117
118config CIFS_ACL
119 bool "Provide CIFS ACL support"
120 depends on CIFS_XATTR && KEYS
121 help
122 Allows fetching CIFS/NTFS ACL from the server. The DACL blob
123 is handed over to the application/caller.
124
117config CIFS_DEBUG2 125config CIFS_DEBUG2
118 bool "Enable additional CIFS debugging routines" 126 bool "Enable additional CIFS debugging routines"
119 depends on CIFS 127 depends on CIFS
@@ -138,21 +146,6 @@ config CIFS_DFS_UPCALL
138 IP addresses) which is needed for implicit mounts of DFS junction 146 IP addresses) which is needed for implicit mounts of DFS junction
139 points. If unsure, say N. 147 points. If unsure, say N.
140 148
141config CIFS_FSCACHE
142 bool "Provide CIFS client caching support"
143 depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
144 help
145 Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
146 to be cached locally on disk through the general filesystem cache
147 manager. If unsure, say N.
148
149config CIFS_ACL
150 bool "Provide CIFS ACL support"
151 depends on CIFS_XATTR && KEYS
152 help
153 Allows to fetch CIFS/NTFS ACL from the server. The DACL blob
154 is handed over to the application/caller.
155
156config CIFS_NFSD_EXPORT 149config CIFS_NFSD_EXPORT
157 bool "Allow nfsd to export CIFS file system (EXPERIMENTAL)" 150 bool "Allow nfsd to export CIFS file system (EXPERIMENTAL)"
158 depends on CIFS && EXPERIMENTAL && BROKEN 151 depends on CIFS && EXPERIMENTAL && BROKEN
@@ -161,7 +154,7 @@ config CIFS_NFSD_EXPORT
161 154
162config CIFS_SMB2 155config CIFS_SMB2
163 bool "SMB2 network file system support (EXPERIMENTAL)" 156 bool "SMB2 network file system support (EXPERIMENTAL)"
164 depends on EXPERIMENTAL && INET && BROKEN 157 depends on CIFS && EXPERIMENTAL && INET
165 select NLS 158 select NLS
166 select KEYS 159 select KEYS
167 select FSCACHE 160 select FSCACHE
@@ -178,3 +171,12 @@ config CIFS_SMB2
178 (compared to cifs) due to protocol improvements. 171 (compared to cifs) due to protocol improvements.
179 172
180 Unless you are a developer or tester, say N. 173 Unless you are a developer or tester, say N.
174
175config CIFS_FSCACHE
176 bool "Provide CIFS client caching support"
177 depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
178 help
179 Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
180 to be cached locally on disk through the general filesystem cache
181 manager. If unsure, say N.
182
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index feee94309271..aa0d68b086eb 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -17,4 +17,4 @@ cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o
17cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o 17cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o
18 18
19cifs-$(CONFIG_CIFS_SMB2) += smb2ops.o smb2maperror.o smb2transport.o \ 19cifs-$(CONFIG_CIFS_SMB2) += smb2ops.o smb2maperror.o smb2transport.o \
20 smb2misc.o smb2pdu.o smb2inode.o 20 smb2misc.o smb2pdu.o smb2inode.o smb2file.o
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index e622863b292f..086f381d6489 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -31,18 +31,18 @@
31 31
32/* create a new cifs key */ 32/* create a new cifs key */
33static int 33static int
34cifs_spnego_key_instantiate(struct key *key, const void *data, size_t datalen) 34cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
35{ 35{
36 char *payload; 36 char *payload;
37 int ret; 37 int ret;
38 38
39 ret = -ENOMEM; 39 ret = -ENOMEM;
40 payload = kmalloc(datalen, GFP_KERNEL); 40 payload = kmalloc(prep->datalen, GFP_KERNEL);
41 if (!payload) 41 if (!payload)
42 goto error; 42 goto error;
43 43
44 /* attach the data */ 44 /* attach the data */
45 memcpy(payload, data, datalen); 45 memcpy(payload, prep->data, prep->datalen);
46 key->payload.data = payload; 46 key->payload.data = payload;
47 ret = 0; 47 ret = 0;
48 48
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 7dab9c04ad52..71d5d0a5f6b2 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -203,6 +203,27 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len,
203 int i; 203 int i;
204 wchar_t wchar_to; /* needed to quiet sparse */ 204 wchar_t wchar_to; /* needed to quiet sparse */
205 205
206 /* special case for utf8 to handle no plane0 chars */
207 if (!strcmp(codepage->charset, "utf8")) {
208 /*
209 * convert utf8 -> utf16, we assume we have enough space
210 * as caller should have assumed conversion does not overflow
211 * in destination len is length in wchar_t units (16bits)
212 */
213 i = utf8s_to_utf16s(from, len, UTF16_LITTLE_ENDIAN,
214 (wchar_t *) to, len);
215
216 /* if success terminate and exit */
217 if (i >= 0)
218 goto success;
219 /*
220 * if fails fall back to UCS encoding as this
221 * function should not return negative values
222 * currently can fail only if source contains
223 * invalid encoded characters
224 */
225 }
226
206 for (i = 0; len && *from; i++, from += charlen, len -= charlen) { 227 for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
207 charlen = codepage->char2uni(from, len, &wchar_to); 228 charlen = codepage->char2uni(from, len, &wchar_to);
208 if (charlen < 1) { 229 if (charlen < 1) {
@@ -215,6 +236,7 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len,
215 put_unaligned_le16(wchar_to, &to[i]); 236 put_unaligned_le16(wchar_to, &to[i]);
216 } 237 }
217 238
239success:
218 put_unaligned_le16(0, &to[i]); 240 put_unaligned_le16(0, &to[i]);
219 return i; 241 return i;
220} 242}
@@ -328,7 +350,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
328 } 350 }
329 351
330ctoUTF16_out: 352ctoUTF16_out:
331 return i; 353 return j;
332} 354}
333 355
334#ifdef CONFIG_CIFS_SMB2 356#ifdef CONFIG_CIFS_SMB2
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 05f4dc263a23..fc783e264420 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -167,17 +167,17 @@ static struct shrinker cifs_shrinker = {
167}; 167};
168 168
169static int 169static int
170cifs_idmap_key_instantiate(struct key *key, const void *data, size_t datalen) 170cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
171{ 171{
172 char *payload; 172 char *payload;
173 173
174 payload = kmalloc(datalen, GFP_KERNEL); 174 payload = kmalloc(prep->datalen, GFP_KERNEL);
175 if (!payload) 175 if (!payload)
176 return -ENOMEM; 176 return -ENOMEM;
177 177
178 memcpy(payload, data, datalen); 178 memcpy(payload, prep->data, prep->datalen);
179 key->payload.data = payload; 179 key->payload.data = payload;
180 key->datalen = datalen; 180 key->datalen = prep->datalen;
181 return 0; 181 return 0;
182} 182}
183 183
@@ -1222,7 +1222,7 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
1222 if (!open_file) 1222 if (!open_file)
1223 return get_cifs_acl_by_path(cifs_sb, path, pacllen); 1223 return get_cifs_acl_by_path(cifs_sb, path, pacllen);
1224 1224
1225 pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen); 1225 pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->fid.netfid, pacllen);
1226 cifsFileInfo_put(open_file); 1226 cifsFileInfo_put(open_file);
1227 return pntsd; 1227 return pntsd;
1228} 1228}
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 6a0d741159f0..652f5051be09 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -29,6 +29,7 @@
29#include "ntlmssp.h" 29#include "ntlmssp.h"
30#include <linux/ctype.h> 30#include <linux/ctype.h>
31#include <linux/random.h> 31#include <linux/random.h>
32#include <linux/highmem.h>
32 33
33/* 34/*
34 * Calculate and return the CIFS signature based on the mac key and SMB PDU. 35 * Calculate and return the CIFS signature based on the mac key and SMB PDU.
@@ -37,11 +38,13 @@
37 * the sequence number before this function is called. Also, this function 38 * the sequence number before this function is called. Also, this function
38 * should be called with the server->srv_mutex held. 39 * should be called with the server->srv_mutex held.
39 */ 40 */
40static int cifs_calc_signature(const struct kvec *iov, int n_vec, 41static int cifs_calc_signature(struct smb_rqst *rqst,
41 struct TCP_Server_Info *server, char *signature) 42 struct TCP_Server_Info *server, char *signature)
42{ 43{
43 int i; 44 int i;
44 int rc; 45 int rc;
46 struct kvec *iov = rqst->rq_iov;
47 int n_vec = rqst->rq_nvec;
45 48
46 if (iov == NULL || signature == NULL || server == NULL) 49 if (iov == NULL || signature == NULL || server == NULL)
47 return -EINVAL; 50 return -EINVAL;
@@ -91,6 +94,16 @@ static int cifs_calc_signature(const struct kvec *iov, int n_vec,
91 } 94 }
92 } 95 }
93 96
97 /* now hash over the rq_pages array */
98 for (i = 0; i < rqst->rq_npages; i++) {
99 struct kvec p_iov;
100
101 cifs_rqst_page_to_kvec(rqst, i, &p_iov);
102 crypto_shash_update(&server->secmech.sdescmd5->shash,
103 p_iov.iov_base, p_iov.iov_len);
104 kunmap(rqst->rq_pages[i]);
105 }
106
94 rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); 107 rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
95 if (rc) 108 if (rc)
96 cERROR(1, "%s: Could not generate md5 hash", __func__); 109 cERROR(1, "%s: Could not generate md5 hash", __func__);
@@ -99,12 +112,12 @@ static int cifs_calc_signature(const struct kvec *iov, int n_vec,
99} 112}
100 113
101/* must be called with server->srv_mutex held */ 114/* must be called with server->srv_mutex held */
102int cifs_sign_smbv(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, 115int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server,
103 __u32 *pexpected_response_sequence_number) 116 __u32 *pexpected_response_sequence_number)
104{ 117{
105 int rc = 0; 118 int rc = 0;
106 char smb_signature[20]; 119 char smb_signature[20];
107 struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base; 120 struct smb_hdr *cifs_pdu = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
108 121
109 if ((cifs_pdu == NULL) || (server == NULL)) 122 if ((cifs_pdu == NULL) || (server == NULL))
110 return -EINVAL; 123 return -EINVAL;
@@ -125,7 +138,7 @@ int cifs_sign_smbv(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
125 *pexpected_response_sequence_number = server->sequence_number++; 138 *pexpected_response_sequence_number = server->sequence_number++;
126 server->sequence_number++; 139 server->sequence_number++;
127 140
128 rc = cifs_calc_signature(iov, n_vec, server, smb_signature); 141 rc = cifs_calc_signature(rqst, server, smb_signature);
129 if (rc) 142 if (rc)
130 memset(cifs_pdu->Signature.SecuritySignature, 0, 8); 143 memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
131 else 144 else
@@ -134,6 +147,15 @@ int cifs_sign_smbv(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
134 return rc; 147 return rc;
135} 148}
136 149
150int cifs_sign_smbv(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
151 __u32 *pexpected_response_sequence)
152{
153 struct smb_rqst rqst = { .rq_iov = iov,
154 .rq_nvec = n_vec };
155
156 return cifs_sign_rqst(&rqst, server, pexpected_response_sequence);
157}
158
137/* must be called with server->srv_mutex held */ 159/* must be called with server->srv_mutex held */
138int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, 160int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
139 __u32 *pexpected_response_sequence_number) 161 __u32 *pexpected_response_sequence_number)
@@ -147,14 +169,14 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
147 pexpected_response_sequence_number); 169 pexpected_response_sequence_number);
148} 170}
149 171
150int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, 172int cifs_verify_signature(struct smb_rqst *rqst,
151 struct TCP_Server_Info *server, 173 struct TCP_Server_Info *server,
152 __u32 expected_sequence_number) 174 __u32 expected_sequence_number)
153{ 175{
154 unsigned int rc; 176 unsigned int rc;
155 char server_response_sig[8]; 177 char server_response_sig[8];
156 char what_we_think_sig_should_be[20]; 178 char what_we_think_sig_should_be[20];
157 struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base; 179 struct smb_hdr *cifs_pdu = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
158 180
159 if (cifs_pdu == NULL || server == NULL) 181 if (cifs_pdu == NULL || server == NULL)
160 return -EINVAL; 182 return -EINVAL;
@@ -186,8 +208,7 @@ int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov,
186 cifs_pdu->Signature.Sequence.Reserved = 0; 208 cifs_pdu->Signature.Sequence.Reserved = 0;
187 209
188 mutex_lock(&server->srv_mutex); 210 mutex_lock(&server->srv_mutex);
189 rc = cifs_calc_signature(iov, nr_iov, server, 211 rc = cifs_calc_signature(rqst, server, what_we_think_sig_should_be);
190 what_we_think_sig_should_be);
191 mutex_unlock(&server->srv_mutex); 212 mutex_unlock(&server->srv_mutex);
192 213
193 if (rc) 214 if (rc)
@@ -686,12 +707,17 @@ calc_seckey(struct cifs_ses *ses)
686void 707void
687cifs_crypto_shash_release(struct TCP_Server_Info *server) 708cifs_crypto_shash_release(struct TCP_Server_Info *server)
688{ 709{
710 if (server->secmech.hmacsha256)
711 crypto_free_shash(server->secmech.hmacsha256);
712
689 if (server->secmech.md5) 713 if (server->secmech.md5)
690 crypto_free_shash(server->secmech.md5); 714 crypto_free_shash(server->secmech.md5);
691 715
692 if (server->secmech.hmacmd5) 716 if (server->secmech.hmacmd5)
693 crypto_free_shash(server->secmech.hmacmd5); 717 crypto_free_shash(server->secmech.hmacmd5);
694 718
719 kfree(server->secmech.sdeschmacsha256);
720
695 kfree(server->secmech.sdeschmacmd5); 721 kfree(server->secmech.sdeschmacmd5);
696 722
697 kfree(server->secmech.sdescmd5); 723 kfree(server->secmech.sdescmd5);
@@ -716,6 +742,13 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
716 goto crypto_allocate_md5_fail; 742 goto crypto_allocate_md5_fail;
717 } 743 }
718 744
745 server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0);
746 if (IS_ERR(server->secmech.hmacsha256)) {
747 cERROR(1, "could not allocate crypto hmacsha256\n");
748 rc = PTR_ERR(server->secmech.hmacsha256);
749 goto crypto_allocate_hmacsha256_fail;
750 }
751
719 size = sizeof(struct shash_desc) + 752 size = sizeof(struct shash_desc) +
720 crypto_shash_descsize(server->secmech.hmacmd5); 753 crypto_shash_descsize(server->secmech.hmacmd5);
721 server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL); 754 server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
@@ -727,7 +760,6 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
727 server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5; 760 server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
728 server->secmech.sdeschmacmd5->shash.flags = 0x0; 761 server->secmech.sdeschmacmd5->shash.flags = 0x0;
729 762
730
731 size = sizeof(struct shash_desc) + 763 size = sizeof(struct shash_desc) +
732 crypto_shash_descsize(server->secmech.md5); 764 crypto_shash_descsize(server->secmech.md5);
733 server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL); 765 server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
@@ -739,12 +771,29 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
739 server->secmech.sdescmd5->shash.tfm = server->secmech.md5; 771 server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
740 server->secmech.sdescmd5->shash.flags = 0x0; 772 server->secmech.sdescmd5->shash.flags = 0x0;
741 773
774 size = sizeof(struct shash_desc) +
775 crypto_shash_descsize(server->secmech.hmacsha256);
776 server->secmech.sdeschmacsha256 = kmalloc(size, GFP_KERNEL);
777 if (!server->secmech.sdeschmacsha256) {
778 cERROR(1, "%s: Can't alloc hmacsha256\n", __func__);
779 rc = -ENOMEM;
780 goto crypto_allocate_hmacsha256_sdesc_fail;
781 }
782 server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256;
783 server->secmech.sdeschmacsha256->shash.flags = 0x0;
784
742 return 0; 785 return 0;
743 786
787crypto_allocate_hmacsha256_sdesc_fail:
788 kfree(server->secmech.sdescmd5);
789
744crypto_allocate_md5_sdesc_fail: 790crypto_allocate_md5_sdesc_fail:
745 kfree(server->secmech.sdeschmacmd5); 791 kfree(server->secmech.sdeschmacmd5);
746 792
747crypto_allocate_hmacmd5_sdesc_fail: 793crypto_allocate_hmacmd5_sdesc_fail:
794 crypto_free_shash(server->secmech.hmacsha256);
795
796crypto_allocate_hmacsha256_fail:
748 crypto_free_shash(server->secmech.md5); 797 crypto_free_shash(server->secmech.md5);
749 798
750crypto_allocate_md5_fail: 799crypto_allocate_md5_fail:
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index db8a404a51dd..e7931cc55d0c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -36,6 +36,7 @@
36#include <linux/kthread.h> 36#include <linux/kthread.h>
37#include <linux/freezer.h> 37#include <linux/freezer.h>
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/random.h>
39#include <net/ipv6.h> 40#include <net/ipv6.h>
40#include "cifsfs.h" 41#include "cifsfs.h"
41#include "cifspdu.h" 42#include "cifspdu.h"
@@ -51,7 +52,6 @@
51#ifdef CONFIG_CIFS_SMB2 52#ifdef CONFIG_CIFS_SMB2
52#include "smb2pdu.h" 53#include "smb2pdu.h"
53#endif 54#endif
54#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
55 55
56int cifsFYI = 0; 56int cifsFYI = 0;
57int cifsERROR = 1; 57int cifsERROR = 1;
@@ -89,6 +89,10 @@ extern mempool_t *cifs_mid_poolp;
89 89
90struct workqueue_struct *cifsiod_wq; 90struct workqueue_struct *cifsiod_wq;
91 91
92#ifdef CONFIG_CIFS_SMB2
93__u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE];
94#endif
95
92static int 96static int
93cifs_read_super(struct super_block *sb) 97cifs_read_super(struct super_block *sb)
94{ 98{
@@ -160,13 +164,12 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
160 struct super_block *sb = dentry->d_sb; 164 struct super_block *sb = dentry->d_sb;
161 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 165 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
162 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); 166 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
163 int rc = -EOPNOTSUPP; 167 struct TCP_Server_Info *server = tcon->ses->server;
164 unsigned int xid; 168 unsigned int xid;
169 int rc = 0;
165 170
166 xid = get_xid(); 171 xid = get_xid();
167 172
168 buf->f_type = CIFS_MAGIC_NUMBER;
169
170 /* 173 /*
171 * PATH_MAX may be too long - it would presumably be total path, 174 * PATH_MAX may be too long - it would presumably be total path,
172 * but note that some servers (includinng Samba 3) have a shorter 175 * but note that some servers (includinng Samba 3) have a shorter
@@ -178,27 +181,8 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
178 buf->f_files = 0; /* undefined */ 181 buf->f_files = 0; /* undefined */
179 buf->f_ffree = 0; /* unlimited */ 182 buf->f_ffree = 0; /* unlimited */
180 183
181 /* 184 if (server->ops->queryfs)
182 * We could add a second check for a QFS Unix capability bit 185 rc = server->ops->queryfs(xid, tcon, buf);
183 */
184 if ((tcon->ses->capabilities & CAP_UNIX) &&
185 (CIFS_POSIX_EXTENSIONS & le64_to_cpu(tcon->fsUnixInfo.Capability)))
186 rc = CIFSSMBQFSPosixInfo(xid, tcon, buf);
187
188 /*
189 * Only need to call the old QFSInfo if failed on newer one,
190 * e.g. by OS/2.
191 **/
192 if (rc && (tcon->ses->capabilities & CAP_NT_SMBS))
193 rc = CIFSSMBQFSInfo(xid, tcon, buf);
194
195 /*
196 * Some old Windows servers also do not support level 103, retry with
197 * older level one if old server failed the previous call or we
198 * bypassed it because we detected that this was an older LANMAN sess
199 */
200 if (rc)
201 rc = SMBOldQFSInfo(xid, tcon, buf);
202 186
203 free_xid(xid); 187 free_xid(xid);
204 return 0; 188 return 0;
@@ -239,9 +223,10 @@ cifs_alloc_inode(struct super_block *sb)
239 return NULL; 223 return NULL;
240 cifs_inode->cifsAttrs = 0x20; /* default */ 224 cifs_inode->cifsAttrs = 0x20; /* default */
241 cifs_inode->time = 0; 225 cifs_inode->time = 0;
242 /* Until the file is open and we have gotten oplock 226 /*
243 info back from the server, can not assume caching of 227 * Until the file is open and we have gotten oplock info back from the
244 file data or metadata */ 228 * server, can not assume caching of file data or metadata.
229 */
245 cifs_set_oplock_level(cifs_inode, 0); 230 cifs_set_oplock_level(cifs_inode, 0);
246 cifs_inode->delete_pending = false; 231 cifs_inode->delete_pending = false;
247 cifs_inode->invalid_mapping = false; 232 cifs_inode->invalid_mapping = false;
@@ -249,11 +234,16 @@ cifs_alloc_inode(struct super_block *sb)
249 cifs_inode->server_eof = 0; 234 cifs_inode->server_eof = 0;
250 cifs_inode->uniqueid = 0; 235 cifs_inode->uniqueid = 0;
251 cifs_inode->createtime = 0; 236 cifs_inode->createtime = 0;
252 237#ifdef CONFIG_CIFS_SMB2
253 /* Can not set i_flags here - they get immediately overwritten 238 get_random_bytes(cifs_inode->lease_key, SMB2_LEASE_KEY_SIZE);
254 to zero by the VFS */ 239#endif
255/* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME;*/ 240 /*
241 * Can not set i_flags here - they get immediately overwritten to zero
242 * by the VFS.
243 */
244 /* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; */
256 INIT_LIST_HEAD(&cifs_inode->openFileList); 245 INIT_LIST_HEAD(&cifs_inode->openFileList);
246 INIT_LIST_HEAD(&cifs_inode->llist);
257 return &cifs_inode->vfs_inode; 247 return &cifs_inode->vfs_inode;
258} 248}
259 249
@@ -360,7 +350,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
360 cifs_show_security(s, tcon->ses->server); 350 cifs_show_security(s, tcon->ses->server);
361 cifs_show_cache_flavor(s, cifs_sb); 351 cifs_show_cache_flavor(s, cifs_sb);
362 352
363 seq_printf(s, ",unc=%s", tcon->treeName); 353 seq_printf(s, ",unc=");
354 seq_escape(s, tcon->treeName, " \t\n\\");
364 355
365 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) 356 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
366 seq_printf(s, ",multiuser"); 357 seq_printf(s, ",multiuser");
@@ -957,7 +948,7 @@ cifs_init_once(void *inode)
957 struct cifsInodeInfo *cifsi = inode; 948 struct cifsInodeInfo *cifsi = inode;
958 949
959 inode_init_once(&cifsi->vfs_inode); 950 inode_init_once(&cifsi->vfs_inode);
960 mutex_init(&cifsi->lock_mutex); 951 init_rwsem(&cifsi->lock_sem);
961} 952}
962 953
963static int 954static int
@@ -977,6 +968,11 @@ cifs_init_inodecache(void)
977static void 968static void
978cifs_destroy_inodecache(void) 969cifs_destroy_inodecache(void)
979{ 970{
971 /*
972 * Make sure all delayed rcu free inodes are flushed before we
973 * destroy cache.
974 */
975 rcu_barrier();
980 kmem_cache_destroy(cifs_inode_cachep); 976 kmem_cache_destroy(cifs_inode_cachep);
981} 977}
982 978
@@ -1127,6 +1123,10 @@ init_cifs(void)
1127 spin_lock_init(&cifs_file_list_lock); 1123 spin_lock_init(&cifs_file_list_lock);
1128 spin_lock_init(&GlobalMid_Lock); 1124 spin_lock_init(&GlobalMid_Lock);
1129 1125
1126#ifdef CONFIG_CIFS_SMB2
1127 get_random_bytes(cifs_client_guid, SMB2_CLIENT_GUID_SIZE);
1128#endif
1129
1130 if (cifs_max_pending < 2) { 1130 if (cifs_max_pending < 2) {
1131 cifs_max_pending = 2; 1131 cifs_max_pending = 2;
1132 cFYI(1, "cifs_max_pending set to min of 2"); 1132 cFYI(1, "cifs_max_pending set to min of 2");
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 1c49c5a9b27a..7163419cecd9 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -128,5 +128,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
128extern const struct export_operations cifs_export_ops; 128extern const struct export_operations cifs_export_ops;
129#endif /* CONFIG_CIFS_NFSD_EXPORT */ 129#endif /* CONFIG_CIFS_NFSD_EXPORT */
130 130
131#define CIFS_VERSION "1.78" 131#define CIFS_VERSION "2.0"
132#endif /* _CIFSFS_H */ 132#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 977dc0e85ccb..f5af2527fc69 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -32,6 +32,8 @@
32#include "smb2pdu.h" 32#include "smb2pdu.h"
33#endif 33#endif
34 34
35#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
36
35/* 37/*
36 * The sizes of various internal tables and strings 38 * The sizes of various internal tables and strings
37 */ 39 */
@@ -128,8 +130,10 @@ struct sdesc {
128struct cifs_secmech { 130struct cifs_secmech {
129 struct crypto_shash *hmacmd5; /* hmac-md5 hash function */ 131 struct crypto_shash *hmacmd5; /* hmac-md5 hash function */
130 struct crypto_shash *md5; /* md5 hash function */ 132 struct crypto_shash *md5; /* md5 hash function */
133 struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */
131 struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */ 134 struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */
132 struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */ 135 struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
136 struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */
133}; 137};
134 138
135/* per smb session structure/fields */ 139/* per smb session structure/fields */
@@ -158,9 +162,24 @@ struct cifs_cred {
158 ***************************************************************** 162 *****************************************************************
159 */ 163 */
160 164
165/*
166 * A smb_rqst represents a complete request to be issued to a server. It's
167 * formed by a kvec array, followed by an array of pages. Page data is assumed
168 * to start at the beginning of the first page.
169 */
170struct smb_rqst {
171 struct kvec *rq_iov; /* array of kvecs */
172 unsigned int rq_nvec; /* number of kvecs in array */
173 struct page **rq_pages; /* pointer to array of page ptrs */
174 unsigned int rq_npages; /* number pages in array */
175 unsigned int rq_pagesz; /* page size to use */
176 unsigned int rq_tailsz; /* length of last page */
177};
178
161enum smb_version { 179enum smb_version {
162 Smb_1 = 1, 180 Smb_1 = 1,
163 Smb_21, 181 Smb_21,
182 Smb_30,
164}; 183};
165 184
166struct mid_q_entry; 185struct mid_q_entry;
@@ -171,17 +190,23 @@ struct cifs_tcon;
171struct dfs_info3_param; 190struct dfs_info3_param;
172struct cifs_fattr; 191struct cifs_fattr;
173struct smb_vol; 192struct smb_vol;
193struct cifs_fid;
194struct cifs_readdata;
195struct cifs_writedata;
196struct cifs_io_parms;
197struct cifs_search_info;
198struct cifsInodeInfo;
174 199
175struct smb_version_operations { 200struct smb_version_operations {
176 int (*send_cancel)(struct TCP_Server_Info *, void *, 201 int (*send_cancel)(struct TCP_Server_Info *, void *,
177 struct mid_q_entry *); 202 struct mid_q_entry *);
178 bool (*compare_fids)(struct cifsFileInfo *, struct cifsFileInfo *); 203 bool (*compare_fids)(struct cifsFileInfo *, struct cifsFileInfo *);
179 /* setup request: allocate mid, sign message */ 204 /* setup request: allocate mid, sign message */
180 int (*setup_request)(struct cifs_ses *, struct kvec *, unsigned int, 205 struct mid_q_entry *(*setup_request)(struct cifs_ses *,
181 struct mid_q_entry **); 206 struct smb_rqst *);
182 /* setup async request: allocate mid, sign message */ 207 /* setup async request: allocate mid, sign message */
183 int (*setup_async_request)(struct TCP_Server_Info *, struct kvec *, 208 struct mid_q_entry *(*setup_async_request)(struct TCP_Server_Info *,
184 unsigned int, struct mid_q_entry **); 209 struct smb_rqst *);
185 /* check response: verify signature, map error */ 210 /* check response: verify signature, map error */
186 int (*check_receive)(struct mid_q_entry *, struct TCP_Server_Info *, 211 int (*check_receive)(struct mid_q_entry *, struct TCP_Server_Info *,
187 bool); 212 bool);
@@ -212,6 +237,10 @@ struct smb_version_operations {
212 bool (*need_neg)(struct TCP_Server_Info *); 237 bool (*need_neg)(struct TCP_Server_Info *);
213 /* negotiate to the server */ 238 /* negotiate to the server */
214 int (*negotiate)(const unsigned int, struct cifs_ses *); 239 int (*negotiate)(const unsigned int, struct cifs_ses *);
240 /* set negotiated write size */
241 unsigned int (*negotiate_wsize)(struct cifs_tcon *, struct smb_vol *);
242 /* set negotiated read size */
243 unsigned int (*negotiate_rsize)(struct cifs_tcon *, struct smb_vol *);
215 /* setup smb sessionn */ 244 /* setup smb sessionn */
216 int (*sess_setup)(const unsigned int, struct cifs_ses *, 245 int (*sess_setup)(const unsigned int, struct cifs_ses *,
217 const struct nls_table *); 246 const struct nls_table *);
@@ -235,10 +264,22 @@ struct smb_version_operations {
235 int (*query_path_info)(const unsigned int, struct cifs_tcon *, 264 int (*query_path_info)(const unsigned int, struct cifs_tcon *,
236 struct cifs_sb_info *, const char *, 265 struct cifs_sb_info *, const char *,
237 FILE_ALL_INFO *, bool *); 266 FILE_ALL_INFO *, bool *);
267 /* query file data from the server */
268 int (*query_file_info)(const unsigned int, struct cifs_tcon *,
269 struct cifs_fid *, FILE_ALL_INFO *);
238 /* get server index number */ 270 /* get server index number */
239 int (*get_srv_inum)(const unsigned int, struct cifs_tcon *, 271 int (*get_srv_inum)(const unsigned int, struct cifs_tcon *,
240 struct cifs_sb_info *, const char *, 272 struct cifs_sb_info *, const char *,
241 u64 *uniqueid, FILE_ALL_INFO *); 273 u64 *uniqueid, FILE_ALL_INFO *);
274 /* set size by path */
275 int (*set_path_size)(const unsigned int, struct cifs_tcon *,
276 const char *, __u64, struct cifs_sb_info *, bool);
277 /* set size by file handle */
278 int (*set_file_size)(const unsigned int, struct cifs_tcon *,
279 struct cifsFileInfo *, __u64, bool);
280 /* set attributes */
281 int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *,
282 const unsigned int);
242 /* build a full path to the root of the mount */ 283 /* build a full path to the root of the mount */
243 char * (*build_path_to_root)(struct smb_vol *, struct cifs_sb_info *, 284 char * (*build_path_to_root)(struct smb_vol *, struct cifs_sb_info *,
244 struct cifs_tcon *); 285 struct cifs_tcon *);
@@ -256,10 +297,84 @@ struct smb_version_operations {
256 /* remove directory */ 297 /* remove directory */
257 int (*rmdir)(const unsigned int, struct cifs_tcon *, const char *, 298 int (*rmdir)(const unsigned int, struct cifs_tcon *, const char *,
258 struct cifs_sb_info *); 299 struct cifs_sb_info *);
300 /* unlink file */
301 int (*unlink)(const unsigned int, struct cifs_tcon *, const char *,
302 struct cifs_sb_info *);
303 /* open, rename and delete file */
304 int (*rename_pending_delete)(const char *, struct dentry *,
305 const unsigned int);
306 /* send rename request */
307 int (*rename)(const unsigned int, struct cifs_tcon *, const char *,
308 const char *, struct cifs_sb_info *);
309 /* send create hardlink request */
310 int (*create_hardlink)(const unsigned int, struct cifs_tcon *,
311 const char *, const char *,
312 struct cifs_sb_info *);
313 /* open a file for non-posix mounts */
314 int (*open)(const unsigned int, struct cifs_tcon *, const char *, int,
315 int, int, struct cifs_fid *, __u32 *, FILE_ALL_INFO *,
316 struct cifs_sb_info *);
317 /* set fid protocol-specific info */
318 void (*set_fid)(struct cifsFileInfo *, struct cifs_fid *, __u32);
319 /* close a file */
320 void (*close)(const unsigned int, struct cifs_tcon *,
321 struct cifs_fid *);
322 /* send a flush request to the server */
323 int (*flush)(const unsigned int, struct cifs_tcon *, struct cifs_fid *);
324 /* async read from the server */
325 int (*async_readv)(struct cifs_readdata *);
326 /* async write to the server */
327 int (*async_writev)(struct cifs_writedata *);
328 /* sync read from the server */
329 int (*sync_read)(const unsigned int, struct cifsFileInfo *,
330 struct cifs_io_parms *, unsigned int *, char **,
331 int *);
332 /* sync write to the server */
333 int (*sync_write)(const unsigned int, struct cifsFileInfo *,
334 struct cifs_io_parms *, unsigned int *, struct kvec *,
335 unsigned long);
336 /* open dir, start readdir */
337 int (*query_dir_first)(const unsigned int, struct cifs_tcon *,
338 const char *, struct cifs_sb_info *,
339 struct cifs_fid *, __u16,
340 struct cifs_search_info *);
341 /* continue readdir */
342 int (*query_dir_next)(const unsigned int, struct cifs_tcon *,
343 struct cifs_fid *,
344 __u16, struct cifs_search_info *srch_inf);
345 /* close dir */
346 int (*close_dir)(const unsigned int, struct cifs_tcon *,
347 struct cifs_fid *);
348 /* calculate a size of SMB message */
349 unsigned int (*calc_smb_size)(void *);
350 /* check for STATUS_PENDING and process it in a positive case */
351 bool (*is_status_pending)(char *, struct TCP_Server_Info *, int);
352 /* send oplock break response */
353 int (*oplock_response)(struct cifs_tcon *, struct cifs_fid *,
354 struct cifsInodeInfo *);
355 /* query remote filesystem */
356 int (*queryfs)(const unsigned int, struct cifs_tcon *,
357 struct kstatfs *);
358 /* send mandatory brlock to the server */
359 int (*mand_lock)(const unsigned int, struct cifsFileInfo *, __u64,
360 __u64, __u32, int, int, bool);
361 /* unlock range of mandatory locks */
362 int (*mand_unlock_range)(struct cifsFileInfo *, struct file_lock *,
363 const unsigned int);
364 /* push brlocks from the cache to the server */
365 int (*push_mand_locks)(struct cifsFileInfo *);
366 /* get lease key of the inode */
367 void (*get_lease_key)(struct inode *, struct cifs_fid *fid);
368 /* set lease key of the inode */
369 void (*set_lease_key)(struct inode *, struct cifs_fid *fid);
370 /* generate new lease key */
371 void (*new_lease_key)(struct cifs_fid *fid);
259}; 372};
260 373
261struct smb_version_values { 374struct smb_version_values {
262 char *version_string; 375 char *version_string;
376 __u16 protocol_id;
377 __u32 req_capabilities;
263 __u32 large_lock_type; 378 __u32 large_lock_type;
264 __u32 exclusive_lock_type; 379 __u32 exclusive_lock_type;
265 __u32 shared_lock_type; 380 __u32 shared_lock_type;
@@ -496,6 +611,51 @@ get_next_mid(struct TCP_Server_Info *server)
496} 611}
497 612
498/* 613/*
614 * When the server supports very large reads and writes via POSIX extensions,
615 * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not
616 * including the RFC1001 length.
617 *
618 * Note that this might make for "interesting" allocation problems during
619 * writeback however as we have to allocate an array of pointers for the
620 * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096.
621 *
622 * For reads, there is a similar problem as we need to allocate an array
623 * of kvecs to handle the receive, though that should only need to be done
624 * once.
625 */
626#define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4)
627#define CIFS_MAX_RSIZE ((1<<24) - sizeof(READ_RSP) + 4)
628
629/*
630 * When the server doesn't allow large posix writes, only allow a rsize/wsize
631 * of 2^17-1 minus the size of the call header. That allows for a read or
632 * write up to the maximum size described by RFC1002.
633 */
634#define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4)
635#define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4)
636
637/*
638 * The default wsize is 1M. find_get_pages seems to return a maximum of 256
639 * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill
640 * a single wsize request with a single call.
641 */
642#define CIFS_DEFAULT_IOSIZE (1024 * 1024)
643
644/*
645 * Windows only supports a max of 60kb reads and 65535 byte writes. Default to
646 * those values when posix extensions aren't in force. In actuality here, we
647 * use 65536 to allow for a write that is a multiple of 4k. Most servers seem
648 * to be ok with the extra byte even though Windows doesn't send writes that
649 * are that large.
650 *
651 * Citation:
652 *
653 * http://blogs.msdn.com/b/openspecification/archive/2009/04/10/smb-maximum-transmit-buffer-size-and-performance-tuning.aspx
654 */
655#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024)
656#define CIFS_DEFAULT_NON_POSIX_WSIZE (65536)
657
658/*
499 * Macros to allow the TCP_Server_Info->net field and related code to drop out 659 * Macros to allow the TCP_Server_Info->net field and related code to drop out
500 * when CONFIG_NET_NS isn't set. 660 * when CONFIG_NET_NS isn't set.
501 */ 661 */
@@ -559,6 +719,7 @@ struct cifs_ses {
559 __u16 session_flags; 719 __u16 session_flags;
560#endif /* CONFIG_CIFS_SMB2 */ 720#endif /* CONFIG_CIFS_SMB2 */
561}; 721};
722
562/* no more than one of the following three session flags may be set */ 723/* no more than one of the following three session flags may be set */
563#define CIFS_SES_NT4 1 724#define CIFS_SES_NT4 1
564#define CIFS_SES_OS2 2 725#define CIFS_SES_OS2 2
@@ -665,6 +826,7 @@ struct cifs_tcon {
665 u64 resource_id; /* server resource id */ 826 u64 resource_id; /* server resource id */
666 struct fscache_cookie *fscache; /* cookie for share */ 827 struct fscache_cookie *fscache; /* cookie for share */
667#endif 828#endif
829 struct list_head pending_opens; /* list of incomplete opens */
668 /* BB add field for back pointer to sb struct(s)? */ 830 /* BB add field for back pointer to sb struct(s)? */
669}; 831};
670 832
@@ -707,6 +869,15 @@ cifs_get_tlink(struct tcon_link *tlink)
707/* This function is always expected to succeed */ 869/* This function is always expected to succeed */
708extern struct cifs_tcon *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb); 870extern struct cifs_tcon *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb);
709 871
872#define CIFS_OPLOCK_NO_CHANGE 0xfe
873
874struct cifs_pending_open {
875 struct list_head olist;
876 struct tcon_link *tlink;
877 __u8 lease_key[16];
878 __u32 oplock;
879};
880
710/* 881/*
711 * This info hangs off the cifsFileInfo structure, pointed to by llist. 882 * This info hangs off the cifsFileInfo structure, pointed to by llist.
712 * This is used to track byte stream locks on the file 883 * This is used to track byte stream locks on the file
@@ -740,16 +911,29 @@ struct cifs_search_info {
740 bool smallBuf:1; /* so we know which buf_release function to call */ 911 bool smallBuf:1; /* so we know which buf_release function to call */
741}; 912};
742 913
914struct cifs_fid {
915 __u16 netfid;
916#ifdef CONFIG_CIFS_SMB2
917 __u64 persistent_fid; /* persist file id for smb2 */
918 __u64 volatile_fid; /* volatile file id for smb2 */
919 __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */
920#endif
921 struct cifs_pending_open *pending_open;
922};
923
924struct cifs_fid_locks {
925 struct list_head llist;
926 struct cifsFileInfo *cfile; /* fid that owns locks */
927 struct list_head locks; /* locks held by fid above */
928};
929
743struct cifsFileInfo { 930struct cifsFileInfo {
744 struct list_head tlist; /* pointer to next fid owned by tcon */ 931 struct list_head tlist; /* pointer to next fid owned by tcon */
745 struct list_head flist; /* next fid (file instance) for this inode */ 932 struct list_head flist; /* next fid (file instance) for this inode */
746 struct list_head llist; /* 933 struct cifs_fid_locks *llist; /* brlocks held by this fid */
747 * brlocks held by this fid, protected by
748 * lock_mutex from cifsInodeInfo structure
749 */
750 unsigned int uid; /* allows finding which FileInfo structure */ 934 unsigned int uid; /* allows finding which FileInfo structure */
751 __u32 pid; /* process id who opened file */ 935 __u32 pid; /* process id who opened file */
752 __u16 netfid; /* file id from remote */ 936 struct cifs_fid fid; /* file id from remote */
753 /* BB add lock scope info here if needed */ ; 937 /* BB add lock scope info here if needed */ ;
754 /* lock scope id (0 if none) */ 938 /* lock scope id (0 if none) */
755 struct dentry *dentry; 939 struct dentry *dentry;
@@ -765,12 +949,60 @@ struct cifsFileInfo {
765 949
766struct cifs_io_parms { 950struct cifs_io_parms {
767 __u16 netfid; 951 __u16 netfid;
952#ifdef CONFIG_CIFS_SMB2
953 __u64 persistent_fid; /* persist file id for smb2 */
954 __u64 volatile_fid; /* volatile file id for smb2 */
955#endif
768 __u32 pid; 956 __u32 pid;
769 __u64 offset; 957 __u64 offset;
770 unsigned int length; 958 unsigned int length;
771 struct cifs_tcon *tcon; 959 struct cifs_tcon *tcon;
772}; 960};
773 961
962struct cifs_readdata;
963
964/* asynchronous read support */
965struct cifs_readdata {
966 struct kref refcount;
967 struct list_head list;
968 struct completion done;
969 struct cifsFileInfo *cfile;
970 struct address_space *mapping;
971 __u64 offset;
972 unsigned int bytes;
973 pid_t pid;
974 int result;
975 struct work_struct work;
976 int (*read_into_pages)(struct TCP_Server_Info *server,
977 struct cifs_readdata *rdata,
978 unsigned int len);
979 struct kvec iov;
980 unsigned int pagesz;
981 unsigned int tailsz;
982 unsigned int nr_pages;
983 struct page *pages[];
984};
985
986struct cifs_writedata;
987
988/* asynchronous write support */
989struct cifs_writedata {
990 struct kref refcount;
991 struct list_head list;
992 struct completion done;
993 enum writeback_sync_modes sync_mode;
994 struct work_struct work;
995 struct cifsFileInfo *cfile;
996 __u64 offset;
997 pid_t pid;
998 unsigned int bytes;
999 int result;
1000 unsigned int pagesz;
1001 unsigned int tailsz;
1002 unsigned int nr_pages;
1003 struct page *pages[1];
1004};
1005
774/* 1006/*
775 * Take a reference on the file private data. Must be called with 1007 * Take a reference on the file private data. Must be called with
776 * cifs_file_list_lock held. 1008 * cifs_file_list_lock held.
@@ -790,11 +1022,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
790 1022
791struct cifsInodeInfo { 1023struct cifsInodeInfo {
792 bool can_cache_brlcks; 1024 bool can_cache_brlcks;
793 struct mutex lock_mutex; /* 1025 struct list_head llist; /* locks helb by this inode */
794 * protect the field above and llist 1026 struct rw_semaphore lock_sem; /* protect the fields above */
795 * from every cifsFileInfo structure
796 * from openFileList
797 */
798 /* BB add in lists for dirty pages i.e. write caching info for oplock */ 1027 /* BB add in lists for dirty pages i.e. write caching info for oplock */
799 struct list_head openFileList; 1028 struct list_head openFileList;
800 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ 1029 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
@@ -806,6 +1035,9 @@ struct cifsInodeInfo {
806 u64 server_eof; /* current file size on server -- protected by i_lock */ 1035 u64 server_eof; /* current file size on server -- protected by i_lock */
807 u64 uniqueid; /* server inode number */ 1036 u64 uniqueid; /* server inode number */
808 u64 createtime; /* creation time on server */ 1037 u64 createtime; /* creation time on server */
1038#ifdef CONFIG_CIFS_SMB2
1039 __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for this inode */
1040#endif
809#ifdef CONFIG_CIFS_FSCACHE 1041#ifdef CONFIG_CIFS_FSCACHE
810 struct fscache_cookie *fscache; 1042 struct fscache_cookie *fscache;
811#endif 1043#endif
@@ -1130,7 +1362,7 @@ require use of the stronger protocol */
1130#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ 1362#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */
1131#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ 1363#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */
1132 1364
1133#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2) 1365#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP)
1134#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) 1366#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
1135#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP) 1367#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
1136/* 1368/*
@@ -1267,7 +1499,13 @@ extern mempool_t *cifs_mid_poolp;
1267#define SMB1_VERSION_STRING "1.0" 1499#define SMB1_VERSION_STRING "1.0"
1268extern struct smb_version_operations smb1_operations; 1500extern struct smb_version_operations smb1_operations;
1269extern struct smb_version_values smb1_values; 1501extern struct smb_version_values smb1_values;
1502#define SMB20_VERSION_STRING "2.0"
1503/*extern struct smb_version_operations smb20_operations; */ /* not needed yet */
1504extern struct smb_version_values smb20_values;
1270#define SMB21_VERSION_STRING "2.1" 1505#define SMB21_VERSION_STRING "2.1"
1271extern struct smb_version_operations smb21_operations; 1506extern struct smb_version_operations smb21_operations;
1272extern struct smb_version_values smb21_values; 1507extern struct smb_version_values smb21_values;
1508#define SMB30_VERSION_STRING "3.0"
1509/*extern struct smb_version_operations smb30_operations; */ /* not needed yet */
1510extern struct smb_version_values smb30_values;
1273#endif /* _CIFS_GLOB_H */ 1511#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 3fb03e2c8e86..b9d59a948a2c 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -2210,7 +2210,7 @@ typedef struct { /* data block encoding of response to level 263 QPathInfo */
2210 __u8 DeletePending; 2210 __u8 DeletePending;
2211 __u8 Directory; 2211 __u8 Directory;
2212 __u16 Pad2; 2212 __u16 Pad2;
2213 __u64 IndexNumber; 2213 __le64 IndexNumber;
2214 __le32 EASize; 2214 __le32 EASize;
2215 __le32 AccessFlags; 2215 __le32 AccessFlags;
2216 __u64 IndexNumber1; 2216 __u64 IndexNumber1;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index f1bbf8305d3a..5144e9fbeb8c 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -24,6 +24,7 @@
24 24
25struct statfs; 25struct statfs;
26struct smb_vol; 26struct smb_vol;
27struct smb_rqst;
27 28
28/* 29/*
29 ***************************************************************** 30 *****************************************************************
@@ -35,6 +36,8 @@ extern struct smb_hdr *cifs_buf_get(void);
35extern void cifs_buf_release(void *); 36extern void cifs_buf_release(void *);
36extern struct smb_hdr *cifs_small_buf_get(void); 37extern struct smb_hdr *cifs_small_buf_get(void);
37extern void cifs_small_buf_release(void *); 38extern void cifs_small_buf_release(void *);
39extern void cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx,
40 struct kvec *iov);
38extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *, 41extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
39 unsigned int /* length */); 42 unsigned int /* length */);
40extern unsigned int _get_xid(void); 43extern unsigned int _get_xid(void);
@@ -65,21 +68,22 @@ extern char *cifs_compose_mount_options(const char *sb_mountdata,
65extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer, 68extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
66 struct TCP_Server_Info *server); 69 struct TCP_Server_Info *server);
67extern void DeleteMidQEntry(struct mid_q_entry *midEntry); 70extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
71extern void cifs_delete_mid(struct mid_q_entry *mid);
68extern void cifs_wake_up_task(struct mid_q_entry *mid); 72extern void cifs_wake_up_task(struct mid_q_entry *mid);
69extern int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, 73extern int cifs_call_async(struct TCP_Server_Info *server,
70 unsigned int nvec, mid_receive_t *receive, 74 struct smb_rqst *rqst,
71 mid_callback_t *callback, void *cbdata, 75 mid_receive_t *receive, mid_callback_t *callback,
72 const int flags); 76 void *cbdata, const int flags);
73extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *, 77extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *,
74 struct smb_hdr * /* input */ , 78 struct smb_hdr * /* input */ ,
75 struct smb_hdr * /* out */ , 79 struct smb_hdr * /* out */ ,
76 int * /* bytes returned */ , const int); 80 int * /* bytes returned */ , const int);
77extern int SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses, 81extern int SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses,
78 char *in_buf, int flags); 82 char *in_buf, int flags);
79extern int cifs_setup_request(struct cifs_ses *, struct kvec *, unsigned int, 83extern struct mid_q_entry *cifs_setup_request(struct cifs_ses *,
80 struct mid_q_entry **); 84 struct smb_rqst *);
81extern int cifs_setup_async_request(struct TCP_Server_Info *, struct kvec *, 85extern struct mid_q_entry *cifs_setup_async_request(struct TCP_Server_Info *,
82 unsigned int, struct mid_q_entry **); 86 struct smb_rqst *);
83extern int cifs_check_receive(struct mid_q_entry *mid, 87extern int cifs_check_receive(struct mid_q_entry *mid,
84 struct TCP_Server_Info *server, bool log_error); 88 struct TCP_Server_Info *server, bool log_error);
85extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *, 89extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
@@ -99,7 +103,7 @@ extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
99 unsigned int bytes_written); 103 unsigned int bytes_written);
100extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); 104extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
101extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); 105extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
102extern unsigned int smbCalcSize(struct smb_hdr *ptr); 106extern unsigned int smbCalcSize(void *buf);
103extern int decode_negTokenInit(unsigned char *security_blob, int length, 107extern int decode_negTokenInit(unsigned char *security_blob, int length,
104 struct TCP_Server_Info *server); 108 struct TCP_Server_Info *server);
105extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); 109extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
@@ -120,10 +124,14 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
120extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, 124extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
121 int offset); 125 int offset);
122extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); 126extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
123 127extern int cifs_unlock_range(struct cifsFileInfo *cfile,
124extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle, 128 struct file_lock *flock, const unsigned int xid);
125 struct file *file, struct tcon_link *tlink, 129extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile);
126 __u32 oplock); 130
131extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid,
132 struct file *file,
133 struct tcon_link *tlink,
134 __u32 oplock);
127extern int cifs_posix_open(char *full_path, struct inode **inode, 135extern int cifs_posix_open(char *full_path, struct inode **inode,
128 struct super_block *sb, int mode, 136 struct super_block *sb, int mode,
129 unsigned int f_flags, __u32 *oplock, __u16 *netfid, 137 unsigned int f_flags, __u32 *oplock, __u16 *netfid,
@@ -132,18 +140,23 @@ void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr);
132extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, 140extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
133 FILE_UNIX_BASIC_INFO *info, 141 FILE_UNIX_BASIC_INFO *info,
134 struct cifs_sb_info *cifs_sb); 142 struct cifs_sb_info *cifs_sb);
143extern void cifs_dir_info_to_fattr(struct cifs_fattr *, FILE_DIRECTORY_INFO *,
144 struct cifs_sb_info *);
135extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr); 145extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
136extern struct inode *cifs_iget(struct super_block *sb, 146extern struct inode *cifs_iget(struct super_block *sb,
137 struct cifs_fattr *fattr); 147 struct cifs_fattr *fattr);
138 148
139extern int cifs_get_file_info(struct file *filp);
140extern int cifs_get_inode_info(struct inode **inode, const char *full_path, 149extern int cifs_get_inode_info(struct inode **inode, const char *full_path,
141 FILE_ALL_INFO *data, struct super_block *sb, 150 FILE_ALL_INFO *data, struct super_block *sb,
142 int xid, const __u16 *fid); 151 int xid, const __u16 *fid);
143extern int cifs_get_file_info_unix(struct file *filp);
144extern int cifs_get_inode_info_unix(struct inode **pinode, 152extern int cifs_get_inode_info_unix(struct inode **pinode,
145 const unsigned char *search_path, 153 const unsigned char *search_path,
146 struct super_block *sb, unsigned int xid); 154 struct super_block *sb, unsigned int xid);
155extern int cifs_set_file_info(struct inode *inode, struct iattr *attrs,
156 unsigned int xid, char *full_path, __u32 dosattr);
157extern int cifs_rename_pending_delete(const char *full_path,
158 struct dentry *dentry,
159 const unsigned int xid);
147extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, 160extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
148 struct cifs_fattr *fattr, struct inode *inode, 161 struct cifs_fattr *fattr, struct inode *inode,
149 const char *path, const __u16 *pfid); 162 const char *path, const __u16 *pfid);
@@ -169,6 +182,17 @@ extern struct smb_vol *cifs_get_volume_info(char *mount_data,
169extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *); 182extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *);
170extern void cifs_umount(struct cifs_sb_info *); 183extern void cifs_umount(struct cifs_sb_info *);
171extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon); 184extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon);
185extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset,
186 __u64 length, __u8 type,
187 struct cifsLockInfo **conf_lock,
188 bool rw_check);
189extern void cifs_add_pending_open(struct cifs_fid *fid,
190 struct tcon_link *tlink,
191 struct cifs_pending_open *open);
192extern void cifs_add_pending_open_locked(struct cifs_fid *fid,
193 struct tcon_link *tlink,
194 struct cifs_pending_open *open);
195extern void cifs_del_pending_open(struct cifs_pending_open *open);
172 196
173#if IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) 197#if IS_ENABLED(CONFIG_CIFS_DFS_UPCALL)
174extern void cifs_dfs_release_automount_timer(void); 198extern void cifs_dfs_release_automount_timer(void);
@@ -179,6 +203,10 @@ extern void cifs_dfs_release_automount_timer(void);
179void cifs_proc_init(void); 203void cifs_proc_init(void);
180void cifs_proc_clean(void); 204void cifs_proc_clean(void);
181 205
206extern void cifs_move_llist(struct list_head *source, struct list_head *dest);
207extern void cifs_free_llist(struct list_head *llist);
208extern void cifs_del_lock_waiters(struct cifsLockInfo *lock);
209
182extern int cifs_negotiate_protocol(const unsigned int xid, 210extern int cifs_negotiate_protocol(const unsigned int xid,
183 struct cifs_ses *ses); 211 struct cifs_ses *ses);
184extern int cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, 212extern int cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
@@ -190,10 +218,10 @@ extern int CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
190 const struct nls_table *); 218 const struct nls_table *);
191 219
192extern int CIFSFindFirst(const unsigned int xid, struct cifs_tcon *tcon, 220extern int CIFSFindFirst(const unsigned int xid, struct cifs_tcon *tcon,
193 const char *searchName, const struct nls_table *nls_codepage, 221 const char *searchName, struct cifs_sb_info *cifs_sb,
194 __u16 *searchHandle, __u16 search_flags, 222 __u16 *searchHandle, __u16 search_flags,
195 struct cifs_search_info *psrch_inf, 223 struct cifs_search_info *psrch_inf,
196 int map, const char dirsep); 224 bool msearch);
197 225
198extern int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon, 226extern int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon,
199 __u16 searchHandle, __u16 search_flags, 227 __u16 searchHandle, __u16 search_flags,
@@ -265,13 +293,11 @@ extern int CIFSSMBSetAttrLegacy(unsigned int xid, struct cifs_tcon *tcon,
265 const struct nls_table *nls_codepage); 293 const struct nls_table *nls_codepage);
266#endif /* possibly unneeded function */ 294#endif /* possibly unneeded function */
267extern int CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon, 295extern int CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon,
268 const char *fileName, __u64 size, 296 const char *file_name, __u64 size,
269 bool setAllocationSizeFlag, 297 struct cifs_sb_info *cifs_sb, bool set_allocation);
270 const struct nls_table *nls_codepage,
271 int remap_special_chars);
272extern int CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon, 298extern int CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon,
273 __u64 size, __u16 fileHandle, __u32 opener_pid, 299 struct cifsFileInfo *cfile, __u64 size,
274 bool AllocSizeFlag); 300 bool set_allocation);
275 301
276struct cifs_unix_set_info_args { 302struct cifs_unix_set_info_args {
277 __u64 ctime; 303 __u64 ctime;
@@ -303,22 +329,17 @@ extern int CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon,
303 const struct nls_table *nls_codepage, 329 const struct nls_table *nls_codepage,
304 int remap_special_chars); 330 int remap_special_chars);
305extern int CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon, 331extern int CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon,
306 const char *name, 332 const char *name, struct cifs_sb_info *cifs_sb);
307 const struct nls_table *nls_codepage,
308 int remap_special_chars);
309extern int CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon, 333extern int CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon,
310 const char *fromName, const char *toName, 334 const char *from_name, const char *to_name,
311 const struct nls_table *nls_codepage, 335 struct cifs_sb_info *cifs_sb);
312 int remap_special_chars);
313extern int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *tcon, 336extern int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *tcon,
314 int netfid, const char *target_name, 337 int netfid, const char *target_name,
315 const struct nls_table *nls_codepage, 338 const struct nls_table *nls_codepage,
316 int remap_special_chars); 339 int remap_special_chars);
317extern int CIFSCreateHardLink(const unsigned int xid, 340extern int CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon,
318 struct cifs_tcon *tcon, 341 const char *from_name, const char *to_name,
319 const char *fromName, const char *toName, 342 struct cifs_sb_info *cifs_sb);
320 const struct nls_table *nls_codepage,
321 int remap_special_chars);
322extern int CIFSUnixCreateHardLink(const unsigned int xid, 343extern int CIFSUnixCreateHardLink(const unsigned int xid,
323 struct cifs_tcon *tcon, 344 struct cifs_tcon *tcon,
324 const char *fromName, const char *toName, 345 const char *fromName, const char *toName,
@@ -367,8 +388,7 @@ extern int CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms,
367 unsigned int *nbytes, const char *buf, 388 unsigned int *nbytes, const char *buf,
368 const char __user *ubuf, const int long_op); 389 const char __user *ubuf, const int long_op);
369extern int CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms, 390extern int CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms,
370 unsigned int *nbytes, struct kvec *iov, const int nvec, 391 unsigned int *nbytes, struct kvec *iov, const int nvec);
371 const int long_op);
372extern int CIFSGetSrvInodeNumber(const unsigned int xid, struct cifs_tcon *tcon, 392extern int CIFSGetSrvInodeNumber(const unsigned int xid, struct cifs_tcon *tcon,
373 const char *search_name, __u64 *inode_number, 393 const char *search_name, __u64 *inode_number,
374 const struct nls_table *nls_codepage, 394 const struct nls_table *nls_codepage,
@@ -397,10 +417,12 @@ extern void sesInfoFree(struct cifs_ses *);
397extern struct cifs_tcon *tconInfoAlloc(void); 417extern struct cifs_tcon *tconInfoAlloc(void);
398extern void tconInfoFree(struct cifs_tcon *); 418extern void tconInfoFree(struct cifs_tcon *);
399 419
400extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *); 420extern int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server,
421 __u32 *pexpected_response_sequence_number);
401extern int cifs_sign_smbv(struct kvec *iov, int n_vec, struct TCP_Server_Info *, 422extern int cifs_sign_smbv(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
402 __u32 *); 423 __u32 *);
403extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, 424extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
425extern int cifs_verify_signature(struct smb_rqst *rqst,
404 struct TCP_Server_Info *server, 426 struct TCP_Server_Info *server,
405 __u32 expected_sequence_number); 427 __u32 expected_sequence_number);
406extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *, 428extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *,
@@ -462,45 +484,9 @@ extern int E_md4hash(const unsigned char *passwd, unsigned char *p16,
462extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, 484extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
463 unsigned char *p24); 485 unsigned char *p24);
464 486
465/* asynchronous read support */
466struct cifs_readdata {
467 struct kref refcount;
468 struct list_head list;
469 struct completion done;
470 struct cifsFileInfo *cfile;
471 struct address_space *mapping;
472 __u64 offset;
473 unsigned int bytes;
474 pid_t pid;
475 int result;
476 struct list_head pages;
477 struct work_struct work;
478 int (*marshal_iov) (struct cifs_readdata *rdata,
479 unsigned int remaining);
480 unsigned int nr_iov;
481 struct kvec iov[1];
482};
483
484void cifs_readdata_release(struct kref *refcount); 487void cifs_readdata_release(struct kref *refcount);
485int cifs_async_readv(struct cifs_readdata *rdata); 488int cifs_async_readv(struct cifs_readdata *rdata);
486 489int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid);
487/* asynchronous write support */
488struct cifs_writedata {
489 struct kref refcount;
490 struct list_head list;
491 struct completion done;
492 enum writeback_sync_modes sync_mode;
493 struct work_struct work;
494 struct cifsFileInfo *cfile;
495 __u64 offset;
496 pid_t pid;
497 unsigned int bytes;
498 int result;
499 void (*marshal_iov) (struct kvec *iov,
500 struct cifs_writedata *wdata);
501 unsigned int nr_pages;
502 struct page *pages[1];
503};
504 490
505int cifs_async_writev(struct cifs_writedata *wdata); 491int cifs_async_writev(struct cifs_writedata *wdata);
506void cifs_writev_complete(struct work_struct *work); 492void cifs_writev_complete(struct work_struct *work);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index f0cf934ba877..76d0d2998850 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -86,32 +86,6 @@ static struct {
86#endif /* CONFIG_CIFS_WEAK_PW_HASH */ 86#endif /* CONFIG_CIFS_WEAK_PW_HASH */
87#endif /* CIFS_POSIX */ 87#endif /* CIFS_POSIX */
88 88
89#ifdef CONFIG_HIGHMEM
90/*
91 * On arches that have high memory, kmap address space is limited. By
92 * serializing the kmap operations on those arches, we ensure that we don't
93 * end up with a bunch of threads in writeback with partially mapped page
94 * arrays, stuck waiting for kmap to come back. That situation prevents
95 * progress and can deadlock.
96 */
97static DEFINE_MUTEX(cifs_kmap_mutex);
98
99static inline void
100cifs_kmap_lock(void)
101{
102 mutex_lock(&cifs_kmap_mutex);
103}
104
105static inline void
106cifs_kmap_unlock(void)
107{
108 mutex_unlock(&cifs_kmap_mutex);
109}
110#else /* !CONFIG_HIGHMEM */
111#define cifs_kmap_lock() do { ; } while(0)
112#define cifs_kmap_unlock() do { ; } while(0)
113#endif /* CONFIG_HIGHMEM */
114
115/* 89/*
116 * Mark as invalid, all open files on tree connections since they 90 * Mark as invalid, all open files on tree connections since they
117 * were closed when session to server was lost. 91 * were closed when session to server was lost.
@@ -751,6 +725,8 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
751 ECHO_REQ *smb; 725 ECHO_REQ *smb;
752 int rc = 0; 726 int rc = 0;
753 struct kvec iov; 727 struct kvec iov;
728 struct smb_rqst rqst = { .rq_iov = &iov,
729 .rq_nvec = 1 };
754 730
755 cFYI(1, "In echo request"); 731 cFYI(1, "In echo request");
756 732
@@ -768,7 +744,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
768 iov.iov_base = smb; 744 iov.iov_base = smb;
769 iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; 745 iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4;
770 746
771 rc = cifs_call_async(server, &iov, 1, NULL, cifs_echo_callback, 747 rc = cifs_call_async(server, &rqst, NULL, cifs_echo_callback,
772 server, CIFS_ASYNC_OP | CIFS_ECHO_OP); 748 server, CIFS_ASYNC_OP | CIFS_ECHO_OP);
773 if (rc) 749 if (rc)
774 cFYI(1, "Echo request failed: %d", rc); 750 cFYI(1, "Echo request failed: %d", rc);
@@ -902,15 +878,15 @@ PsxDelete:
902} 878}
903 879
904int 880int
905CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon, 881CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
906 const char *fileName, const struct nls_table *nls_codepage, 882 struct cifs_sb_info *cifs_sb)
907 int remap)
908{ 883{
909 DELETE_FILE_REQ *pSMB = NULL; 884 DELETE_FILE_REQ *pSMB = NULL;
910 DELETE_FILE_RSP *pSMBr = NULL; 885 DELETE_FILE_RSP *pSMBr = NULL;
911 int rc = 0; 886 int rc = 0;
912 int bytes_returned; 887 int bytes_returned;
913 int name_len; 888 int name_len;
889 int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
914 890
915DelFileRetry: 891DelFileRetry:
916 rc = smb_init(SMB_COM_DELETE, 1, tcon, (void **) &pSMB, 892 rc = smb_init(SMB_COM_DELETE, 1, tcon, (void **) &pSMB,
@@ -919,15 +895,15 @@ DelFileRetry:
919 return rc; 895 return rc;
920 896
921 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 897 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
922 name_len = 898 name_len = cifsConvertToUTF16((__le16 *) pSMB->fileName, name,
923 cifsConvertToUTF16((__le16 *) pSMB->fileName, fileName, 899 PATH_MAX, cifs_sb->local_nls,
924 PATH_MAX, nls_codepage, remap); 900 remap);
925 name_len++; /* trailing null */ 901 name_len++; /* trailing null */
926 name_len *= 2; 902 name_len *= 2;
927 } else { /* BB improve check for buffer overruns BB */ 903 } else { /* BB improve check for buffer overruns BB */
928 name_len = strnlen(fileName, PATH_MAX); 904 name_len = strnlen(name, PATH_MAX);
929 name_len++; /* trailing null */ 905 name_len++; /* trailing null */
930 strncpy(pSMB->fileName, fileName, name_len); 906 strncpy(pSMB->fileName, name, name_len);
931 } 907 }
932 pSMB->SearchAttributes = 908 pSMB->SearchAttributes =
933 cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM); 909 cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM);
@@ -1440,7 +1416,7 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
1440 return 0; 1416 return 0;
1441} 1417}
1442 1418
1443static int 1419int
1444cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) 1420cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
1445{ 1421{
1446 int length, len; 1422 int length, len;
@@ -1460,10 +1436,10 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
1460 len = min_t(unsigned int, buflen, server->vals->read_rsp_size) - 1436 len = min_t(unsigned int, buflen, server->vals->read_rsp_size) -
1461 HEADER_SIZE(server) + 1; 1437 HEADER_SIZE(server) + 1;
1462 1438
1463 rdata->iov[0].iov_base = buf + HEADER_SIZE(server) - 1; 1439 rdata->iov.iov_base = buf + HEADER_SIZE(server) - 1;
1464 rdata->iov[0].iov_len = len; 1440 rdata->iov.iov_len = len;
1465 1441
1466 length = cifs_readv_from_socket(server, rdata->iov, 1, len); 1442 length = cifs_readv_from_socket(server, &rdata->iov, 1, len);
1467 if (length < 0) 1443 if (length < 0)
1468 return length; 1444 return length;
1469 server->total_read += length; 1445 server->total_read += length;
@@ -1509,19 +1485,19 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
1509 len = data_offset - server->total_read; 1485 len = data_offset - server->total_read;
1510 if (len > 0) { 1486 if (len > 0) {
1511 /* read any junk before data into the rest of smallbuf */ 1487 /* read any junk before data into the rest of smallbuf */
1512 rdata->iov[0].iov_base = buf + server->total_read; 1488 rdata->iov.iov_base = buf + server->total_read;
1513 rdata->iov[0].iov_len = len; 1489 rdata->iov.iov_len = len;
1514 length = cifs_readv_from_socket(server, rdata->iov, 1, len); 1490 length = cifs_readv_from_socket(server, &rdata->iov, 1, len);
1515 if (length < 0) 1491 if (length < 0)
1516 return length; 1492 return length;
1517 server->total_read += length; 1493 server->total_read += length;
1518 } 1494 }
1519 1495
1520 /* set up first iov for signature check */ 1496 /* set up first iov for signature check */
1521 rdata->iov[0].iov_base = buf; 1497 rdata->iov.iov_base = buf;
1522 rdata->iov[0].iov_len = server->total_read; 1498 rdata->iov.iov_len = server->total_read;
1523 cFYI(1, "0: iov_base=%p iov_len=%zu", 1499 cFYI(1, "0: iov_base=%p iov_len=%zu",
1524 rdata->iov[0].iov_base, rdata->iov[0].iov_len); 1500 rdata->iov.iov_base, rdata->iov.iov_len);
1525 1501
1526 /* how much data is in the response? */ 1502 /* how much data is in the response? */
1527 data_len = server->ops->read_data_length(buf); 1503 data_len = server->ops->read_data_length(buf);
@@ -1531,23 +1507,11 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
1531 return cifs_readv_discard(server, mid); 1507 return cifs_readv_discard(server, mid);
1532 } 1508 }
1533 1509
1534 /* marshal up the page array */ 1510 length = rdata->read_into_pages(server, rdata, data_len);
1535 cifs_kmap_lock(); 1511 if (length < 0)
1536 len = rdata->marshal_iov(rdata, data_len); 1512 return length;
1537 cifs_kmap_unlock();
1538 data_len -= len;
1539
1540 /* issue the read if we have any iovecs left to fill */
1541 if (rdata->nr_iov > 1) {
1542 length = cifs_readv_from_socket(server, &rdata->iov[1],
1543 rdata->nr_iov - 1, len);
1544 if (length < 0)
1545 return length;
1546 server->total_read += length;
1547 } else {
1548 length = 0;
1549 }
1550 1513
1514 server->total_read += length;
1551 rdata->bytes = length; 1515 rdata->bytes = length;
1552 1516
1553 cFYI(1, "total_read=%u buflen=%u remaining=%u", server->total_read, 1517 cFYI(1, "total_read=%u buflen=%u remaining=%u", server->total_read,
@@ -1567,6 +1531,12 @@ cifs_readv_callback(struct mid_q_entry *mid)
1567 struct cifs_readdata *rdata = mid->callback_data; 1531 struct cifs_readdata *rdata = mid->callback_data;
1568 struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); 1532 struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
1569 struct TCP_Server_Info *server = tcon->ses->server; 1533 struct TCP_Server_Info *server = tcon->ses->server;
1534 struct smb_rqst rqst = { .rq_iov = &rdata->iov,
1535 .rq_nvec = 1,
1536 .rq_pages = rdata->pages,
1537 .rq_npages = rdata->nr_pages,
1538 .rq_pagesz = rdata->pagesz,
1539 .rq_tailsz = rdata->tailsz };
1570 1540
1571 cFYI(1, "%s: mid=%llu state=%d result=%d bytes=%u", __func__, 1541 cFYI(1, "%s: mid=%llu state=%d result=%d bytes=%u", __func__,
1572 mid->mid, mid->mid_state, rdata->result, rdata->bytes); 1542 mid->mid, mid->mid_state, rdata->result, rdata->bytes);
@@ -1578,9 +1548,8 @@ cifs_readv_callback(struct mid_q_entry *mid)
1578 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { 1548 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
1579 int rc = 0; 1549 int rc = 0;
1580 1550
1581 rc = cifs_verify_signature(rdata->iov, rdata->nr_iov, 1551 rc = cifs_verify_signature(&rqst, server,
1582 server, 1552 mid->sequence_number + 1);
1583 mid->sequence_number + 1);
1584 if (rc) 1553 if (rc)
1585 cERROR(1, "SMB signature verification returned " 1554 cERROR(1, "SMB signature verification returned "
1586 "error = %d", rc); 1555 "error = %d", rc);
@@ -1610,6 +1579,8 @@ cifs_async_readv(struct cifs_readdata *rdata)
1610 READ_REQ *smb = NULL; 1579 READ_REQ *smb = NULL;
1611 int wct; 1580 int wct;
1612 struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); 1581 struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
1582 struct smb_rqst rqst = { .rq_iov = &rdata->iov,
1583 .rq_nvec = 1 };
1613 1584
1614 cFYI(1, "%s: offset=%llu bytes=%u", __func__, 1585 cFYI(1, "%s: offset=%llu bytes=%u", __func__,
1615 rdata->offset, rdata->bytes); 1586 rdata->offset, rdata->bytes);
@@ -1632,7 +1603,7 @@ cifs_async_readv(struct cifs_readdata *rdata)
1632 smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16)); 1603 smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16));
1633 1604
1634 smb->AndXCommand = 0xFF; /* none */ 1605 smb->AndXCommand = 0xFF; /* none */
1635 smb->Fid = rdata->cfile->netfid; 1606 smb->Fid = rdata->cfile->fid.netfid;
1636 smb->OffsetLow = cpu_to_le32(rdata->offset & 0xFFFFFFFF); 1607 smb->OffsetLow = cpu_to_le32(rdata->offset & 0xFFFFFFFF);
1637 if (wct == 12) 1608 if (wct == 12)
1638 smb->OffsetHigh = cpu_to_le32(rdata->offset >> 32); 1609 smb->OffsetHigh = cpu_to_le32(rdata->offset >> 32);
@@ -1649,13 +1620,12 @@ cifs_async_readv(struct cifs_readdata *rdata)
1649 } 1620 }
1650 1621
1651 /* 4 for RFC1001 length + 1 for BCC */ 1622 /* 4 for RFC1001 length + 1 for BCC */
1652 rdata->iov[0].iov_base = smb; 1623 rdata->iov.iov_base = smb;
1653 rdata->iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; 1624 rdata->iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4;
1654 1625
1655 kref_get(&rdata->refcount); 1626 kref_get(&rdata->refcount);
1656 rc = cifs_call_async(tcon->ses->server, rdata->iov, 1, 1627 rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive,
1657 cifs_readv_receive, cifs_readv_callback, 1628 cifs_readv_callback, rdata, 0);
1658 rdata, 0);
1659 1629
1660 if (rc == 0) 1630 if (rc == 0)
1661 cifs_stats_inc(&tcon->stats.cifs_stats.num_reads); 1631 cifs_stats_inc(&tcon->stats.cifs_stats.num_reads);
@@ -1926,6 +1896,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
1926{ 1896{
1927 int i, rc; 1897 int i, rc;
1928 struct inode *inode = wdata->cfile->dentry->d_inode; 1898 struct inode *inode = wdata->cfile->dentry->d_inode;
1899 struct TCP_Server_Info *server;
1929 1900
1930 for (i = 0; i < wdata->nr_pages; i++) { 1901 for (i = 0; i < wdata->nr_pages; i++) {
1931 lock_page(wdata->pages[i]); 1902 lock_page(wdata->pages[i]);
@@ -1933,7 +1904,8 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
1933 } 1904 }
1934 1905
1935 do { 1906 do {
1936 rc = cifs_async_writev(wdata); 1907 server = tlink_tcon(wdata->cfile->tlink)->ses->server;
1908 rc = server->ops->async_writev(wdata);
1937 } while (rc == -EAGAIN); 1909 } while (rc == -EAGAIN);
1938 1910
1939 for (i = 0; i < wdata->nr_pages; i++) { 1911 for (i = 0; i < wdata->nr_pages; i++) {
@@ -2053,11 +2025,12 @@ cifs_writev_callback(struct mid_q_entry *mid)
2053int 2025int
2054cifs_async_writev(struct cifs_writedata *wdata) 2026cifs_async_writev(struct cifs_writedata *wdata)
2055{ 2027{
2056 int i, rc = -EACCES; 2028 int rc = -EACCES;
2057 WRITE_REQ *smb = NULL; 2029 WRITE_REQ *smb = NULL;
2058 int wct; 2030 int wct;
2059 struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); 2031 struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
2060 struct kvec *iov = NULL; 2032 struct kvec iov;
2033 struct smb_rqst rqst = { };
2061 2034
2062 if (tcon->ses->capabilities & CAP_LARGE_FILES) { 2035 if (tcon->ses->capabilities & CAP_LARGE_FILES) {
2063 wct = 14; 2036 wct = 14;
@@ -2073,18 +2046,11 @@ cifs_async_writev(struct cifs_writedata *wdata)
2073 if (rc) 2046 if (rc)
2074 goto async_writev_out; 2047 goto async_writev_out;
2075 2048
2076 /* 1 iov per page + 1 for header */
2077 iov = kzalloc((wdata->nr_pages + 1) * sizeof(*iov), GFP_NOFS);
2078 if (iov == NULL) {
2079 rc = -ENOMEM;
2080 goto async_writev_out;
2081 }
2082
2083 smb->hdr.Pid = cpu_to_le16((__u16)wdata->pid); 2049 smb->hdr.Pid = cpu_to_le16((__u16)wdata->pid);
2084 smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->pid >> 16)); 2050 smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->pid >> 16));
2085 2051
2086 smb->AndXCommand = 0xFF; /* none */ 2052 smb->AndXCommand = 0xFF; /* none */
2087 smb->Fid = wdata->cfile->netfid; 2053 smb->Fid = wdata->cfile->fid.netfid;
2088 smb->OffsetLow = cpu_to_le32(wdata->offset & 0xFFFFFFFF); 2054 smb->OffsetLow = cpu_to_le32(wdata->offset & 0xFFFFFFFF);
2089 if (wct == 14) 2055 if (wct == 14)
2090 smb->OffsetHigh = cpu_to_le32(wdata->offset >> 32); 2056 smb->OffsetHigh = cpu_to_le32(wdata->offset >> 32);
@@ -2096,18 +2062,15 @@ cifs_async_writev(struct cifs_writedata *wdata)
2096 cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4); 2062 cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4);
2097 2063
2098 /* 4 for RFC1001 length + 1 for BCC */ 2064 /* 4 for RFC1001 length + 1 for BCC */
2099 iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4 + 1; 2065 iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4 + 1;
2100 iov[0].iov_base = smb; 2066 iov.iov_base = smb;
2101 2067
2102 /* 2068 rqst.rq_iov = &iov;
2103 * This function should marshal up the page array into the kvec 2069 rqst.rq_nvec = 1;
2104 * array, reserving [0] for the header. It should kmap the pages 2070 rqst.rq_pages = wdata->pages;
2105 * and set the iov_len properly for each one. It may also set 2071 rqst.rq_npages = wdata->nr_pages;
2106 * wdata->bytes too. 2072 rqst.rq_pagesz = wdata->pagesz;
2107 */ 2073 rqst.rq_tailsz = wdata->tailsz;
2108 cifs_kmap_lock();
2109 wdata->marshal_iov(iov, wdata);
2110 cifs_kmap_unlock();
2111 2074
2112 cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes); 2075 cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes);
2113 2076
@@ -2123,32 +2086,26 @@ cifs_async_writev(struct cifs_writedata *wdata)
2123 (struct smb_com_writex_req *)smb; 2086 (struct smb_com_writex_req *)smb;
2124 inc_rfc1001_len(&smbw->hdr, wdata->bytes + 5); 2087 inc_rfc1001_len(&smbw->hdr, wdata->bytes + 5);
2125 put_bcc(wdata->bytes + 5, &smbw->hdr); 2088 put_bcc(wdata->bytes + 5, &smbw->hdr);
2126 iov[0].iov_len += 4; /* pad bigger by four bytes */ 2089 iov.iov_len += 4; /* pad bigger by four bytes */
2127 } 2090 }
2128 2091
2129 kref_get(&wdata->refcount); 2092 kref_get(&wdata->refcount);
2130 rc = cifs_call_async(tcon->ses->server, iov, wdata->nr_pages + 1, 2093 rc = cifs_call_async(tcon->ses->server, &rqst, NULL,
2131 NULL, cifs_writev_callback, wdata, 0); 2094 cifs_writev_callback, wdata, 0);
2132 2095
2133 if (rc == 0) 2096 if (rc == 0)
2134 cifs_stats_inc(&tcon->stats.cifs_stats.num_writes); 2097 cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
2135 else 2098 else
2136 kref_put(&wdata->refcount, cifs_writedata_release); 2099 kref_put(&wdata->refcount, cifs_writedata_release);
2137 2100
2138 /* send is done, unmap pages */
2139 for (i = 0; i < wdata->nr_pages; i++)
2140 kunmap(wdata->pages[i]);
2141
2142async_writev_out: 2101async_writev_out:
2143 cifs_small_buf_release(smb); 2102 cifs_small_buf_release(smb);
2144 kfree(iov);
2145 return rc; 2103 return rc;
2146} 2104}
2147 2105
2148int 2106int
2149CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms, 2107CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms,
2150 unsigned int *nbytes, struct kvec *iov, int n_vec, 2108 unsigned int *nbytes, struct kvec *iov, int n_vec)
2151 const int long_op)
2152{ 2109{
2153 int rc = -EACCES; 2110 int rc = -EACCES;
2154 WRITE_REQ *pSMB = NULL; 2111 WRITE_REQ *pSMB = NULL;
@@ -2219,8 +2176,7 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms,
2219 iov[0].iov_len = smb_hdr_len + 8; 2176 iov[0].iov_len = smb_hdr_len + 8;
2220 2177
2221 2178
2222 rc = SendReceive2(xid, tcon->ses, iov, n_vec + 1, &resp_buf_type, 2179 rc = SendReceive2(xid, tcon->ses, iov, n_vec + 1, &resp_buf_type, 0);
2223 long_op);
2224 cifs_stats_inc(&tcon->stats.cifs_stats.num_writes); 2180 cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
2225 if (rc) { 2181 if (rc) {
2226 cFYI(1, "Send error Write2 = %d", rc); 2182 cFYI(1, "Send error Write2 = %d", rc);
@@ -2557,8 +2513,8 @@ CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id)
2557 2513
2558int 2514int
2559CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon, 2515CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon,
2560 const char *fromName, const char *toName, 2516 const char *from_name, const char *to_name,
2561 const struct nls_table *nls_codepage, int remap) 2517 struct cifs_sb_info *cifs_sb)
2562{ 2518{
2563 int rc = 0; 2519 int rc = 0;
2564 RENAME_REQ *pSMB = NULL; 2520 RENAME_REQ *pSMB = NULL;
@@ -2566,6 +2522,7 @@ CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon,
2566 int bytes_returned; 2522 int bytes_returned;
2567 int name_len, name_len2; 2523 int name_len, name_len2;
2568 __u16 count; 2524 __u16 count;
2525 int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
2569 2526
2570 cFYI(1, "In CIFSSMBRename"); 2527 cFYI(1, "In CIFSSMBRename");
2571renameRetry: 2528renameRetry:
@@ -2580,9 +2537,9 @@ renameRetry:
2580 ATTR_DIRECTORY); 2537 ATTR_DIRECTORY);
2581 2538
2582 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 2539 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
2583 name_len = 2540 name_len = cifsConvertToUTF16((__le16 *) pSMB->OldFileName,
2584 cifsConvertToUTF16((__le16 *) pSMB->OldFileName, fromName, 2541 from_name, PATH_MAX,
2585 PATH_MAX, nls_codepage, remap); 2542 cifs_sb->local_nls, remap);
2586 name_len++; /* trailing null */ 2543 name_len++; /* trailing null */
2587 name_len *= 2; 2544 name_len *= 2;
2588 pSMB->OldFileName[name_len] = 0x04; /* pad */ 2545 pSMB->OldFileName[name_len] = 0x04; /* pad */
@@ -2590,17 +2547,18 @@ renameRetry:
2590 pSMB->OldFileName[name_len + 1] = 0x00; 2547 pSMB->OldFileName[name_len + 1] = 0x00;
2591 name_len2 = 2548 name_len2 =
2592 cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2], 2549 cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
2593 toName, PATH_MAX, nls_codepage, remap); 2550 to_name, PATH_MAX, cifs_sb->local_nls,
2551 remap);
2594 name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; 2552 name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ;
2595 name_len2 *= 2; /* convert to bytes */ 2553 name_len2 *= 2; /* convert to bytes */
2596 } else { /* BB improve the check for buffer overruns BB */ 2554 } else { /* BB improve the check for buffer overruns BB */
2597 name_len = strnlen(fromName, PATH_MAX); 2555 name_len = strnlen(from_name, PATH_MAX);
2598 name_len++; /* trailing null */ 2556 name_len++; /* trailing null */
2599 strncpy(pSMB->OldFileName, fromName, name_len); 2557 strncpy(pSMB->OldFileName, from_name, name_len);
2600 name_len2 = strnlen(toName, PATH_MAX); 2558 name_len2 = strnlen(to_name, PATH_MAX);
2601 name_len2++; /* trailing null */ 2559 name_len2++; /* trailing null */
2602 pSMB->OldFileName[name_len] = 0x04; /* 2nd buffer format */ 2560 pSMB->OldFileName[name_len] = 0x04; /* 2nd buffer format */
2603 strncpy(&pSMB->OldFileName[name_len + 1], toName, name_len2); 2561 strncpy(&pSMB->OldFileName[name_len + 1], to_name, name_len2);
2604 name_len2++; /* trailing null */ 2562 name_len2++; /* trailing null */
2605 name_len2++; /* signature byte */ 2563 name_len2++; /* signature byte */
2606 } 2564 }
@@ -2948,8 +2906,8 @@ createHardLinkRetry:
2948 2906
2949int 2907int
2950CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon, 2908CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon,
2951 const char *fromName, const char *toName, 2909 const char *from_name, const char *to_name,
2952 const struct nls_table *nls_codepage, int remap) 2910 struct cifs_sb_info *cifs_sb)
2953{ 2911{
2954 int rc = 0; 2912 int rc = 0;
2955 NT_RENAME_REQ *pSMB = NULL; 2913 NT_RENAME_REQ *pSMB = NULL;
@@ -2957,6 +2915,7 @@ CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon,
2957 int bytes_returned; 2915 int bytes_returned;
2958 int name_len, name_len2; 2916 int name_len, name_len2;
2959 __u16 count; 2917 __u16 count;
2918 int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
2960 2919
2961 cFYI(1, "In CIFSCreateHardLink"); 2920 cFYI(1, "In CIFSCreateHardLink");
2962winCreateHardLinkRetry: 2921winCreateHardLinkRetry:
@@ -2976,8 +2935,8 @@ winCreateHardLinkRetry:
2976 2935
2977 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 2936 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
2978 name_len = 2937 name_len =
2979 cifsConvertToUTF16((__le16 *) pSMB->OldFileName, fromName, 2938 cifsConvertToUTF16((__le16 *) pSMB->OldFileName, from_name,
2980 PATH_MAX, nls_codepage, remap); 2939 PATH_MAX, cifs_sb->local_nls, remap);
2981 name_len++; /* trailing null */ 2940 name_len++; /* trailing null */
2982 name_len *= 2; 2941 name_len *= 2;
2983 2942
@@ -2986,17 +2945,18 @@ winCreateHardLinkRetry:
2986 pSMB->OldFileName[name_len + 1] = 0x00; /* pad */ 2945 pSMB->OldFileName[name_len + 1] = 0x00; /* pad */
2987 name_len2 = 2946 name_len2 =
2988 cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2], 2947 cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
2989 toName, PATH_MAX, nls_codepage, remap); 2948 to_name, PATH_MAX, cifs_sb->local_nls,
2949 remap);
2990 name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; 2950 name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ;
2991 name_len2 *= 2; /* convert to bytes */ 2951 name_len2 *= 2; /* convert to bytes */
2992 } else { /* BB improve the check for buffer overruns BB */ 2952 } else { /* BB improve the check for buffer overruns BB */
2993 name_len = strnlen(fromName, PATH_MAX); 2953 name_len = strnlen(from_name, PATH_MAX);
2994 name_len++; /* trailing null */ 2954 name_len++; /* trailing null */
2995 strncpy(pSMB->OldFileName, fromName, name_len); 2955 strncpy(pSMB->OldFileName, from_name, name_len);
2996 name_len2 = strnlen(toName, PATH_MAX); 2956 name_len2 = strnlen(to_name, PATH_MAX);
2997 name_len2++; /* trailing null */ 2957 name_len2++; /* trailing null */
2998 pSMB->OldFileName[name_len] = 0x04; /* 2nd buffer format */ 2958 pSMB->OldFileName[name_len] = 0x04; /* 2nd buffer format */
2999 strncpy(&pSMB->OldFileName[name_len + 1], toName, name_len2); 2959 strncpy(&pSMB->OldFileName[name_len + 1], to_name, name_len2);
3000 name_len2++; /* trailing null */ 2960 name_len2++; /* trailing null */
3001 name_len2++; /* signature byte */ 2961 name_len2++; /* signature byte */
3002 } 2962 }
@@ -4254,10 +4214,9 @@ UnixQPathInfoRetry:
4254/* xid, tcon, searchName and codepage are input parms, rest are returned */ 4214/* xid, tcon, searchName and codepage are input parms, rest are returned */
4255int 4215int
4256CIFSFindFirst(const unsigned int xid, struct cifs_tcon *tcon, 4216CIFSFindFirst(const unsigned int xid, struct cifs_tcon *tcon,
4257 const char *searchName, 4217 const char *searchName, struct cifs_sb_info *cifs_sb,
4258 const struct nls_table *nls_codepage,
4259 __u16 *pnetfid, __u16 search_flags, 4218 __u16 *pnetfid, __u16 search_flags,
4260 struct cifs_search_info *psrch_inf, int remap, const char dirsep) 4219 struct cifs_search_info *psrch_inf, bool msearch)
4261{ 4220{
4262/* level 257 SMB_ */ 4221/* level 257 SMB_ */
4263 TRANSACTION2_FFIRST_REQ *pSMB = NULL; 4222 TRANSACTION2_FFIRST_REQ *pSMB = NULL;
@@ -4265,8 +4224,9 @@ CIFSFindFirst(const unsigned int xid, struct cifs_tcon *tcon,
4265 T2_FFIRST_RSP_PARMS *parms; 4224 T2_FFIRST_RSP_PARMS *parms;
4266 int rc = 0; 4225 int rc = 0;
4267 int bytes_returned = 0; 4226 int bytes_returned = 0;
4268 int name_len; 4227 int name_len, remap;
4269 __u16 params, byte_count; 4228 __u16 params, byte_count;
4229 struct nls_table *nls_codepage;
4270 4230
4271 cFYI(1, "In FindFirst for %s", searchName); 4231 cFYI(1, "In FindFirst for %s", searchName);
4272 4232
@@ -4276,6 +4236,9 @@ findFirstRetry:
4276 if (rc) 4236 if (rc)
4277 return rc; 4237 return rc;
4278 4238
4239 nls_codepage = cifs_sb->local_nls;
4240 remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
4241
4279 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 4242 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
4280 name_len = 4243 name_len =
4281 cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName, 4244 cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
@@ -4284,24 +4247,29 @@ findFirstRetry:
4284 it got remapped to 0xF03A as if it were part of the 4247 it got remapped to 0xF03A as if it were part of the
4285 directory name instead of a wildcard */ 4248 directory name instead of a wildcard */
4286 name_len *= 2; 4249 name_len *= 2;
4287 pSMB->FileName[name_len] = dirsep; 4250 if (msearch) {
4288 pSMB->FileName[name_len+1] = 0; 4251 pSMB->FileName[name_len] = CIFS_DIR_SEP(cifs_sb);
4289 pSMB->FileName[name_len+2] = '*'; 4252 pSMB->FileName[name_len+1] = 0;
4290 pSMB->FileName[name_len+3] = 0; 4253 pSMB->FileName[name_len+2] = '*';
4291 name_len += 4; /* now the trailing null */ 4254 pSMB->FileName[name_len+3] = 0;
4292 pSMB->FileName[name_len] = 0; /* null terminate just in case */ 4255 name_len += 4; /* now the trailing null */
4293 pSMB->FileName[name_len+1] = 0; 4256 /* null terminate just in case */
4294 name_len += 2; 4257 pSMB->FileName[name_len] = 0;
4258 pSMB->FileName[name_len+1] = 0;
4259 name_len += 2;
4260 }
4295 } else { /* BB add check for overrun of SMB buf BB */ 4261 } else { /* BB add check for overrun of SMB buf BB */
4296 name_len = strnlen(searchName, PATH_MAX); 4262 name_len = strnlen(searchName, PATH_MAX);
4297/* BB fix here and in unicode clause above ie 4263/* BB fix here and in unicode clause above ie
4298 if (name_len > buffersize-header) 4264 if (name_len > buffersize-header)
4299 free buffer exit; BB */ 4265 free buffer exit; BB */
4300 strncpy(pSMB->FileName, searchName, name_len); 4266 strncpy(pSMB->FileName, searchName, name_len);
4301 pSMB->FileName[name_len] = dirsep; 4267 if (msearch) {
4302 pSMB->FileName[name_len+1] = '*'; 4268 pSMB->FileName[name_len] = CIFS_DIR_SEP(cifs_sb);
4303 pSMB->FileName[name_len+2] = 0; 4269 pSMB->FileName[name_len+1] = '*';
4304 name_len += 3; 4270 pSMB->FileName[name_len+2] = 0;
4271 name_len += 3;
4272 }
4305 } 4273 }
4306 4274
4307 params = 12 + name_len /* includes null */ ; 4275 params = 12 + name_len /* includes null */ ;
@@ -4389,7 +4357,8 @@ findFirstRetry:
4389 psrch_inf->last_entry = psrch_inf->srch_entries_start + 4357 psrch_inf->last_entry = psrch_inf->srch_entries_start +
4390 lnoff; 4358 lnoff;
4391 4359
4392 *pnetfid = parms->SearchHandle; 4360 if (pnetfid)
4361 *pnetfid = parms->SearchHandle;
4393 } else { 4362 } else {
4394 cifs_buf_release(pSMB); 4363 cifs_buf_release(pSMB);
4395 } 4364 }
@@ -5417,16 +5386,16 @@ QFSPosixRetry:
5417} 5386}
5418 5387
5419 5388
5420/* We can not use write of zero bytes trick to 5389/*
5421 set file size due to need for large file support. Also note that 5390 * We can not use write of zero bytes trick to set file size due to need for
5422 this SetPathInfo is preferred to SetFileInfo based method in next 5391 * large file support. Also note that this SetPathInfo is preferred to
5423 routine which is only needed to work around a sharing violation bug 5392 * SetFileInfo based method in next routine which is only needed to work around
5424 in Samba which this routine can run into */ 5393 * a sharing violation bugin Samba which this routine can run into.
5425 5394 */
5426int 5395int
5427CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon, 5396CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon,
5428 const char *fileName, __u64 size, bool SetAllocation, 5397 const char *file_name, __u64 size, struct cifs_sb_info *cifs_sb,
5429 const struct nls_table *nls_codepage, int remap) 5398 bool set_allocation)
5430{ 5399{
5431 struct smb_com_transaction2_spi_req *pSMB = NULL; 5400 struct smb_com_transaction2_spi_req *pSMB = NULL;
5432 struct smb_com_transaction2_spi_rsp *pSMBr = NULL; 5401 struct smb_com_transaction2_spi_rsp *pSMBr = NULL;
@@ -5434,6 +5403,8 @@ CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon,
5434 int name_len; 5403 int name_len;
5435 int rc = 0; 5404 int rc = 0;
5436 int bytes_returned = 0; 5405 int bytes_returned = 0;
5406 int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
5407
5437 __u16 params, byte_count, data_count, param_offset, offset; 5408 __u16 params, byte_count, data_count, param_offset, offset;
5438 5409
5439 cFYI(1, "In SetEOF"); 5410 cFYI(1, "In SetEOF");
@@ -5445,14 +5416,14 @@ SetEOFRetry:
5445 5416
5446 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 5417 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
5447 name_len = 5418 name_len =
5448 cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, 5419 cifsConvertToUTF16((__le16 *) pSMB->FileName, file_name,
5449 PATH_MAX, nls_codepage, remap); 5420 PATH_MAX, cifs_sb->local_nls, remap);
5450 name_len++; /* trailing null */ 5421 name_len++; /* trailing null */
5451 name_len *= 2; 5422 name_len *= 2;
5452 } else { /* BB improve the check for buffer overruns BB */ 5423 } else { /* BB improve the check for buffer overruns BB */
5453 name_len = strnlen(fileName, PATH_MAX); 5424 name_len = strnlen(file_name, PATH_MAX);
5454 name_len++; /* trailing null */ 5425 name_len++; /* trailing null */
5455 strncpy(pSMB->FileName, fileName, name_len); 5426 strncpy(pSMB->FileName, file_name, name_len);
5456 } 5427 }
5457 params = 6 + name_len; 5428 params = 6 + name_len;
5458 data_count = sizeof(struct file_end_of_file_info); 5429 data_count = sizeof(struct file_end_of_file_info);
@@ -5466,7 +5437,7 @@ SetEOFRetry:
5466 param_offset = offsetof(struct smb_com_transaction2_spi_req, 5437 param_offset = offsetof(struct smb_com_transaction2_spi_req,
5467 InformationLevel) - 4; 5438 InformationLevel) - 4;
5468 offset = param_offset + params; 5439 offset = param_offset + params;
5469 if (SetAllocation) { 5440 if (set_allocation) {
5470 if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) 5441 if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU)
5471 pSMB->InformationLevel = 5442 pSMB->InformationLevel =
5472 cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO2); 5443 cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO2);
@@ -5513,8 +5484,8 @@ SetEOFRetry:
5513} 5484}
5514 5485
5515int 5486int
5516CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon, __u64 size, 5487CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon,
5517 __u16 fid, __u32 pid_of_opener, bool SetAllocation) 5488 struct cifsFileInfo *cfile, __u64 size, bool set_allocation)
5518{ 5489{
5519 struct smb_com_transaction2_sfi_req *pSMB = NULL; 5490 struct smb_com_transaction2_sfi_req *pSMB = NULL;
5520 struct file_end_of_file_info *parm_data; 5491 struct file_end_of_file_info *parm_data;
@@ -5528,8 +5499,8 @@ CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon, __u64 size,
5528 if (rc) 5499 if (rc)
5529 return rc; 5500 return rc;
5530 5501
5531 pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener); 5502 pSMB->hdr.Pid = cpu_to_le16((__u16)cfile->pid);
5532 pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16)); 5503 pSMB->hdr.PidHigh = cpu_to_le16((__u16)(cfile->pid >> 16));
5533 5504
5534 params = 6; 5505 params = 6;
5535 pSMB->MaxSetupCount = 0; 5506 pSMB->MaxSetupCount = 0;
@@ -5558,8 +5529,8 @@ CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon, __u64 size,
5558 + offset); 5529 + offset);
5559 pSMB->DataOffset = cpu_to_le16(offset); 5530 pSMB->DataOffset = cpu_to_le16(offset);
5560 parm_data->FileSize = cpu_to_le64(size); 5531 parm_data->FileSize = cpu_to_le64(size);
5561 pSMB->Fid = fid; 5532 pSMB->Fid = cfile->fid.netfid;
5562 if (SetAllocation) { 5533 if (set_allocation) {
5563 if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) 5534 if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU)
5564 pSMB->InformationLevel = 5535 pSMB->InformationLevel =
5565 cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO2); 5536 cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO2);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 6df6fa14cba8..5c670b998ffb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -67,6 +67,7 @@ enum {
67 /* Mount options that take no arguments */ 67 /* Mount options that take no arguments */
68 Opt_user_xattr, Opt_nouser_xattr, 68 Opt_user_xattr, Opt_nouser_xattr,
69 Opt_forceuid, Opt_noforceuid, 69 Opt_forceuid, Opt_noforceuid,
70 Opt_forcegid, Opt_noforcegid,
70 Opt_noblocksend, Opt_noautotune, 71 Opt_noblocksend, Opt_noautotune,
71 Opt_hard, Opt_soft, Opt_perm, Opt_noperm, 72 Opt_hard, Opt_soft, Opt_perm, Opt_noperm,
72 Opt_mapchars, Opt_nomapchars, Opt_sfu, 73 Opt_mapchars, Opt_nomapchars, Opt_sfu,
@@ -82,8 +83,7 @@ enum {
82 Opt_serverino, Opt_noserverino, 83 Opt_serverino, Opt_noserverino,
83 Opt_rwpidforward, Opt_cifsacl, Opt_nocifsacl, 84 Opt_rwpidforward, Opt_cifsacl, Opt_nocifsacl,
84 Opt_acl, Opt_noacl, Opt_locallease, 85 Opt_acl, Opt_noacl, Opt_locallease,
85 Opt_sign, Opt_seal, Opt_direct, 86 Opt_sign, Opt_seal, Opt_noac,
86 Opt_strictcache, Opt_noac,
87 Opt_fsc, Opt_mfsymlinks, 87 Opt_fsc, Opt_mfsymlinks,
88 Opt_multiuser, Opt_sloppy, 88 Opt_multiuser, Opt_sloppy,
89 89
@@ -118,6 +118,8 @@ static const match_table_t cifs_mount_option_tokens = {
118 { Opt_nouser_xattr, "nouser_xattr" }, 118 { Opt_nouser_xattr, "nouser_xattr" },
119 { Opt_forceuid, "forceuid" }, 119 { Opt_forceuid, "forceuid" },
120 { Opt_noforceuid, "noforceuid" }, 120 { Opt_noforceuid, "noforceuid" },
121 { Opt_forcegid, "forcegid" },
122 { Opt_noforcegid, "noforcegid" },
121 { Opt_noblocksend, "noblocksend" }, 123 { Opt_noblocksend, "noblocksend" },
122 { Opt_noautotune, "noautotune" }, 124 { Opt_noautotune, "noautotune" },
123 { Opt_hard, "hard" }, 125 { Opt_hard, "hard" },
@@ -160,10 +162,6 @@ static const match_table_t cifs_mount_option_tokens = {
160 { Opt_locallease, "locallease" }, 162 { Opt_locallease, "locallease" },
161 { Opt_sign, "sign" }, 163 { Opt_sign, "sign" },
162 { Opt_seal, "seal" }, 164 { Opt_seal, "seal" },
163 { Opt_direct, "direct" },
164 { Opt_direct, "directio" },
165 { Opt_direct, "forcedirectio" },
166 { Opt_strictcache, "strictcache" },
167 { Opt_noac, "noac" }, 165 { Opt_noac, "noac" },
168 { Opt_fsc, "fsc" }, 166 { Opt_fsc, "fsc" },
169 { Opt_mfsymlinks, "mfsymlinks" }, 167 { Opt_mfsymlinks, "mfsymlinks" },
@@ -277,6 +275,7 @@ static const match_table_t cifs_cacheflavor_tokens = {
277static const match_table_t cifs_smb_version_tokens = { 275static const match_table_t cifs_smb_version_tokens = {
278 { Smb_1, SMB1_VERSION_STRING }, 276 { Smb_1, SMB1_VERSION_STRING },
279 { Smb_21, SMB21_VERSION_STRING }, 277 { Smb_21, SMB21_VERSION_STRING },
278 { Smb_30, SMB30_VERSION_STRING },
280}; 279};
281 280
282static int ip_connect(struct TCP_Server_Info *server); 281static int ip_connect(struct TCP_Server_Info *server);
@@ -819,6 +818,10 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
819 cifs_dump_mem("Bad SMB: ", buf, 818 cifs_dump_mem("Bad SMB: ", buf,
820 min_t(unsigned int, server->total_read, 48)); 819 min_t(unsigned int, server->total_read, 48));
821 820
821 if (server->ops->is_status_pending &&
822 server->ops->is_status_pending(buf, server, length))
823 return -1;
824
822 if (!mid) 825 if (!mid)
823 return length; 826 return length;
824 827
@@ -1075,6 +1078,10 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
1075 vol->ops = &smb21_operations; 1078 vol->ops = &smb21_operations;
1076 vol->vals = &smb21_values; 1079 vol->vals = &smb21_values;
1077 break; 1080 break;
1081 case Smb_30:
1082 vol->ops = &smb21_operations; /* currently identical with 2.1 */
1083 vol->vals = &smb30_values;
1084 break;
1078#endif 1085#endif
1079 default: 1086 default:
1080 cERROR(1, "Unknown vers= option specified: %s", value); 1087 cERROR(1, "Unknown vers= option specified: %s", value);
@@ -1101,8 +1108,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1101 char *string = NULL; 1108 char *string = NULL;
1102 char *tmp_end, *value; 1109 char *tmp_end, *value;
1103 char delim; 1110 char delim;
1104 bool cache_specified = false;
1105 static bool cache_warned = false;
1106 1111
1107 separator[0] = ','; 1112 separator[0] = ',';
1108 separator[1] = 0; 1113 separator[1] = 0;
@@ -1134,6 +1139,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1134 /* default to using server inode numbers where available */ 1139 /* default to using server inode numbers where available */
1135 vol->server_ino = 1; 1140 vol->server_ino = 1;
1136 1141
1142 /* default is to use strict cifs caching semantics */
1143 vol->strict_io = true;
1144
1137 vol->actimeo = CIFS_DEF_ACTIMEO; 1145 vol->actimeo = CIFS_DEF_ACTIMEO;
1138 1146
1139 /* FIXME: add autonegotiation -- for now, SMB1 is default */ 1147 /* FIXME: add autonegotiation -- for now, SMB1 is default */
@@ -1190,6 +1198,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1190 case Opt_noforceuid: 1198 case Opt_noforceuid:
1191 override_uid = 0; 1199 override_uid = 0;
1192 break; 1200 break;
1201 case Opt_forcegid:
1202 override_gid = 1;
1203 break;
1204 case Opt_noforcegid:
1205 override_gid = 0;
1206 break;
1193 case Opt_noblocksend: 1207 case Opt_noblocksend:
1194 vol->noblocksnd = 1; 1208 vol->noblocksnd = 1;
1195 break; 1209 break;
@@ -1317,22 +1331,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1317 */ 1331 */
1318 vol->seal = 1; 1332 vol->seal = 1;
1319 break; 1333 break;
1320 case Opt_direct:
1321 cache_specified = true;
1322 vol->direct_io = true;
1323 vol->strict_io = false;
1324 cERROR(1, "The \"directio\" option will be removed in "
1325 "3.7. Please switch to the \"cache=none\" "
1326 "option.");
1327 break;
1328 case Opt_strictcache:
1329 cache_specified = true;
1330 vol->direct_io = false;
1331 vol->strict_io = true;
1332 cERROR(1, "The \"strictcache\" option will be removed "
1333 "in 3.7. Please switch to the \"cache=strict\" "
1334 "option.");
1335 break;
1336 case Opt_noac: 1334 case Opt_noac:
1337 printk(KERN_WARNING "CIFS: Mount option noac not " 1335 printk(KERN_WARNING "CIFS: Mount option noac not "
1338 "supported. Instead set " 1336 "supported. Instead set "
@@ -1676,8 +1674,13 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1676 if (string == NULL) 1674 if (string == NULL)
1677 goto out_nomem; 1675 goto out_nomem;
1678 1676
1679 if (strnicmp(string, "TCP_NODELAY", 11) == 0) 1677 if (strnicmp(string, "TCP_NODELAY", 11) == 0) {
1678 printk(KERN_WARNING "CIFS: the "
1679 "sockopt=TCP_NODELAY option has been "
1680 "deprecated and will be removed "
1681 "in 3.9\n");
1680 vol->sockopt_tcp_nodelay = 1; 1682 vol->sockopt_tcp_nodelay = 1;
1683 }
1681 break; 1684 break;
1682 case Opt_netbiosname: 1685 case Opt_netbiosname:
1683 string = match_strdup(args); 1686 string = match_strdup(args);
@@ -1762,7 +1765,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1762 goto cifs_parse_mount_err; 1765 goto cifs_parse_mount_err;
1763 break; 1766 break;
1764 case Opt_cache: 1767 case Opt_cache:
1765 cache_specified = true;
1766 string = match_strdup(args); 1768 string = match_strdup(args);
1767 if (string == NULL) 1769 if (string == NULL)
1768 goto out_nomem; 1770 goto out_nomem;
@@ -1813,14 +1815,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1813 printk(KERN_NOTICE "CIFS: ignoring forcegid mount option " 1815 printk(KERN_NOTICE "CIFS: ignoring forcegid mount option "
1814 "specified with no gid= option.\n"); 1816 "specified with no gid= option.\n");
1815 1817
1816 /* FIXME: remove this block in 3.7 */
1817 if (!cache_specified && !cache_warned) {
1818 cache_warned = true;
1819 printk(KERN_NOTICE "CIFS: no cache= option specified, using "
1820 "\"cache=loose\". This default will change "
1821 "to \"cache=strict\" in 3.7.\n");
1822 }
1823
1824 kfree(mountdata_copy); 1818 kfree(mountdata_copy);
1825 return 0; 1819 return 0;
1826 1820
@@ -2636,6 +2630,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
2636 tcon->retry = volume_info->retry; 2630 tcon->retry = volume_info->retry;
2637 tcon->nocase = volume_info->nocase; 2631 tcon->nocase = volume_info->nocase;
2638 tcon->local_lease = volume_info->local_lease; 2632 tcon->local_lease = volume_info->local_lease;
2633 INIT_LIST_HEAD(&tcon->pending_opens);
2639 2634
2640 spin_lock(&cifs_tcp_ses_lock); 2635 spin_lock(&cifs_tcp_ses_lock);
2641 list_add(&tcon->tcon_list, &ses->tcon_list); 2636 list_add(&tcon->tcon_list, &ses->tcon_list);
@@ -3261,146 +3256,6 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
3261 "mount option supported"); 3256 "mount option supported");
3262} 3257}
3263 3258
3264/*
3265 * When the server supports very large reads and writes via POSIX extensions,
3266 * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not
3267 * including the RFC1001 length.
3268 *
3269 * Note that this might make for "interesting" allocation problems during
3270 * writeback however as we have to allocate an array of pointers for the
3271 * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096.
3272 *
3273 * For reads, there is a similar problem as we need to allocate an array
3274 * of kvecs to handle the receive, though that should only need to be done
3275 * once.
3276 */
3277#define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4)
3278#define CIFS_MAX_RSIZE ((1<<24) - sizeof(READ_RSP) + 4)
3279
3280/*
3281 * When the server doesn't allow large posix writes, only allow a rsize/wsize
3282 * of 2^17-1 minus the size of the call header. That allows for a read or
3283 * write up to the maximum size described by RFC1002.
3284 */
3285#define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4)
3286#define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4)
3287
3288/*
3289 * The default wsize is 1M. find_get_pages seems to return a maximum of 256
3290 * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill
3291 * a single wsize request with a single call.
3292 */
3293#define CIFS_DEFAULT_IOSIZE (1024 * 1024)
3294
3295/*
3296 * Windows only supports a max of 60kb reads and 65535 byte writes. Default to
3297 * those values when posix extensions aren't in force. In actuality here, we
3298 * use 65536 to allow for a write that is a multiple of 4k. Most servers seem
3299 * to be ok with the extra byte even though Windows doesn't send writes that
3300 * are that large.
3301 *
3302 * Citation:
3303 *
3304 * http://blogs.msdn.com/b/openspecification/archive/2009/04/10/smb-maximum-transmit-buffer-size-and-performance-tuning.aspx
3305 */
3306#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024)
3307#define CIFS_DEFAULT_NON_POSIX_WSIZE (65536)
3308
3309/*
3310 * On hosts with high memory, we can't currently support wsize/rsize that are
3311 * larger than we can kmap at once. Cap the rsize/wsize at
3312 * LAST_PKMAP * PAGE_SIZE. We'll never be able to fill a read or write request
3313 * larger than that anyway.
3314 */
3315#ifdef CONFIG_HIGHMEM
3316#define CIFS_KMAP_SIZE_LIMIT (LAST_PKMAP * PAGE_CACHE_SIZE)
3317#else /* CONFIG_HIGHMEM */
3318#define CIFS_KMAP_SIZE_LIMIT (1<<24)
3319#endif /* CONFIG_HIGHMEM */
3320
3321static unsigned int
3322cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
3323{
3324 __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
3325 struct TCP_Server_Info *server = tcon->ses->server;
3326 unsigned int wsize;
3327
3328 /* start with specified wsize, or default */
3329 if (pvolume_info->wsize)
3330 wsize = pvolume_info->wsize;
3331 else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
3332 wsize = CIFS_DEFAULT_IOSIZE;
3333 else
3334 wsize = CIFS_DEFAULT_NON_POSIX_WSIZE;
3335
3336 /* can server support 24-bit write sizes? (via UNIX extensions) */
3337 if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
3338 wsize = min_t(unsigned int, wsize, CIFS_MAX_RFC1002_WSIZE);
3339
3340 /*
3341 * no CAP_LARGE_WRITE_X or is signing enabled without CAP_UNIX set?
3342 * Limit it to max buffer offered by the server, minus the size of the
3343 * WRITEX header, not including the 4 byte RFC1001 length.
3344 */
3345 if (!(server->capabilities & CAP_LARGE_WRITE_X) ||
3346 (!(server->capabilities & CAP_UNIX) &&
3347 (server->sec_mode & (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED))))
3348 wsize = min_t(unsigned int, wsize,
3349 server->maxBuf - sizeof(WRITE_REQ) + 4);
3350
3351 /* limit to the amount that we can kmap at once */
3352 wsize = min_t(unsigned int, wsize, CIFS_KMAP_SIZE_LIMIT);
3353
3354 /* hard limit of CIFS_MAX_WSIZE */
3355 wsize = min_t(unsigned int, wsize, CIFS_MAX_WSIZE);
3356
3357 return wsize;
3358}
3359
3360static unsigned int
3361cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
3362{
3363 __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
3364 struct TCP_Server_Info *server = tcon->ses->server;
3365 unsigned int rsize, defsize;
3366
3367 /*
3368 * Set default value...
3369 *
3370 * HACK alert! Ancient servers have very small buffers. Even though
3371 * MS-CIFS indicates that servers are only limited by the client's
3372 * bufsize for reads, testing against win98se shows that it throws
3373 * INVALID_PARAMETER errors if you try to request too large a read.
3374 * OS/2 just sends back short reads.
3375 *
3376 * If the server doesn't advertise CAP_LARGE_READ_X, then assume that
3377 * it can't handle a read request larger than its MaxBufferSize either.
3378 */
3379 if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_READ_CAP))
3380 defsize = CIFS_DEFAULT_IOSIZE;
3381 else if (server->capabilities & CAP_LARGE_READ_X)
3382 defsize = CIFS_DEFAULT_NON_POSIX_RSIZE;
3383 else
3384 defsize = server->maxBuf - sizeof(READ_RSP);
3385
3386 rsize = pvolume_info->rsize ? pvolume_info->rsize : defsize;
3387
3388 /*
3389 * no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to
3390 * the client's MaxBufferSize.
3391 */
3392 if (!(server->capabilities & CAP_LARGE_READ_X))
3393 rsize = min_t(unsigned int, CIFSMaxBufSize, rsize);
3394
3395 /* limit to the amount that we can kmap at once */
3396 rsize = min_t(unsigned int, rsize, CIFS_KMAP_SIZE_LIMIT);
3397
3398 /* hard limit of CIFS_MAX_RSIZE */
3399 rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE);
3400
3401 return rsize;
3402}
3403
3404static void 3259static void
3405cleanup_volume_info_contents(struct smb_vol *volume_info) 3260cleanup_volume_info_contents(struct smb_vol *volume_info)
3406{ 3261{
@@ -3651,8 +3506,8 @@ try_mount_again:
3651 if (!tcon->ipc && server->ops->qfs_tcon) 3506 if (!tcon->ipc && server->ops->qfs_tcon)
3652 server->ops->qfs_tcon(xid, tcon); 3507 server->ops->qfs_tcon(xid, tcon);
3653 3508
3654 cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info); 3509 cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info);
3655 cifs_sb->rsize = cifs_negotiate_rsize(tcon, volume_info); 3510 cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info);
3656 3511
3657 /* tune readahead according to rsize */ 3512 /* tune readahead according to rsize */
3658 cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_CACHE_SIZE; 3513 cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_CACHE_SIZE;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 781025be48bc..7c0a81283645 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -160,17 +160,18 @@ check_name(struct dentry *direntry)
160static int 160static int
161cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, 161cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
162 struct tcon_link *tlink, unsigned oflags, umode_t mode, 162 struct tcon_link *tlink, unsigned oflags, umode_t mode,
163 __u32 *oplock, __u16 *fileHandle, int *created) 163 __u32 *oplock, struct cifs_fid *fid, int *created)
164{ 164{
165 int rc = -ENOENT; 165 int rc = -ENOENT;
166 int create_options = CREATE_NOT_DIR; 166 int create_options = CREATE_NOT_DIR;
167 int desiredAccess; 167 int desired_access;
168 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 168 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
169 struct cifs_tcon *tcon = tlink_tcon(tlink); 169 struct cifs_tcon *tcon = tlink_tcon(tlink);
170 char *full_path = NULL; 170 char *full_path = NULL;
171 FILE_ALL_INFO *buf = NULL; 171 FILE_ALL_INFO *buf = NULL;
172 struct inode *newinode = NULL; 172 struct inode *newinode = NULL;
173 int disposition; 173 int disposition;
174 struct TCP_Server_Info *server = tcon->ses->server;
174 175
175 *oplock = 0; 176 *oplock = 0;
176 if (tcon->ses->server->oplocks) 177 if (tcon->ses->server->oplocks)
@@ -185,8 +186,8 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
185 if (tcon->unix_ext && cap_unix(tcon->ses) && !tcon->broken_posix_open && 186 if (tcon->unix_ext && cap_unix(tcon->ses) && !tcon->broken_posix_open &&
186 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 187 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
187 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 188 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
188 rc = cifs_posix_open(full_path, &newinode, 189 rc = cifs_posix_open(full_path, &newinode, inode->i_sb, mode,
189 inode->i_sb, mode, oflags, oplock, fileHandle, xid); 190 oflags, oplock, &fid->netfid, xid);
190 switch (rc) { 191 switch (rc) {
191 case 0: 192 case 0:
192 if (newinode == NULL) { 193 if (newinode == NULL) {
@@ -202,7 +203,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
202 * close it and proceed as if it were a normal 203 * close it and proceed as if it were a normal
203 * lookup. 204 * lookup.
204 */ 205 */
205 CIFSSMBClose(xid, tcon, *fileHandle); 206 CIFSSMBClose(xid, tcon, fid->netfid);
206 goto cifs_create_get_file_info; 207 goto cifs_create_get_file_info;
207 } 208 }
208 /* success, no need to query */ 209 /* success, no need to query */
@@ -244,11 +245,11 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
244 */ 245 */
245 } 246 }
246 247
247 desiredAccess = 0; 248 desired_access = 0;
248 if (OPEN_FMODE(oflags) & FMODE_READ) 249 if (OPEN_FMODE(oflags) & FMODE_READ)
249 desiredAccess |= GENERIC_READ; /* is this too little? */ 250 desired_access |= GENERIC_READ; /* is this too little? */
250 if (OPEN_FMODE(oflags) & FMODE_WRITE) 251 if (OPEN_FMODE(oflags) & FMODE_WRITE)
251 desiredAccess |= GENERIC_WRITE; 252 desired_access |= GENERIC_WRITE;
252 253
253 disposition = FILE_OVERWRITE_IF; 254 disposition = FILE_OVERWRITE_IF;
254 if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) 255 if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
@@ -260,8 +261,15 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
260 else 261 else
261 cFYI(1, "Create flag not set in create function"); 262 cFYI(1, "Create flag not set in create function");
262 263
263 /* BB add processing to set equivalent of mode - e.g. via CreateX with 264 /*
264 ACLs */ 265 * BB add processing to set equivalent of mode - e.g. via CreateX with
266 * ACLs
267 */
268
269 if (!server->ops->open) {
270 rc = -ENOSYS;
271 goto out;
272 }
265 273
266 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); 274 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
267 if (buf == NULL) { 275 if (buf == NULL) {
@@ -279,28 +287,18 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
279 if (backup_cred(cifs_sb)) 287 if (backup_cred(cifs_sb))
280 create_options |= CREATE_OPEN_BACKUP_INTENT; 288 create_options |= CREATE_OPEN_BACKUP_INTENT;
281 289
282 if (tcon->ses->capabilities & CAP_NT_SMBS) 290 rc = server->ops->open(xid, tcon, full_path, disposition,
283 rc = CIFSSMBOpen(xid, tcon, full_path, disposition, 291 desired_access, create_options, fid, oplock,
284 desiredAccess, create_options, 292 buf, cifs_sb);
285 fileHandle, oplock, buf, cifs_sb->local_nls,
286 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
287 else
288 rc = -EIO; /* no NT SMB support fall into legacy open below */
289
290 if (rc == -EIO) {
291 /* old server, retry the open legacy style */
292 rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
293 desiredAccess, create_options,
294 fileHandle, oplock, buf, cifs_sb->local_nls,
295 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
296 }
297 if (rc) { 293 if (rc) {
298 cFYI(1, "cifs_create returned 0x%x", rc); 294 cFYI(1, "cifs_create returned 0x%x", rc);
299 goto out; 295 goto out;
300 } 296 }
301 297
302 /* If Open reported that we actually created a file 298 /*
303 then we now have to set the mode if possible */ 299 * If Open reported that we actually created a file then we now have to
300 * set the mode if possible.
301 */
304 if ((tcon->unix_ext) && (*oplock & CIFS_CREATE_ACTION)) { 302 if ((tcon->unix_ext) && (*oplock & CIFS_CREATE_ACTION)) {
305 struct cifs_unix_set_info_args args = { 303 struct cifs_unix_set_info_args args = {
306 .mode = mode, 304 .mode = mode,
@@ -321,11 +319,13 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
321 args.uid = NO_CHANGE_64; 319 args.uid = NO_CHANGE_64;
322 args.gid = NO_CHANGE_64; 320 args.gid = NO_CHANGE_64;
323 } 321 }
324 CIFSSMBUnixSetFileInfo(xid, tcon, &args, *fileHandle, 322 CIFSSMBUnixSetFileInfo(xid, tcon, &args, fid->netfid,
325 current->tgid); 323 current->tgid);
326 } else { 324 } else {
327 /* BB implement mode setting via Windows security 325 /*
328 descriptors e.g. */ 326 * BB implement mode setting via Windows security
327 * descriptors e.g.
328 */
329 /* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/ 329 /* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/
330 330
331 /* Could set r/o dos attribute if mode & 0222 == 0 */ 331 /* Could set r/o dos attribute if mode & 0222 == 0 */
@@ -334,12 +334,14 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
334cifs_create_get_file_info: 334cifs_create_get_file_info:
335 /* server might mask mode so we have to query for it */ 335 /* server might mask mode so we have to query for it */
336 if (tcon->unix_ext) 336 if (tcon->unix_ext)
337 rc = cifs_get_inode_info_unix(&newinode, full_path, 337 rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb,
338 inode->i_sb, xid); 338 xid);
339 else { 339 else {
340 rc = cifs_get_inode_info(&newinode, full_path, buf, 340 rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb,
341 inode->i_sb, xid, fileHandle); 341 xid, &fid->netfid);
342 if (newinode) { 342 if (newinode) {
343 if (server->ops->set_lease_key)
344 server->ops->set_lease_key(newinode, fid);
343 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) 345 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
344 newinode->i_mode = mode; 346 newinode->i_mode = mode;
345 if ((*oplock & CIFS_CREATE_ACTION) && 347 if ((*oplock & CIFS_CREATE_ACTION) &&
@@ -356,7 +358,8 @@ cifs_create_get_file_info:
356cifs_create_set_dentry: 358cifs_create_set_dentry:
357 if (rc != 0) { 359 if (rc != 0) {
358 cFYI(1, "Create worked, get_inode_info failed rc = %d", rc); 360 cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
359 CIFSSMBClose(xid, tcon, *fileHandle); 361 if (server->ops->close)
362 server->ops->close(xid, tcon, fid);
360 goto out; 363 goto out;
361 } 364 }
362 d_drop(direntry); 365 d_drop(direntry);
@@ -377,11 +380,14 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
377 unsigned int xid; 380 unsigned int xid;
378 struct tcon_link *tlink; 381 struct tcon_link *tlink;
379 struct cifs_tcon *tcon; 382 struct cifs_tcon *tcon;
380 __u16 fileHandle; 383 struct TCP_Server_Info *server;
384 struct cifs_fid fid;
385 struct cifs_pending_open open;
381 __u32 oplock; 386 __u32 oplock;
382 struct cifsFileInfo *pfile_info; 387 struct cifsFileInfo *file_info;
383 388
384 /* Posix open is only called (at lookup time) for file create now. For 389 /*
390 * Posix open is only called (at lookup time) for file create now. For
385 * opens (rather than creates), because we do not know if it is a file 391 * opens (rather than creates), because we do not know if it is a file
386 * or directory yet, and current Samba no longer allows us to do posix 392 * or directory yet, and current Samba no longer allows us to do posix
387 * open on dirs, we could end up wasting an open call on what turns out 393 * open on dirs, we could end up wasting an open call on what turns out
@@ -413,22 +419,34 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
413 goto out_free_xid; 419 goto out_free_xid;
414 420
415 tcon = tlink_tcon(tlink); 421 tcon = tlink_tcon(tlink);
422 server = tcon->ses->server;
423
424 if (server->ops->new_lease_key)
425 server->ops->new_lease_key(&fid);
426
427 cifs_add_pending_open(&fid, tlink, &open);
416 428
417 rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, 429 rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode,
418 &oplock, &fileHandle, opened); 430 &oplock, &fid, opened);
419 431
420 if (rc) 432 if (rc) {
433 cifs_del_pending_open(&open);
421 goto out; 434 goto out;
435 }
422 436
423 rc = finish_open(file, direntry, generic_file_open, opened); 437 rc = finish_open(file, direntry, generic_file_open, opened);
424 if (rc) { 438 if (rc) {
425 CIFSSMBClose(xid, tcon, fileHandle); 439 if (server->ops->close)
440 server->ops->close(xid, tcon, &fid);
441 cifs_del_pending_open(&open);
426 goto out; 442 goto out;
427 } 443 }
428 444
429 pfile_info = cifs_new_fileinfo(fileHandle, file, tlink, oplock); 445 file_info = cifs_new_fileinfo(&fid, file, tlink, oplock);
430 if (pfile_info == NULL) { 446 if (file_info == NULL) {
431 CIFSSMBClose(xid, tcon, fileHandle); 447 if (server->ops->close)
448 server->ops->close(xid, tcon, &fid);
449 cifs_del_pending_open(&open);
432 rc = -ENOMEM; 450 rc = -ENOMEM;
433 } 451 }
434 452
@@ -453,7 +471,9 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
453 */ 471 */
454 unsigned oflags = O_EXCL | O_CREAT | O_RDWR; 472 unsigned oflags = O_EXCL | O_CREAT | O_RDWR;
455 struct tcon_link *tlink; 473 struct tcon_link *tlink;
456 __u16 fileHandle; 474 struct cifs_tcon *tcon;
475 struct TCP_Server_Info *server;
476 struct cifs_fid fid;
457 __u32 oplock; 477 __u32 oplock;
458 int created = FILE_CREATED; 478 int created = FILE_CREATED;
459 479
@@ -465,10 +485,16 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
465 if (IS_ERR(tlink)) 485 if (IS_ERR(tlink))
466 goto out_free_xid; 486 goto out_free_xid;
467 487
488 tcon = tlink_tcon(tlink);
489 server = tcon->ses->server;
490
491 if (server->ops->new_lease_key)
492 server->ops->new_lease_key(&fid);
493
468 rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, 494 rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode,
469 &oplock, &fileHandle, &created); 495 &oplock, &fid, &created);
470 if (!rc) 496 if (!rc && server->ops->close)
471 CIFSSMBClose(xid, tlink_tcon(tlink), fileHandle); 497 server->ops->close(xid, tcon, &fid);
472 498
473 cifs_put_tlink(tlink); 499 cifs_put_tlink(tlink);
474out_free_xid: 500out_free_xid:
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 9154192b0683..edb25b4bbb95 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -169,16 +169,20 @@ posix_open_ret:
169 169
170static int 170static int
171cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, 171cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
172 struct cifs_tcon *tcon, unsigned int f_flags, __u32 *poplock, 172 struct cifs_tcon *tcon, unsigned int f_flags, __u32 *oplock,
173 __u16 *pnetfid, unsigned int xid) 173 struct cifs_fid *fid, unsigned int xid)
174{ 174{
175 int rc; 175 int rc;
176 int desiredAccess; 176 int desired_access;
177 int disposition; 177 int disposition;
178 int create_options = CREATE_NOT_DIR; 178 int create_options = CREATE_NOT_DIR;
179 FILE_ALL_INFO *buf; 179 FILE_ALL_INFO *buf;
180 struct TCP_Server_Info *server = tcon->ses->server;
181
182 if (!server->ops->open)
183 return -ENOSYS;
180 184
181 desiredAccess = cifs_convert_flags(f_flags); 185 desired_access = cifs_convert_flags(f_flags);
182 186
183/********************************************************************* 187/*********************************************************************
184 * open flag mapping table: 188 * open flag mapping table:
@@ -215,16 +219,9 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
215 if (backup_cred(cifs_sb)) 219 if (backup_cred(cifs_sb))
216 create_options |= CREATE_OPEN_BACKUP_INTENT; 220 create_options |= CREATE_OPEN_BACKUP_INTENT;
217 221
218 if (tcon->ses->capabilities & CAP_NT_SMBS) 222 rc = server->ops->open(xid, tcon, full_path, disposition,
219 rc = CIFSSMBOpen(xid, tcon, full_path, disposition, 223 desired_access, create_options, fid, oplock, buf,
220 desiredAccess, create_options, pnetfid, poplock, buf, 224 cifs_sb);
221 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
222 & CIFS_MOUNT_MAP_SPECIAL_CHR);
223 else
224 rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
225 desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf,
226 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
227 & CIFS_MOUNT_MAP_SPECIAL_CHR);
228 225
229 if (rc) 226 if (rc)
230 goto out; 227 goto out;
@@ -234,7 +231,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
234 xid); 231 xid);
235 else 232 else
236 rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, 233 rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
237 xid, pnetfid); 234 xid, &fid->netfid);
238 235
239out: 236out:
240 kfree(buf); 237 kfree(buf);
@@ -242,48 +239,62 @@ out:
242} 239}
243 240
244struct cifsFileInfo * 241struct cifsFileInfo *
245cifs_new_fileinfo(__u16 fileHandle, struct file *file, 242cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
246 struct tcon_link *tlink, __u32 oplock) 243 struct tcon_link *tlink, __u32 oplock)
247{ 244{
248 struct dentry *dentry = file->f_path.dentry; 245 struct dentry *dentry = file->f_path.dentry;
249 struct inode *inode = dentry->d_inode; 246 struct inode *inode = dentry->d_inode;
250 struct cifsInodeInfo *pCifsInode = CIFS_I(inode); 247 struct cifsInodeInfo *cinode = CIFS_I(inode);
251 struct cifsFileInfo *pCifsFile; 248 struct cifsFileInfo *cfile;
252 249 struct cifs_fid_locks *fdlocks;
253 pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); 250 struct cifs_tcon *tcon = tlink_tcon(tlink);
254 if (pCifsFile == NULL) 251
255 return pCifsFile; 252 cfile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
256 253 if (cfile == NULL)
257 pCifsFile->count = 1; 254 return cfile;
258 pCifsFile->netfid = fileHandle; 255
259 pCifsFile->pid = current->tgid; 256 fdlocks = kzalloc(sizeof(struct cifs_fid_locks), GFP_KERNEL);
260 pCifsFile->uid = current_fsuid(); 257 if (!fdlocks) {
261 pCifsFile->dentry = dget(dentry); 258 kfree(cfile);
262 pCifsFile->f_flags = file->f_flags; 259 return NULL;
263 pCifsFile->invalidHandle = false; 260 }
264 pCifsFile->tlink = cifs_get_tlink(tlink); 261
265 mutex_init(&pCifsFile->fh_mutex); 262 INIT_LIST_HEAD(&fdlocks->locks);
266 INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break); 263 fdlocks->cfile = cfile;
267 INIT_LIST_HEAD(&pCifsFile->llist); 264 cfile->llist = fdlocks;
265 down_write(&cinode->lock_sem);
266 list_add(&fdlocks->llist, &cinode->llist);
267 up_write(&cinode->lock_sem);
268
269 cfile->count = 1;
270 cfile->pid = current->tgid;
271 cfile->uid = current_fsuid();
272 cfile->dentry = dget(dentry);
273 cfile->f_flags = file->f_flags;
274 cfile->invalidHandle = false;
275 cfile->tlink = cifs_get_tlink(tlink);
276 INIT_WORK(&cfile->oplock_break, cifs_oplock_break);
277 mutex_init(&cfile->fh_mutex);
268 278
269 spin_lock(&cifs_file_list_lock); 279 spin_lock(&cifs_file_list_lock);
270 list_add(&pCifsFile->tlist, &(tlink_tcon(tlink)->openFileList)); 280 if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE)
281 oplock = fid->pending_open->oplock;
282 list_del(&fid->pending_open->olist);
283
284 tlink_tcon(tlink)->ses->server->ops->set_fid(cfile, fid, oplock);
285
286 list_add(&cfile->tlist, &tcon->openFileList);
271 /* if readable file instance put first in list*/ 287 /* if readable file instance put first in list*/
272 if (file->f_mode & FMODE_READ) 288 if (file->f_mode & FMODE_READ)
273 list_add(&pCifsFile->flist, &pCifsInode->openFileList); 289 list_add(&cfile->flist, &cinode->openFileList);
274 else 290 else
275 list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList); 291 list_add_tail(&cfile->flist, &cinode->openFileList);
276 spin_unlock(&cifs_file_list_lock); 292 spin_unlock(&cifs_file_list_lock);
277 293
278 cifs_set_oplock_level(pCifsInode, oplock); 294 file->private_data = cfile;
279 pCifsInode->can_cache_brlcks = pCifsInode->clientCanCacheAll; 295 return cfile;
280
281 file->private_data = pCifsFile;
282 return pCifsFile;
283} 296}
284 297
285static void cifs_del_lock_waiters(struct cifsLockInfo *lock);
286
287struct cifsFileInfo * 298struct cifsFileInfo *
288cifsFileInfo_get(struct cifsFileInfo *cifs_file) 299cifsFileInfo_get(struct cifsFileInfo *cifs_file)
289{ 300{
@@ -302,9 +313,12 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
302{ 313{
303 struct inode *inode = cifs_file->dentry->d_inode; 314 struct inode *inode = cifs_file->dentry->d_inode;
304 struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); 315 struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink);
316 struct TCP_Server_Info *server = tcon->ses->server;
305 struct cifsInodeInfo *cifsi = CIFS_I(inode); 317 struct cifsInodeInfo *cifsi = CIFS_I(inode);
306 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 318 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
307 struct cifsLockInfo *li, *tmp; 319 struct cifsLockInfo *li, *tmp;
320 struct cifs_fid fid;
321 struct cifs_pending_open open;
308 322
309 spin_lock(&cifs_file_list_lock); 323 spin_lock(&cifs_file_list_lock);
310 if (--cifs_file->count > 0) { 324 if (--cifs_file->count > 0) {
@@ -312,6 +326,12 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
312 return; 326 return;
313 } 327 }
314 328
329 if (server->ops->get_lease_key)
330 server->ops->get_lease_key(inode, &fid);
331
332 /* store open in pending opens to make sure we don't miss lease break */
333 cifs_add_pending_open_locked(&fid, cifs_file->tlink, &open);
334
315 /* remove it from the lists */ 335 /* remove it from the lists */
316 list_del(&cifs_file->flist); 336 list_del(&cifs_file->flist);
317 list_del(&cifs_file->tlist); 337 list_del(&cifs_file->tlist);
@@ -319,13 +339,13 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
319 if (list_empty(&cifsi->openFileList)) { 339 if (list_empty(&cifsi->openFileList)) {
320 cFYI(1, "closing last open instance for inode %p", 340 cFYI(1, "closing last open instance for inode %p",
321 cifs_file->dentry->d_inode); 341 cifs_file->dentry->d_inode);
322 342 /*
323 /* in strict cache mode we need invalidate mapping on the last 343 * In strict cache mode we need invalidate mapping on the last
324 close because it may cause a error when we open this file 344 * close because it may cause a error when we open this file
325 again and get at least level II oplock */ 345 * again and get at least level II oplock.
346 */
326 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) 347 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
327 CIFS_I(inode)->invalid_mapping = true; 348 CIFS_I(inode)->invalid_mapping = true;
328
329 cifs_set_oplock_level(cifsi, 0); 349 cifs_set_oplock_level(cifsi, 0);
330 } 350 }
331 spin_unlock(&cifs_file_list_lock); 351 spin_unlock(&cifs_file_list_lock);
@@ -333,23 +353,30 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
333 cancel_work_sync(&cifs_file->oplock_break); 353 cancel_work_sync(&cifs_file->oplock_break);
334 354
335 if (!tcon->need_reconnect && !cifs_file->invalidHandle) { 355 if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
356 struct TCP_Server_Info *server = tcon->ses->server;
336 unsigned int xid; 357 unsigned int xid;
337 int rc; 358
338 xid = get_xid(); 359 xid = get_xid();
339 rc = CIFSSMBClose(xid, tcon, cifs_file->netfid); 360 if (server->ops->close)
340 free_xid(xid); 361 server->ops->close(xid, tcon, &cifs_file->fid);
362 _free_xid(xid);
341 } 363 }
342 364
343 /* Delete any outstanding lock records. We'll lose them when the file 365 cifs_del_pending_open(&open);
366
367 /*
368 * Delete any outstanding lock records. We'll lose them when the file
344 * is closed anyway. 369 * is closed anyway.
345 */ 370 */
346 mutex_lock(&cifsi->lock_mutex); 371 down_write(&cifsi->lock_sem);
347 list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) { 372 list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) {
348 list_del(&li->llist); 373 list_del(&li->llist);
349 cifs_del_lock_waiters(li); 374 cifs_del_lock_waiters(li);
350 kfree(li); 375 kfree(li);
351 } 376 }
352 mutex_unlock(&cifsi->lock_mutex); 377 list_del(&cifs_file->llist->llist);
378 kfree(cifs_file->llist);
379 up_write(&cifsi->lock_sem);
353 380
354 cifs_put_tlink(cifs_file->tlink); 381 cifs_put_tlink(cifs_file->tlink);
355 dput(cifs_file->dentry); 382 dput(cifs_file->dentry);
@@ -357,17 +384,20 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
357} 384}
358 385
359int cifs_open(struct inode *inode, struct file *file) 386int cifs_open(struct inode *inode, struct file *file)
387
360{ 388{
361 int rc = -EACCES; 389 int rc = -EACCES;
362 unsigned int xid; 390 unsigned int xid;
363 __u32 oplock; 391 __u32 oplock;
364 struct cifs_sb_info *cifs_sb; 392 struct cifs_sb_info *cifs_sb;
393 struct TCP_Server_Info *server;
365 struct cifs_tcon *tcon; 394 struct cifs_tcon *tcon;
366 struct tcon_link *tlink; 395 struct tcon_link *tlink;
367 struct cifsFileInfo *pCifsFile = NULL; 396 struct cifsFileInfo *cfile = NULL;
368 char *full_path = NULL; 397 char *full_path = NULL;
369 bool posix_open_ok = false; 398 bool posix_open_ok = false;
370 __u16 netfid; 399 struct cifs_fid fid;
400 struct cifs_pending_open open;
371 401
372 xid = get_xid(); 402 xid = get_xid();
373 403
@@ -378,6 +408,7 @@ int cifs_open(struct inode *inode, struct file *file)
378 return PTR_ERR(tlink); 408 return PTR_ERR(tlink);
379 } 409 }
380 tcon = tlink_tcon(tlink); 410 tcon = tlink_tcon(tlink);
411 server = tcon->ses->server;
381 412
382 full_path = build_path_from_dentry(file->f_path.dentry); 413 full_path = build_path_from_dentry(file->f_path.dentry);
383 if (full_path == NULL) { 414 if (full_path == NULL) {
@@ -388,7 +419,7 @@ int cifs_open(struct inode *inode, struct file *file)
388 cFYI(1, "inode = 0x%p file flags are 0x%x for %s", 419 cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
389 inode, file->f_flags, full_path); 420 inode, file->f_flags, full_path);
390 421
391 if (tcon->ses->server->oplocks) 422 if (server->oplocks)
392 oplock = REQ_OPLOCK; 423 oplock = REQ_OPLOCK;
393 else 424 else
394 oplock = 0; 425 oplock = 0;
@@ -399,7 +430,7 @@ int cifs_open(struct inode *inode, struct file *file)
399 /* can not refresh inode info since size could be stale */ 430 /* can not refresh inode info since size could be stale */
400 rc = cifs_posix_open(full_path, &inode, inode->i_sb, 431 rc = cifs_posix_open(full_path, &inode, inode->i_sb,
401 cifs_sb->mnt_file_mode /* ignored */, 432 cifs_sb->mnt_file_mode /* ignored */,
402 file->f_flags, &oplock, &netfid, xid); 433 file->f_flags, &oplock, &fid.netfid, xid);
403 if (rc == 0) { 434 if (rc == 0) {
404 cFYI(1, "posix open succeeded"); 435 cFYI(1, "posix open succeeded");
405 posix_open_ok = true; 436 posix_open_ok = true;
@@ -415,20 +446,34 @@ int cifs_open(struct inode *inode, struct file *file)
415 } else if ((rc != -EIO) && (rc != -EREMOTE) && 446 } else if ((rc != -EIO) && (rc != -EREMOTE) &&
416 (rc != -EOPNOTSUPP)) /* path not found or net err */ 447 (rc != -EOPNOTSUPP)) /* path not found or net err */
417 goto out; 448 goto out;
418 /* else fallthrough to retry open the old way on network i/o 449 /*
419 or DFS errors */ 450 * Else fallthrough to retry open the old way on network i/o
451 * or DFS errors.
452 */
420 } 453 }
421 454
455 if (server->ops->get_lease_key)
456 server->ops->get_lease_key(inode, &fid);
457
458 cifs_add_pending_open(&fid, tlink, &open);
459
422 if (!posix_open_ok) { 460 if (!posix_open_ok) {
461 if (server->ops->get_lease_key)
462 server->ops->get_lease_key(inode, &fid);
463
423 rc = cifs_nt_open(full_path, inode, cifs_sb, tcon, 464 rc = cifs_nt_open(full_path, inode, cifs_sb, tcon,
424 file->f_flags, &oplock, &netfid, xid); 465 file->f_flags, &oplock, &fid, xid);
425 if (rc) 466 if (rc) {
467 cifs_del_pending_open(&open);
426 goto out; 468 goto out;
469 }
427 } 470 }
428 471
429 pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock); 472 cfile = cifs_new_fileinfo(&fid, file, tlink, oplock);
430 if (pCifsFile == NULL) { 473 if (cfile == NULL) {
431 CIFSSMBClose(xid, tcon, netfid); 474 if (server->ops->close)
475 server->ops->close(xid, tcon, &fid);
476 cifs_del_pending_open(&open);
432 rc = -ENOMEM; 477 rc = -ENOMEM;
433 goto out; 478 goto out;
434 } 479 }
@@ -436,8 +481,10 @@ int cifs_open(struct inode *inode, struct file *file)
436 cifs_fscache_set_inode_cookie(inode, file); 481 cifs_fscache_set_inode_cookie(inode, file);
437 482
438 if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) { 483 if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) {
439 /* time to set mode which we can not set earlier due to 484 /*
440 problems creating new read-only files */ 485 * Time to set mode which we can not set earlier due to
486 * problems creating new read-only files.
487 */
441 struct cifs_unix_set_info_args args = { 488 struct cifs_unix_set_info_args args = {
442 .mode = inode->i_mode, 489 .mode = inode->i_mode,
443 .uid = NO_CHANGE_64, 490 .uid = NO_CHANGE_64,
@@ -447,8 +494,8 @@ int cifs_open(struct inode *inode, struct file *file)
447 .mtime = NO_CHANGE_64, 494 .mtime = NO_CHANGE_64,
448 .device = 0, 495 .device = 0,
449 }; 496 };
450 CIFSSMBUnixSetFileInfo(xid, tcon, &args, netfid, 497 CIFSSMBUnixSetFileInfo(xid, tcon, &args, fid.netfid,
451 pCifsFile->pid); 498 cfile->pid);
452 } 499 }
453 500
454out: 501out:
@@ -458,59 +505,66 @@ out:
458 return rc; 505 return rc;
459} 506}
460 507
461/* Try to reacquire byte range locks that were released when session */ 508/*
462/* to server was lost */ 509 * Try to reacquire byte range locks that were released when session
510 * to server was lost
511 */
463static int cifs_relock_file(struct cifsFileInfo *cifsFile) 512static int cifs_relock_file(struct cifsFileInfo *cifsFile)
464{ 513{
465 int rc = 0; 514 int rc = 0;
466 515
467/* BB list all locks open on this file and relock */ 516 /* BB list all locks open on this file and relock */
468 517
469 return rc; 518 return rc;
470} 519}
471 520
472static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) 521static int
522cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
473{ 523{
474 int rc = -EACCES; 524 int rc = -EACCES;
475 unsigned int xid; 525 unsigned int xid;
476 __u32 oplock; 526 __u32 oplock;
477 struct cifs_sb_info *cifs_sb; 527 struct cifs_sb_info *cifs_sb;
478 struct cifs_tcon *tcon; 528 struct cifs_tcon *tcon;
479 struct cifsInodeInfo *pCifsInode; 529 struct TCP_Server_Info *server;
530 struct cifsInodeInfo *cinode;
480 struct inode *inode; 531 struct inode *inode;
481 char *full_path = NULL; 532 char *full_path = NULL;
482 int desiredAccess; 533 int desired_access;
483 int disposition = FILE_OPEN; 534 int disposition = FILE_OPEN;
484 int create_options = CREATE_NOT_DIR; 535 int create_options = CREATE_NOT_DIR;
485 __u16 netfid; 536 struct cifs_fid fid;
486 537
487 xid = get_xid(); 538 xid = get_xid();
488 mutex_lock(&pCifsFile->fh_mutex); 539 mutex_lock(&cfile->fh_mutex);
489 if (!pCifsFile->invalidHandle) { 540 if (!cfile->invalidHandle) {
490 mutex_unlock(&pCifsFile->fh_mutex); 541 mutex_unlock(&cfile->fh_mutex);
491 rc = 0; 542 rc = 0;
492 free_xid(xid); 543 free_xid(xid);
493 return rc; 544 return rc;
494 } 545 }
495 546
496 inode = pCifsFile->dentry->d_inode; 547 inode = cfile->dentry->d_inode;
497 cifs_sb = CIFS_SB(inode->i_sb); 548 cifs_sb = CIFS_SB(inode->i_sb);
498 tcon = tlink_tcon(pCifsFile->tlink); 549 tcon = tlink_tcon(cfile->tlink);
550 server = tcon->ses->server;
499 551
500/* can not grab rename sem here because various ops, including 552 /*
501 those that already have the rename sem can end up causing writepage 553 * Can not grab rename sem here because various ops, including those
502 to get called and if the server was down that means we end up here, 554 * that already have the rename sem can end up causing writepage to get
503 and we can never tell if the caller already has the rename_sem */ 555 * called and if the server was down that means we end up here, and we
504 full_path = build_path_from_dentry(pCifsFile->dentry); 556 * can never tell if the caller already has the rename_sem.
557 */
558 full_path = build_path_from_dentry(cfile->dentry);
505 if (full_path == NULL) { 559 if (full_path == NULL) {
506 rc = -ENOMEM; 560 rc = -ENOMEM;
507 mutex_unlock(&pCifsFile->fh_mutex); 561 mutex_unlock(&cfile->fh_mutex);
508 free_xid(xid); 562 free_xid(xid);
509 return rc; 563 return rc;
510 } 564 }
511 565
512 cFYI(1, "inode = 0x%p file flags 0x%x for %s", 566 cFYI(1, "inode = 0x%p file flags 0x%x for %s", inode, cfile->f_flags,
513 inode, pCifsFile->f_flags, full_path); 567 full_path);
514 568
515 if (tcon->ses->server->oplocks) 569 if (tcon->ses->server->oplocks)
516 oplock = REQ_OPLOCK; 570 oplock = REQ_OPLOCK;
@@ -524,69 +578,72 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
524 * O_CREAT, O_EXCL and O_TRUNC already had their effect on the 578 * O_CREAT, O_EXCL and O_TRUNC already had their effect on the
525 * original open. Must mask them off for a reopen. 579 * original open. Must mask them off for a reopen.
526 */ 580 */
527 unsigned int oflags = pCifsFile->f_flags & 581 unsigned int oflags = cfile->f_flags &
528 ~(O_CREAT | O_EXCL | O_TRUNC); 582 ~(O_CREAT | O_EXCL | O_TRUNC);
529 583
530 rc = cifs_posix_open(full_path, NULL, inode->i_sb, 584 rc = cifs_posix_open(full_path, NULL, inode->i_sb,
531 cifs_sb->mnt_file_mode /* ignored */, 585 cifs_sb->mnt_file_mode /* ignored */,
532 oflags, &oplock, &netfid, xid); 586 oflags, &oplock, &fid.netfid, xid);
533 if (rc == 0) { 587 if (rc == 0) {
534 cFYI(1, "posix reopen succeeded"); 588 cFYI(1, "posix reopen succeeded");
535 goto reopen_success; 589 goto reopen_success;
536 } 590 }
537 /* fallthrough to retry open the old way on errors, especially 591 /*
538 in the reconnect path it is important to retry hard */ 592 * fallthrough to retry open the old way on errors, especially
593 * in the reconnect path it is important to retry hard
594 */
539 } 595 }
540 596
541 desiredAccess = cifs_convert_flags(pCifsFile->f_flags); 597 desired_access = cifs_convert_flags(cfile->f_flags);
542 598
543 if (backup_cred(cifs_sb)) 599 if (backup_cred(cifs_sb))
544 create_options |= CREATE_OPEN_BACKUP_INTENT; 600 create_options |= CREATE_OPEN_BACKUP_INTENT;
545 601
546 /* Can not refresh inode by passing in file_info buf to be returned 602 if (server->ops->get_lease_key)
547 by SMBOpen and then calling get_inode_info with returned buf 603 server->ops->get_lease_key(inode, &fid);
548 since file might have write behind data that needs to be flushed
549 and server version of file size can be stale. If we knew for sure
550 that inode was not dirty locally we could do this */
551 604
552 rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess, 605 /*
553 create_options, &netfid, &oplock, NULL, 606 * Can not refresh inode by passing in file_info buf to be returned by
554 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 607 * CIFSSMBOpen and then calling get_inode_info with returned buf since
555 CIFS_MOUNT_MAP_SPECIAL_CHR); 608 * file might have write behind data that needs to be flushed and server
609 * version of file size can be stale. If we knew for sure that inode was
610 * not dirty locally we could do this.
611 */
612 rc = server->ops->open(xid, tcon, full_path, disposition,
613 desired_access, create_options, &fid, &oplock,
614 NULL, cifs_sb);
556 if (rc) { 615 if (rc) {
557 mutex_unlock(&pCifsFile->fh_mutex); 616 mutex_unlock(&cfile->fh_mutex);
558 cFYI(1, "cifs_open returned 0x%x", rc); 617 cFYI(1, "cifs_reopen returned 0x%x", rc);
559 cFYI(1, "oplock: %d", oplock); 618 cFYI(1, "oplock: %d", oplock);
560 goto reopen_error_exit; 619 goto reopen_error_exit;
561 } 620 }
562 621
563reopen_success: 622reopen_success:
564 pCifsFile->netfid = netfid; 623 cfile->invalidHandle = false;
565 pCifsFile->invalidHandle = false; 624 mutex_unlock(&cfile->fh_mutex);
566 mutex_unlock(&pCifsFile->fh_mutex); 625 cinode = CIFS_I(inode);
567 pCifsInode = CIFS_I(inode);
568 626
569 if (can_flush) { 627 if (can_flush) {
570 rc = filemap_write_and_wait(inode->i_mapping); 628 rc = filemap_write_and_wait(inode->i_mapping);
571 mapping_set_error(inode->i_mapping, rc); 629 mapping_set_error(inode->i_mapping, rc);
572 630
573 if (tcon->unix_ext) 631 if (tcon->unix_ext)
574 rc = cifs_get_inode_info_unix(&inode, 632 rc = cifs_get_inode_info_unix(&inode, full_path,
575 full_path, inode->i_sb, xid); 633 inode->i_sb, xid);
576 else 634 else
577 rc = cifs_get_inode_info(&inode, 635 rc = cifs_get_inode_info(&inode, full_path, NULL,
578 full_path, NULL, inode->i_sb, 636 inode->i_sb, xid, NULL);
579 xid, NULL); 637 }
580 } /* else we are writing out data to server already 638 /*
581 and could deadlock if we tried to flush data, and 639 * Else we are writing out data to server already and could deadlock if
582 since we do not know if we have data that would 640 * we tried to flush data, and since we do not know if we have data that
583 invalidate the current end of file on the server 641 * would invalidate the current end of file on the server we can not go
584 we can not go to the server to get the new inod 642 * to the server to get the new inode info.
585 info */ 643 */
586
587 cifs_set_oplock_level(pCifsInode, oplock);
588 644
589 cifs_relock_file(pCifsFile); 645 server->ops->set_fid(cfile, &fid, oplock);
646 cifs_relock_file(cfile);
590 647
591reopen_error_exit: 648reopen_error_exit:
592 kfree(full_path); 649 kfree(full_path);
@@ -609,42 +666,48 @@ int cifs_closedir(struct inode *inode, struct file *file)
609{ 666{
610 int rc = 0; 667 int rc = 0;
611 unsigned int xid; 668 unsigned int xid;
612 struct cifsFileInfo *pCFileStruct = file->private_data; 669 struct cifsFileInfo *cfile = file->private_data;
613 char *ptmp; 670 struct cifs_tcon *tcon;
671 struct TCP_Server_Info *server;
672 char *buf;
614 673
615 cFYI(1, "Closedir inode = 0x%p", inode); 674 cFYI(1, "Closedir inode = 0x%p", inode);
616 675
676 if (cfile == NULL)
677 return rc;
678
617 xid = get_xid(); 679 xid = get_xid();
680 tcon = tlink_tcon(cfile->tlink);
681 server = tcon->ses->server;
618 682
619 if (pCFileStruct) { 683 cFYI(1, "Freeing private data in close dir");
620 struct cifs_tcon *pTcon = tlink_tcon(pCFileStruct->tlink); 684 spin_lock(&cifs_file_list_lock);
685 if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) {
686 cfile->invalidHandle = true;
687 spin_unlock(&cifs_file_list_lock);
688 if (server->ops->close_dir)
689 rc = server->ops->close_dir(xid, tcon, &cfile->fid);
690 else
691 rc = -ENOSYS;
692 cFYI(1, "Closing uncompleted readdir with rc %d", rc);
693 /* not much we can do if it fails anyway, ignore rc */
694 rc = 0;
695 } else
696 spin_unlock(&cifs_file_list_lock);
621 697
622 cFYI(1, "Freeing private data in close dir"); 698 buf = cfile->srch_inf.ntwrk_buf_start;
623 spin_lock(&cifs_file_list_lock); 699 if (buf) {
624 if (!pCFileStruct->srch_inf.endOfSearch && 700 cFYI(1, "closedir free smb buf in srch struct");
625 !pCFileStruct->invalidHandle) { 701 cfile->srch_inf.ntwrk_buf_start = NULL;
626 pCFileStruct->invalidHandle = true; 702 if (cfile->srch_inf.smallBuf)
627 spin_unlock(&cifs_file_list_lock); 703 cifs_small_buf_release(buf);
628 rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid); 704 else
629 cFYI(1, "Closing uncompleted readdir with rc %d", 705 cifs_buf_release(buf);
630 rc);
631 /* not much we can do if it fails anyway, ignore rc */
632 rc = 0;
633 } else
634 spin_unlock(&cifs_file_list_lock);
635 ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
636 if (ptmp) {
637 cFYI(1, "closedir free smb buf in srch struct");
638 pCFileStruct->srch_inf.ntwrk_buf_start = NULL;
639 if (pCFileStruct->srch_inf.smallBuf)
640 cifs_small_buf_release(ptmp);
641 else
642 cifs_buf_release(ptmp);
643 }
644 cifs_put_tlink(pCFileStruct->tlink);
645 kfree(file->private_data);
646 file->private_data = NULL;
647 } 706 }
707
708 cifs_put_tlink(cfile->tlink);
709 kfree(file->private_data);
710 file->private_data = NULL;
648 /* BB can we lock the filestruct while this is going on? */ 711 /* BB can we lock the filestruct while this is going on? */
649 free_xid(xid); 712 free_xid(xid);
650 return rc; 713 return rc;
@@ -666,7 +729,7 @@ cifs_lock_init(__u64 offset, __u64 length, __u8 type)
666 return lock; 729 return lock;
667} 730}
668 731
669static void 732void
670cifs_del_lock_waiters(struct cifsLockInfo *lock) 733cifs_del_lock_waiters(struct cifsLockInfo *lock)
671{ 734{
672 struct cifsLockInfo *li, *tmp; 735 struct cifsLockInfo *li, *tmp;
@@ -677,45 +740,47 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock)
677} 740}
678 741
679static bool 742static bool
680cifs_find_fid_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, 743cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
681 __u64 length, __u8 type, struct cifsFileInfo *cur, 744 __u64 length, __u8 type, struct cifsFileInfo *cfile,
682 struct cifsLockInfo **conf_lock) 745 struct cifsLockInfo **conf_lock, bool rw_check)
683{ 746{
684 struct cifsLockInfo *li; 747 struct cifsLockInfo *li;
748 struct cifsFileInfo *cur_cfile = fdlocks->cfile;
685 struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; 749 struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
686 750
687 list_for_each_entry(li, &cfile->llist, llist) { 751 list_for_each_entry(li, &fdlocks->locks, llist) {
688 if (offset + length <= li->offset || 752 if (offset + length <= li->offset ||
689 offset >= li->offset + li->length) 753 offset >= li->offset + li->length)
690 continue; 754 continue;
691 else if ((type & server->vals->shared_lock_type) && 755 if (rw_check && server->ops->compare_fids(cfile, cur_cfile) &&
692 ((server->ops->compare_fids(cur, cfile) && 756 current->tgid == li->pid)
693 current->tgid == li->pid) || type == li->type))
694 continue; 757 continue;
695 else { 758 if ((type & server->vals->shared_lock_type) &&
759 ((server->ops->compare_fids(cfile, cur_cfile) &&
760 current->tgid == li->pid) || type == li->type))
761 continue;
762 if (conf_lock)
696 *conf_lock = li; 763 *conf_lock = li;
697 return true; 764 return true;
698 }
699 } 765 }
700 return false; 766 return false;
701} 767}
702 768
703static bool 769bool
704cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, 770cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
705 __u8 type, struct cifsLockInfo **conf_lock) 771 __u8 type, struct cifsLockInfo **conf_lock,
772 bool rw_check)
706{ 773{
707 bool rc = false; 774 bool rc = false;
708 struct cifsFileInfo *fid, *tmp; 775 struct cifs_fid_locks *cur;
709 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); 776 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
710 777
711 spin_lock(&cifs_file_list_lock); 778 list_for_each_entry(cur, &cinode->llist, llist) {
712 list_for_each_entry_safe(fid, tmp, &cinode->openFileList, flist) { 779 rc = cifs_find_fid_lock_conflict(cur, offset, length, type,
713 rc = cifs_find_fid_lock_conflict(fid, offset, length, type, 780 cfile, conf_lock, rw_check);
714 cfile, conf_lock);
715 if (rc) 781 if (rc)
716 break; 782 break;
717 } 783 }
718 spin_unlock(&cifs_file_list_lock);
719 784
720 return rc; 785 return rc;
721} 786}
@@ -737,10 +802,10 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
737 struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; 802 struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
738 bool exist; 803 bool exist;
739 804
740 mutex_lock(&cinode->lock_mutex); 805 down_read(&cinode->lock_sem);
741 806
742 exist = cifs_find_lock_conflict(cfile, offset, length, type, 807 exist = cifs_find_lock_conflict(cfile, offset, length, type,
743 &conf_lock); 808 &conf_lock, false);
744 if (exist) { 809 if (exist) {
745 flock->fl_start = conf_lock->offset; 810 flock->fl_start = conf_lock->offset;
746 flock->fl_end = conf_lock->offset + conf_lock->length - 1; 811 flock->fl_end = conf_lock->offset + conf_lock->length - 1;
@@ -754,7 +819,7 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
754 else 819 else
755 flock->fl_type = F_UNLCK; 820 flock->fl_type = F_UNLCK;
756 821
757 mutex_unlock(&cinode->lock_mutex); 822 up_read(&cinode->lock_sem);
758 return rc; 823 return rc;
759} 824}
760 825
@@ -762,9 +827,9 @@ static void
762cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock) 827cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock)
763{ 828{
764 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); 829 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
765 mutex_lock(&cinode->lock_mutex); 830 down_write(&cinode->lock_sem);
766 list_add_tail(&lock->llist, &cfile->llist); 831 list_add_tail(&lock->llist, &cfile->llist->locks);
767 mutex_unlock(&cinode->lock_mutex); 832 up_write(&cinode->lock_sem);
768} 833}
769 834
770/* 835/*
@@ -784,13 +849,13 @@ cifs_lock_add_if(struct cifsFileInfo *cfile, struct cifsLockInfo *lock,
784 849
785try_again: 850try_again:
786 exist = false; 851 exist = false;
787 mutex_lock(&cinode->lock_mutex); 852 down_write(&cinode->lock_sem);
788 853
789 exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length, 854 exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length,
790 lock->type, &conf_lock); 855 lock->type, &conf_lock, false);
791 if (!exist && cinode->can_cache_brlcks) { 856 if (!exist && cinode->can_cache_brlcks) {
792 list_add_tail(&lock->llist, &cfile->llist); 857 list_add_tail(&lock->llist, &cfile->llist->locks);
793 mutex_unlock(&cinode->lock_mutex); 858 up_write(&cinode->lock_sem);
794 return rc; 859 return rc;
795 } 860 }
796 861
@@ -800,17 +865,17 @@ try_again:
800 rc = -EACCES; 865 rc = -EACCES;
801 else { 866 else {
802 list_add_tail(&lock->blist, &conf_lock->blist); 867 list_add_tail(&lock->blist, &conf_lock->blist);
803 mutex_unlock(&cinode->lock_mutex); 868 up_write(&cinode->lock_sem);
804 rc = wait_event_interruptible(lock->block_q, 869 rc = wait_event_interruptible(lock->block_q,
805 (lock->blist.prev == &lock->blist) && 870 (lock->blist.prev == &lock->blist) &&
806 (lock->blist.next == &lock->blist)); 871 (lock->blist.next == &lock->blist));
807 if (!rc) 872 if (!rc)
808 goto try_again; 873 goto try_again;
809 mutex_lock(&cinode->lock_mutex); 874 down_write(&cinode->lock_sem);
810 list_del_init(&lock->blist); 875 list_del_init(&lock->blist);
811 } 876 }
812 877
813 mutex_unlock(&cinode->lock_mutex); 878 up_write(&cinode->lock_sem);
814 return rc; 879 return rc;
815} 880}
816 881
@@ -831,7 +896,7 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock)
831 if ((flock->fl_flags & FL_POSIX) == 0) 896 if ((flock->fl_flags & FL_POSIX) == 0)
832 return 1; 897 return 1;
833 898
834 mutex_lock(&cinode->lock_mutex); 899 down_read(&cinode->lock_sem);
835 posix_test_lock(file, flock); 900 posix_test_lock(file, flock);
836 901
837 if (flock->fl_type == F_UNLCK && !cinode->can_cache_brlcks) { 902 if (flock->fl_type == F_UNLCK && !cinode->can_cache_brlcks) {
@@ -839,7 +904,7 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock)
839 rc = 1; 904 rc = 1;
840 } 905 }
841 906
842 mutex_unlock(&cinode->lock_mutex); 907 up_read(&cinode->lock_sem);
843 return rc; 908 return rc;
844} 909}
845 910
@@ -859,14 +924,14 @@ cifs_posix_lock_set(struct file *file, struct file_lock *flock)
859 return rc; 924 return rc;
860 925
861try_again: 926try_again:
862 mutex_lock(&cinode->lock_mutex); 927 down_write(&cinode->lock_sem);
863 if (!cinode->can_cache_brlcks) { 928 if (!cinode->can_cache_brlcks) {
864 mutex_unlock(&cinode->lock_mutex); 929 up_write(&cinode->lock_sem);
865 return rc; 930 return rc;
866 } 931 }
867 932
868 rc = posix_lock_file(file, flock, NULL); 933 rc = posix_lock_file(file, flock, NULL);
869 mutex_unlock(&cinode->lock_mutex); 934 up_write(&cinode->lock_sem);
870 if (rc == FILE_LOCK_DEFERRED) { 935 if (rc == FILE_LOCK_DEFERRED) {
871 rc = wait_event_interruptible(flock->fl_wait, !flock->fl_next); 936 rc = wait_event_interruptible(flock->fl_wait, !flock->fl_next);
872 if (!rc) 937 if (!rc)
@@ -876,7 +941,7 @@ try_again:
876 return rc; 941 return rc;
877} 942}
878 943
879static int 944int
880cifs_push_mandatory_locks(struct cifsFileInfo *cfile) 945cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
881{ 946{
882 unsigned int xid; 947 unsigned int xid;
@@ -893,9 +958,10 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
893 xid = get_xid(); 958 xid = get_xid();
894 tcon = tlink_tcon(cfile->tlink); 959 tcon = tlink_tcon(cfile->tlink);
895 960
896 mutex_lock(&cinode->lock_mutex); 961 /* we are going to update can_cache_brlcks here - need a write access */
962 down_write(&cinode->lock_sem);
897 if (!cinode->can_cache_brlcks) { 963 if (!cinode->can_cache_brlcks) {
898 mutex_unlock(&cinode->lock_mutex); 964 up_write(&cinode->lock_sem);
899 free_xid(xid); 965 free_xid(xid);
900 return rc; 966 return rc;
901 } 967 }
@@ -906,7 +972,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
906 */ 972 */
907 max_buf = tcon->ses->server->maxBuf; 973 max_buf = tcon->ses->server->maxBuf;
908 if (!max_buf) { 974 if (!max_buf) {
909 mutex_unlock(&cinode->lock_mutex); 975 up_write(&cinode->lock_sem);
910 free_xid(xid); 976 free_xid(xid);
911 return -EINVAL; 977 return -EINVAL;
912 } 978 }
@@ -915,15 +981,15 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
915 sizeof(LOCKING_ANDX_RANGE); 981 sizeof(LOCKING_ANDX_RANGE);
916 buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); 982 buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
917 if (!buf) { 983 if (!buf) {
918 mutex_unlock(&cinode->lock_mutex); 984 up_write(&cinode->lock_sem);
919 free_xid(xid); 985 free_xid(xid);
920 return rc; 986 return -ENOMEM;
921 } 987 }
922 988
923 for (i = 0; i < 2; i++) { 989 for (i = 0; i < 2; i++) {
924 cur = buf; 990 cur = buf;
925 num = 0; 991 num = 0;
926 list_for_each_entry_safe(li, tmp, &cfile->llist, llist) { 992 list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) {
927 if (li->type != types[i]) 993 if (li->type != types[i])
928 continue; 994 continue;
929 cur->Pid = cpu_to_le16(li->pid); 995 cur->Pid = cpu_to_le16(li->pid);
@@ -932,7 +998,8 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
932 cur->OffsetLow = cpu_to_le32((u32)li->offset); 998 cur->OffsetLow = cpu_to_le32((u32)li->offset);
933 cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32)); 999 cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32));
934 if (++num == max_num) { 1000 if (++num == max_num) {
935 stored_rc = cifs_lockv(xid, tcon, cfile->netfid, 1001 stored_rc = cifs_lockv(xid, tcon,
1002 cfile->fid.netfid,
936 (__u8)li->type, 0, num, 1003 (__u8)li->type, 0, num,
937 buf); 1004 buf);
938 if (stored_rc) 1005 if (stored_rc)
@@ -944,7 +1011,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
944 } 1011 }
945 1012
946 if (num) { 1013 if (num) {
947 stored_rc = cifs_lockv(xid, tcon, cfile->netfid, 1014 stored_rc = cifs_lockv(xid, tcon, cfile->fid.netfid,
948 (__u8)types[i], 0, num, buf); 1015 (__u8)types[i], 0, num, buf);
949 if (stored_rc) 1016 if (stored_rc)
950 rc = stored_rc; 1017 rc = stored_rc;
@@ -952,7 +1019,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
952 } 1019 }
953 1020
954 cinode->can_cache_brlcks = false; 1021 cinode->can_cache_brlcks = false;
955 mutex_unlock(&cinode->lock_mutex); 1022 up_write(&cinode->lock_sem);
956 1023
957 kfree(buf); 1024 kfree(buf);
958 free_xid(xid); 1025 free_xid(xid);
@@ -987,9 +1054,10 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
987 1054
988 xid = get_xid(); 1055 xid = get_xid();
989 1056
990 mutex_lock(&cinode->lock_mutex); 1057 /* we are going to update can_cache_brlcks here - need a write access */
1058 down_write(&cinode->lock_sem);
991 if (!cinode->can_cache_brlcks) { 1059 if (!cinode->can_cache_brlcks) {
992 mutex_unlock(&cinode->lock_mutex); 1060 up_write(&cinode->lock_sem);
993 free_xid(xid); 1061 free_xid(xid);
994 return rc; 1062 return rc;
995 } 1063 }
@@ -1005,7 +1073,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1005 1073
1006 /* 1074 /*
1007 * Allocating count locks is enough because no FL_POSIX locks can be 1075 * Allocating count locks is enough because no FL_POSIX locks can be
1008 * added to the list while we are holding cinode->lock_mutex that 1076 * added to the list while we are holding cinode->lock_sem that
1009 * protects locking operations of this inode. 1077 * protects locking operations of this inode.
1010 */ 1078 */
1011 for (; i < count; i++) { 1079 for (; i < count; i++) {
@@ -1038,7 +1106,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1038 type = CIFS_WRLCK; 1106 type = CIFS_WRLCK;
1039 lck = list_entry(el, struct lock_to_push, llist); 1107 lck = list_entry(el, struct lock_to_push, llist);
1040 lck->pid = flock->fl_pid; 1108 lck->pid = flock->fl_pid;
1041 lck->netfid = cfile->netfid; 1109 lck->netfid = cfile->fid.netfid;
1042 lck->length = length; 1110 lck->length = length;
1043 lck->type = type; 1111 lck->type = type;
1044 lck->offset = flock->fl_start; 1112 lck->offset = flock->fl_start;
@@ -1060,7 +1128,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1060 1128
1061out: 1129out:
1062 cinode->can_cache_brlcks = false; 1130 cinode->can_cache_brlcks = false;
1063 mutex_unlock(&cinode->lock_mutex); 1131 up_write(&cinode->lock_sem);
1064 1132
1065 free_xid(xid); 1133 free_xid(xid);
1066 return rc; 1134 return rc;
@@ -1083,7 +1151,7 @@ cifs_push_locks(struct cifsFileInfo *cfile)
1083 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) 1151 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
1084 return cifs_push_posix_locks(cfile); 1152 return cifs_push_posix_locks(cfile);
1085 1153
1086 return cifs_push_mandatory_locks(cfile); 1154 return tcon->ses->server->ops->push_mand_locks(cfile);
1087} 1155}
1088 1156
1089static void 1157static void
@@ -1104,7 +1172,8 @@ cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock,
1104 if (flock->fl_flags & FL_LEASE) 1172 if (flock->fl_flags & FL_LEASE)
1105 cFYI(1, "Lease on file - not implemented yet"); 1173 cFYI(1, "Lease on file - not implemented yet");
1106 if (flock->fl_flags & 1174 if (flock->fl_flags &
1107 (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE))) 1175 (~(FL_POSIX | FL_FLOCK | FL_SLEEP |
1176 FL_ACCESS | FL_LEASE | FL_CLOSE)))
1108 cFYI(1, "Unknown lock flags 0x%x", flock->fl_flags); 1177 cFYI(1, "Unknown lock flags 0x%x", flock->fl_flags);
1109 1178
1110 *type = server->vals->large_lock_type; 1179 *type = server->vals->large_lock_type;
@@ -1134,15 +1203,6 @@ cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock,
1134} 1203}
1135 1204
1136static int 1205static int
1137cifs_mandatory_lock(unsigned int xid, struct cifsFileInfo *cfile, __u64 offset,
1138 __u64 length, __u32 type, int lock, int unlock, bool wait)
1139{
1140 return CIFSSMBLock(xid, tlink_tcon(cfile->tlink), cfile->netfid,
1141 current->tgid, length, offset, unlock, lock,
1142 (__u8)type, wait, 0);
1143}
1144
1145static int
1146cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, 1206cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
1147 bool wait_flag, bool posix_lck, unsigned int xid) 1207 bool wait_flag, bool posix_lck, unsigned int xid)
1148{ 1208{
@@ -1151,7 +1211,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
1151 struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; 1211 struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
1152 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 1212 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
1153 struct TCP_Server_Info *server = tcon->ses->server; 1213 struct TCP_Server_Info *server = tcon->ses->server;
1154 __u16 netfid = cfile->netfid; 1214 __u16 netfid = cfile->fid.netfid;
1155 1215
1156 if (posix_lck) { 1216 if (posix_lck) {
1157 int posix_lock_type; 1217 int posix_lock_type;
@@ -1175,11 +1235,11 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
1175 return rc; 1235 return rc;
1176 1236
1177 /* BB we could chain these into one lock request BB */ 1237 /* BB we could chain these into one lock request BB */
1178 rc = cifs_mandatory_lock(xid, cfile, flock->fl_start, length, type, 1238 rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, type,
1179 1, 0, false); 1239 1, 0, false);
1180 if (rc == 0) { 1240 if (rc == 0) {
1181 rc = cifs_mandatory_lock(xid, cfile, flock->fl_start, length, 1241 rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
1182 type, 0, 1, false); 1242 type, 0, 1, false);
1183 flock->fl_type = F_UNLCK; 1243 flock->fl_type = F_UNLCK;
1184 if (rc != 0) 1244 if (rc != 0)
1185 cERROR(1, "Error unlocking previously locked " 1245 cERROR(1, "Error unlocking previously locked "
@@ -1192,13 +1252,14 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
1192 return 0; 1252 return 0;
1193 } 1253 }
1194 1254
1195 rc = cifs_mandatory_lock(xid, cfile, flock->fl_start, length, 1255 type &= ~server->vals->exclusive_lock_type;
1196 type | server->vals->shared_lock_type, 1, 0, 1256
1197 false); 1257 rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
1258 type | server->vals->shared_lock_type,
1259 1, 0, false);
1198 if (rc == 0) { 1260 if (rc == 0) {
1199 rc = cifs_mandatory_lock(xid, cfile, flock->fl_start, length, 1261 rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
1200 type | server->vals->shared_lock_type, 1262 type | server->vals->shared_lock_type, 0, 1, false);
1201 0, 1, false);
1202 flock->fl_type = F_RDLCK; 1263 flock->fl_type = F_RDLCK;
1203 if (rc != 0) 1264 if (rc != 0)
1204 cERROR(1, "Error unlocking previously locked " 1265 cERROR(1, "Error unlocking previously locked "
@@ -1209,7 +1270,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
1209 return 0; 1270 return 0;
1210} 1271}
1211 1272
1212static void 1273void
1213cifs_move_llist(struct list_head *source, struct list_head *dest) 1274cifs_move_llist(struct list_head *source, struct list_head *dest)
1214{ 1275{
1215 struct list_head *li, *tmp; 1276 struct list_head *li, *tmp;
@@ -1217,7 +1278,7 @@ cifs_move_llist(struct list_head *source, struct list_head *dest)
1217 list_move(li, dest); 1278 list_move(li, dest);
1218} 1279}
1219 1280
1220static void 1281void
1221cifs_free_llist(struct list_head *llist) 1282cifs_free_llist(struct list_head *llist)
1222{ 1283{
1223 struct cifsLockInfo *li, *tmp; 1284 struct cifsLockInfo *li, *tmp;
@@ -1228,7 +1289,7 @@ cifs_free_llist(struct list_head *llist)
1228 } 1289 }
1229} 1290}
1230 1291
1231static int 1292int
1232cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, 1293cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
1233 unsigned int xid) 1294 unsigned int xid)
1234{ 1295{
@@ -1260,11 +1321,11 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
1260 if (!buf) 1321 if (!buf)
1261 return -ENOMEM; 1322 return -ENOMEM;
1262 1323
1263 mutex_lock(&cinode->lock_mutex); 1324 down_write(&cinode->lock_sem);
1264 for (i = 0; i < 2; i++) { 1325 for (i = 0; i < 2; i++) {
1265 cur = buf; 1326 cur = buf;
1266 num = 0; 1327 num = 0;
1267 list_for_each_entry_safe(li, tmp, &cfile->llist, llist) { 1328 list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) {
1268 if (flock->fl_start > li->offset || 1329 if (flock->fl_start > li->offset ||
1269 (flock->fl_start + length) < 1330 (flock->fl_start + length) <
1270 (li->offset + li->length)) 1331 (li->offset + li->length))
@@ -1295,7 +1356,8 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
1295 */ 1356 */
1296 list_move(&li->llist, &tmp_llist); 1357 list_move(&li->llist, &tmp_llist);
1297 if (++num == max_num) { 1358 if (++num == max_num) {
1298 stored_rc = cifs_lockv(xid, tcon, cfile->netfid, 1359 stored_rc = cifs_lockv(xid, tcon,
1360 cfile->fid.netfid,
1299 li->type, num, 0, buf); 1361 li->type, num, 0, buf);
1300 if (stored_rc) { 1362 if (stored_rc) {
1301 /* 1363 /*
@@ -1304,7 +1366,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
1304 * list to the head of the file's list. 1366 * list to the head of the file's list.
1305 */ 1367 */
1306 cifs_move_llist(&tmp_llist, 1368 cifs_move_llist(&tmp_llist,
1307 &cfile->llist); 1369 &cfile->llist->locks);
1308 rc = stored_rc; 1370 rc = stored_rc;
1309 } else 1371 } else
1310 /* 1372 /*
@@ -1318,23 +1380,24 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
1318 cur++; 1380 cur++;
1319 } 1381 }
1320 if (num) { 1382 if (num) {
1321 stored_rc = cifs_lockv(xid, tcon, cfile->netfid, 1383 stored_rc = cifs_lockv(xid, tcon, cfile->fid.netfid,
1322 types[i], num, 0, buf); 1384 types[i], num, 0, buf);
1323 if (stored_rc) { 1385 if (stored_rc) {
1324 cifs_move_llist(&tmp_llist, &cfile->llist); 1386 cifs_move_llist(&tmp_llist,
1387 &cfile->llist->locks);
1325 rc = stored_rc; 1388 rc = stored_rc;
1326 } else 1389 } else
1327 cifs_free_llist(&tmp_llist); 1390 cifs_free_llist(&tmp_llist);
1328 } 1391 }
1329 } 1392 }
1330 1393
1331 mutex_unlock(&cinode->lock_mutex); 1394 up_write(&cinode->lock_sem);
1332 kfree(buf); 1395 kfree(buf);
1333 return rc; 1396 return rc;
1334} 1397}
1335 1398
1336static int 1399static int
1337cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, 1400cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
1338 bool wait_flag, bool posix_lck, int lock, int unlock, 1401 bool wait_flag, bool posix_lck, int lock, int unlock,
1339 unsigned int xid) 1402 unsigned int xid)
1340{ 1403{
@@ -1343,7 +1406,6 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
1343 struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; 1406 struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
1344 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 1407 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
1345 struct TCP_Server_Info *server = tcon->ses->server; 1408 struct TCP_Server_Info *server = tcon->ses->server;
1346 __u16 netfid = cfile->netfid;
1347 1409
1348 if (posix_lck) { 1410 if (posix_lck) {
1349 int posix_lock_type; 1411 int posix_lock_type;
@@ -1360,9 +1422,9 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
1360 if (unlock == 1) 1422 if (unlock == 1)
1361 posix_lock_type = CIFS_UNLCK; 1423 posix_lock_type = CIFS_UNLCK;
1362 1424
1363 rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid, 1425 rc = CIFSSMBPosixLock(xid, tcon, cfile->fid.netfid,
1364 flock->fl_start, length, NULL, 1426 current->tgid, flock->fl_start, length,
1365 posix_lock_type, wait_flag); 1427 NULL, posix_lock_type, wait_flag);
1366 goto out; 1428 goto out;
1367 } 1429 }
1368 1430
@@ -1379,8 +1441,8 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
1379 if (rc <= 0) 1441 if (rc <= 0)
1380 goto out; 1442 goto out;
1381 1443
1382 rc = cifs_mandatory_lock(xid, cfile, flock->fl_start, length, 1444 rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
1383 type, 1, 0, wait_flag); 1445 type, 1, 0, wait_flag);
1384 if (rc) { 1446 if (rc) {
1385 kfree(lock); 1447 kfree(lock);
1386 goto out; 1448 goto out;
@@ -1388,7 +1450,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
1388 1450
1389 cifs_lock_add(cfile, lock); 1451 cifs_lock_add(cfile, lock);
1390 } else if (unlock) 1452 } else if (unlock)
1391 rc = cifs_unlock_range(cfile, flock, xid); 1453 rc = server->ops->mand_unlock_range(cfile, flock, xid);
1392 1454
1393out: 1455out:
1394 if (flock->fl_flags & FL_POSIX) 1456 if (flock->fl_flags & FL_POSIX)
@@ -1423,7 +1485,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
1423 tcon->ses->server); 1485 tcon->ses->server);
1424 1486
1425 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1487 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1426 netfid = cfile->netfid; 1488 netfid = cfile->fid.netfid;
1427 cinode = CIFS_I(file->f_path.dentry->d_inode); 1489 cinode = CIFS_I(file->f_path.dentry->d_inode);
1428 1490
1429 if (cap_unix(tcon->ses) && 1491 if (cap_unix(tcon->ses) &&
@@ -1469,15 +1531,16 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
1469 cifsi->server_eof = end_of_write; 1531 cifsi->server_eof = end_of_write;
1470} 1532}
1471 1533
1472static ssize_t cifs_write(struct cifsFileInfo *open_file, __u32 pid, 1534static ssize_t
1473 const char *write_data, size_t write_size, 1535cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data,
1474 loff_t *poffset) 1536 size_t write_size, loff_t *offset)
1475{ 1537{
1476 int rc = 0; 1538 int rc = 0;
1477 unsigned int bytes_written = 0; 1539 unsigned int bytes_written = 0;
1478 unsigned int total_written; 1540 unsigned int total_written;
1479 struct cifs_sb_info *cifs_sb; 1541 struct cifs_sb_info *cifs_sb;
1480 struct cifs_tcon *pTcon; 1542 struct cifs_tcon *tcon;
1543 struct TCP_Server_Info *server;
1481 unsigned int xid; 1544 unsigned int xid;
1482 struct dentry *dentry = open_file->dentry; 1545 struct dentry *dentry = open_file->dentry;
1483 struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode); 1546 struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode);
@@ -1486,9 +1549,13 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, __u32 pid,
1486 cifs_sb = CIFS_SB(dentry->d_sb); 1549 cifs_sb = CIFS_SB(dentry->d_sb);
1487 1550
1488 cFYI(1, "write %zd bytes to offset %lld of %s", write_size, 1551 cFYI(1, "write %zd bytes to offset %lld of %s", write_size,
1489 *poffset, dentry->d_name.name); 1552 *offset, dentry->d_name.name);
1490 1553
1491 pTcon = tlink_tcon(open_file->tlink); 1554 tcon = tlink_tcon(open_file->tlink);
1555 server = tcon->ses->server;
1556
1557 if (!server->ops->sync_write)
1558 return -ENOSYS;
1492 1559
1493 xid = get_xid(); 1560 xid = get_xid();
1494 1561
@@ -1514,13 +1581,12 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, __u32 pid,
1514 /* iov[0] is reserved for smb header */ 1581 /* iov[0] is reserved for smb header */
1515 iov[1].iov_base = (char *)write_data + total_written; 1582 iov[1].iov_base = (char *)write_data + total_written;
1516 iov[1].iov_len = len; 1583 iov[1].iov_len = len;
1517 io_parms.netfid = open_file->netfid;
1518 io_parms.pid = pid; 1584 io_parms.pid = pid;
1519 io_parms.tcon = pTcon; 1585 io_parms.tcon = tcon;
1520 io_parms.offset = *poffset; 1586 io_parms.offset = *offset;
1521 io_parms.length = len; 1587 io_parms.length = len;
1522 rc = CIFSSMBWrite2(xid, &io_parms, &bytes_written, iov, 1588 rc = server->ops->sync_write(xid, open_file, &io_parms,
1523 1, 0); 1589 &bytes_written, iov, 1);
1524 } 1590 }
1525 if (rc || (bytes_written == 0)) { 1591 if (rc || (bytes_written == 0)) {
1526 if (total_written) 1592 if (total_written)
@@ -1531,18 +1597,18 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, __u32 pid,
1531 } 1597 }
1532 } else { 1598 } else {
1533 spin_lock(&dentry->d_inode->i_lock); 1599 spin_lock(&dentry->d_inode->i_lock);
1534 cifs_update_eof(cifsi, *poffset, bytes_written); 1600 cifs_update_eof(cifsi, *offset, bytes_written);
1535 spin_unlock(&dentry->d_inode->i_lock); 1601 spin_unlock(&dentry->d_inode->i_lock);
1536 *poffset += bytes_written; 1602 *offset += bytes_written;
1537 } 1603 }
1538 } 1604 }
1539 1605
1540 cifs_stats_bytes_written(pTcon, total_written); 1606 cifs_stats_bytes_written(tcon, total_written);
1541 1607
1542 if (total_written > 0) { 1608 if (total_written > 0) {
1543 spin_lock(&dentry->d_inode->i_lock); 1609 spin_lock(&dentry->d_inode->i_lock);
1544 if (*poffset > dentry->d_inode->i_size) 1610 if (*offset > dentry->d_inode->i_size)
1545 i_size_write(dentry->d_inode, *poffset); 1611 i_size_write(dentry->d_inode, *offset);
1546 spin_unlock(&dentry->d_inode->i_lock); 1612 spin_unlock(&dentry->d_inode->i_lock);
1547 } 1613 }
1548 mark_inode_dirty_sync(dentry->d_inode); 1614 mark_inode_dirty_sync(dentry->d_inode);
@@ -1718,27 +1784,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1718 return rc; 1784 return rc;
1719} 1785}
1720 1786
1721/*
1722 * Marshal up the iov array, reserving the first one for the header. Also,
1723 * set wdata->bytes.
1724 */
1725static void
1726cifs_writepages_marshal_iov(struct kvec *iov, struct cifs_writedata *wdata)
1727{
1728 int i;
1729 struct inode *inode = wdata->cfile->dentry->d_inode;
1730 loff_t size = i_size_read(inode);
1731
1732 /* marshal up the pages into iov array */
1733 wdata->bytes = 0;
1734 for (i = 0; i < wdata->nr_pages; i++) {
1735 iov[i + 1].iov_len = min(size - page_offset(wdata->pages[i]),
1736 (loff_t)PAGE_CACHE_SIZE);
1737 iov[i + 1].iov_base = kmap(wdata->pages[i]);
1738 wdata->bytes += iov[i + 1].iov_len;
1739 }
1740}
1741
1742static int cifs_writepages(struct address_space *mapping, 1787static int cifs_writepages(struct address_space *mapping,
1743 struct writeback_control *wbc) 1788 struct writeback_control *wbc)
1744{ 1789{
@@ -1746,8 +1791,10 @@ static int cifs_writepages(struct address_space *mapping,
1746 bool done = false, scanned = false, range_whole = false; 1791 bool done = false, scanned = false, range_whole = false;
1747 pgoff_t end, index; 1792 pgoff_t end, index;
1748 struct cifs_writedata *wdata; 1793 struct cifs_writedata *wdata;
1794 struct TCP_Server_Info *server;
1749 struct page *page; 1795 struct page *page;
1750 int rc = 0; 1796 int rc = 0;
1797 loff_t isize = i_size_read(mapping->host);
1751 1798
1752 /* 1799 /*
1753 * If wsize is smaller than the page cache size, default to writing 1800 * If wsize is smaller than the page cache size, default to writing
@@ -1852,7 +1899,7 @@ retry:
1852 */ 1899 */
1853 set_page_writeback(page); 1900 set_page_writeback(page);
1854 1901
1855 if (page_offset(page) >= mapping->host->i_size) { 1902 if (page_offset(page) >= isize) {
1856 done = true; 1903 done = true;
1857 unlock_page(page); 1904 unlock_page(page);
1858 end_page_writeback(page); 1905 end_page_writeback(page);
@@ -1883,7 +1930,12 @@ retry:
1883 wdata->sync_mode = wbc->sync_mode; 1930 wdata->sync_mode = wbc->sync_mode;
1884 wdata->nr_pages = nr_pages; 1931 wdata->nr_pages = nr_pages;
1885 wdata->offset = page_offset(wdata->pages[0]); 1932 wdata->offset = page_offset(wdata->pages[0]);
1886 wdata->marshal_iov = cifs_writepages_marshal_iov; 1933 wdata->pagesz = PAGE_CACHE_SIZE;
1934 wdata->tailsz =
1935 min(isize - page_offset(wdata->pages[nr_pages - 1]),
1936 (loff_t)PAGE_CACHE_SIZE);
1937 wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) +
1938 wdata->tailsz;
1887 1939
1888 do { 1940 do {
1889 if (wdata->cfile != NULL) 1941 if (wdata->cfile != NULL)
@@ -1896,7 +1948,8 @@ retry:
1896 break; 1948 break;
1897 } 1949 }
1898 wdata->pid = wdata->cfile->pid; 1950 wdata->pid = wdata->cfile->pid;
1899 rc = cifs_async_writev(wdata); 1951 server = tlink_tcon(wdata->cfile->tlink)->ses->server;
1952 rc = server->ops->async_writev(wdata);
1900 } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN); 1953 } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN);
1901 1954
1902 for (i = 0; i < nr_pages; ++i) 1955 for (i = 0; i < nr_pages; ++i)
@@ -2054,6 +2107,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
2054 unsigned int xid; 2107 unsigned int xid;
2055 int rc = 0; 2108 int rc = 0;
2056 struct cifs_tcon *tcon; 2109 struct cifs_tcon *tcon;
2110 struct TCP_Server_Info *server;
2057 struct cifsFileInfo *smbfile = file->private_data; 2111 struct cifsFileInfo *smbfile = file->private_data;
2058 struct inode *inode = file->f_path.dentry->d_inode; 2112 struct inode *inode = file->f_path.dentry->d_inode;
2059 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 2113 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -2077,8 +2131,13 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
2077 } 2131 }
2078 2132
2079 tcon = tlink_tcon(smbfile->tlink); 2133 tcon = tlink_tcon(smbfile->tlink);
2080 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) 2134 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) {
2081 rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); 2135 server = tcon->ses->server;
2136 if (server->ops->flush)
2137 rc = server->ops->flush(xid, tcon, &smbfile->fid);
2138 else
2139 rc = -ENOSYS;
2140 }
2082 2141
2083 free_xid(xid); 2142 free_xid(xid);
2084 mutex_unlock(&inode->i_mutex); 2143 mutex_unlock(&inode->i_mutex);
@@ -2090,6 +2149,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2090 unsigned int xid; 2149 unsigned int xid;
2091 int rc = 0; 2150 int rc = 0;
2092 struct cifs_tcon *tcon; 2151 struct cifs_tcon *tcon;
2152 struct TCP_Server_Info *server;
2093 struct cifsFileInfo *smbfile = file->private_data; 2153 struct cifsFileInfo *smbfile = file->private_data;
2094 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 2154 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
2095 struct inode *inode = file->f_mapping->host; 2155 struct inode *inode = file->f_mapping->host;
@@ -2105,8 +2165,13 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2105 file->f_path.dentry->d_name.name, datasync); 2165 file->f_path.dentry->d_name.name, datasync);
2106 2166
2107 tcon = tlink_tcon(smbfile->tlink); 2167 tcon = tlink_tcon(smbfile->tlink);
2108 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) 2168 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) {
2109 rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); 2169 server = tcon->ses->server;
2170 if (server->ops->flush)
2171 rc = server->ops->flush(xid, tcon, &smbfile->fid);
2172 else
2173 rc = -ENOSYS;
2174 }
2110 2175
2111 free_xid(xid); 2176 free_xid(xid);
2112 mutex_unlock(&inode->i_mutex); 2177 mutex_unlock(&inode->i_mutex);
@@ -2172,20 +2237,6 @@ size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len)
2172} 2237}
2173 2238
2174static void 2239static void
2175cifs_uncached_marshal_iov(struct kvec *iov, struct cifs_writedata *wdata)
2176{
2177 int i;
2178 size_t bytes = wdata->bytes;
2179
2180 /* marshal up the pages into iov array */
2181 for (i = 0; i < wdata->nr_pages; i++) {
2182 iov[i + 1].iov_len = min_t(size_t, bytes, PAGE_SIZE);
2183 iov[i + 1].iov_base = kmap(wdata->pages[i]);
2184 bytes -= iov[i + 1].iov_len;
2185 }
2186}
2187
2188static void
2189cifs_uncached_writev_complete(struct work_struct *work) 2240cifs_uncached_writev_complete(struct work_struct *work)
2190{ 2241{
2191 int i; 2242 int i;
@@ -2215,6 +2266,9 @@ static int
2215cifs_uncached_retry_writev(struct cifs_writedata *wdata) 2266cifs_uncached_retry_writev(struct cifs_writedata *wdata)
2216{ 2267{
2217 int rc; 2268 int rc;
2269 struct TCP_Server_Info *server;
2270
2271 server = tlink_tcon(wdata->cfile->tlink)->ses->server;
2218 2272
2219 do { 2273 do {
2220 if (wdata->cfile->invalidHandle) { 2274 if (wdata->cfile->invalidHandle) {
@@ -2222,7 +2276,7 @@ cifs_uncached_retry_writev(struct cifs_writedata *wdata)
2222 if (rc != 0) 2276 if (rc != 0)
2223 continue; 2277 continue;
2224 } 2278 }
2225 rc = cifs_async_writev(wdata); 2279 rc = server->ops->async_writev(wdata);
2226 } while (rc == -EAGAIN); 2280 } while (rc == -EAGAIN);
2227 2281
2228 return rc; 2282 return rc;
@@ -2257,6 +2311,10 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
2257 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 2311 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
2258 open_file = file->private_data; 2312 open_file = file->private_data;
2259 tcon = tlink_tcon(open_file->tlink); 2313 tcon = tlink_tcon(open_file->tlink);
2314
2315 if (!tcon->ses->server->ops->async_writev)
2316 return -ENOSYS;
2317
2260 offset = *poffset; 2318 offset = *poffset;
2261 2319
2262 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) 2320 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
@@ -2298,7 +2356,8 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
2298 wdata->cfile = cifsFileInfo_get(open_file); 2356 wdata->cfile = cifsFileInfo_get(open_file);
2299 wdata->pid = pid; 2357 wdata->pid = pid;
2300 wdata->bytes = cur_len; 2358 wdata->bytes = cur_len;
2301 wdata->marshal_iov = cifs_uncached_marshal_iov; 2359 wdata->pagesz = PAGE_SIZE;
2360 wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
2302 rc = cifs_uncached_retry_writev(wdata); 2361 rc = cifs_uncached_retry_writev(wdata);
2303 if (rc) { 2362 if (rc) {
2304 kref_put(&wdata->refcount, cifs_writedata_release); 2363 kref_put(&wdata->refcount, cifs_writedata_release);
@@ -2376,40 +2435,110 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
2376 return written; 2435 return written;
2377} 2436}
2378 2437
2379ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, 2438static ssize_t
2380 unsigned long nr_segs, loff_t pos) 2439cifs_writev(struct kiocb *iocb, const struct iovec *iov,
2440 unsigned long nr_segs, loff_t pos)
2381{ 2441{
2382 struct inode *inode; 2442 struct file *file = iocb->ki_filp;
2443 struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
2444 struct inode *inode = file->f_mapping->host;
2445 struct cifsInodeInfo *cinode = CIFS_I(inode);
2446 struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
2447 ssize_t rc = -EACCES;
2383 2448
2384 inode = iocb->ki_filp->f_path.dentry->d_inode; 2449 BUG_ON(iocb->ki_pos != pos);
2385 2450
2386 if (CIFS_I(inode)->clientCanCacheAll) 2451 sb_start_write(inode->i_sb);
2387 return generic_file_aio_write(iocb, iov, nr_segs, pos);
2388 2452
2389 /* 2453 /*
2390 * In strict cache mode we need to write the data to the server exactly 2454 * We need to hold the sem to be sure nobody modifies lock list
2391 * from the pos to pos+len-1 rather than flush all affected pages 2455 * with a brlock that prevents writing.
2392 * because it may cause a error with mandatory locks on these pages but
2393 * not on the region from pos to ppos+len-1.
2394 */ 2456 */
2457 down_read(&cinode->lock_sem);
2458 if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs),
2459 server->vals->exclusive_lock_type, NULL,
2460 true)) {
2461 mutex_lock(&inode->i_mutex);
2462 rc = __generic_file_aio_write(iocb, iov, nr_segs,
2463 &iocb->ki_pos);
2464 mutex_unlock(&inode->i_mutex);
2465 }
2395 2466
2396 return cifs_user_writev(iocb, iov, nr_segs, pos); 2467 if (rc > 0 || rc == -EIOCBQUEUED) {
2468 ssize_t err;
2469
2470 err = generic_write_sync(file, pos, rc);
2471 if (err < 0 && rc > 0)
2472 rc = err;
2473 }
2474
2475 up_read(&cinode->lock_sem);
2476 sb_end_write(inode->i_sb);
2477 return rc;
2478}
2479
2480ssize_t
2481cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
2482 unsigned long nr_segs, loff_t pos)
2483{
2484 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
2485 struct cifsInodeInfo *cinode = CIFS_I(inode);
2486 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
2487 struct cifsFileInfo *cfile = (struct cifsFileInfo *)
2488 iocb->ki_filp->private_data;
2489 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
2490
2491#ifdef CONFIG_CIFS_SMB2
2492 /*
2493 * If we have an oplock for read and want to write a data to the file
2494 * we need to store it in the page cache and then push it to the server
2495 * to be sure the next read will get a valid data.
2496 */
2497 if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead) {
2498 ssize_t written;
2499 int rc;
2500
2501 written = generic_file_aio_write(iocb, iov, nr_segs, pos);
2502 rc = filemap_fdatawrite(inode->i_mapping);
2503 if (rc)
2504 return (ssize_t)rc;
2505
2506 return written;
2507 }
2508#endif
2509
2510 /*
2511 * For non-oplocked files in strict cache mode we need to write the data
2512 * to the server exactly from the pos to pos+len-1 rather than flush all
2513 * affected pages because it may cause a error with mandatory locks on
2514 * these pages but not on the region from pos to ppos+len-1.
2515 */
2516
2517 if (!cinode->clientCanCacheAll)
2518 return cifs_user_writev(iocb, iov, nr_segs, pos);
2519
2520 if (cap_unix(tcon->ses) &&
2521 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
2522 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
2523 return generic_file_aio_write(iocb, iov, nr_segs, pos);
2524
2525 return cifs_writev(iocb, iov, nr_segs, pos);
2397} 2526}
2398 2527
2399static struct cifs_readdata * 2528static struct cifs_readdata *
2400cifs_readdata_alloc(unsigned int nr_vecs, work_func_t complete) 2529cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete)
2401{ 2530{
2402 struct cifs_readdata *rdata; 2531 struct cifs_readdata *rdata;
2403 2532
2404 rdata = kzalloc(sizeof(*rdata) + 2533 rdata = kzalloc(sizeof(*rdata) + (sizeof(struct page *) * nr_pages),
2405 sizeof(struct kvec) * nr_vecs, GFP_KERNEL); 2534 GFP_KERNEL);
2406 if (rdata != NULL) { 2535 if (rdata != NULL) {
2407 kref_init(&rdata->refcount); 2536 kref_init(&rdata->refcount);
2408 INIT_LIST_HEAD(&rdata->list); 2537 INIT_LIST_HEAD(&rdata->list);
2409 init_completion(&rdata->done); 2538 init_completion(&rdata->done);
2410 INIT_WORK(&rdata->work, complete); 2539 INIT_WORK(&rdata->work, complete);
2411 INIT_LIST_HEAD(&rdata->pages);
2412 } 2540 }
2541
2413 return rdata; 2542 return rdata;
2414} 2543}
2415 2544
@@ -2426,25 +2555,25 @@ cifs_readdata_release(struct kref *refcount)
2426} 2555}
2427 2556
2428static int 2557static int
2429cifs_read_allocate_pages(struct list_head *list, unsigned int npages) 2558cifs_read_allocate_pages(struct cifs_readdata *rdata, unsigned int nr_pages)
2430{ 2559{
2431 int rc = 0; 2560 int rc = 0;
2432 struct page *page, *tpage; 2561 struct page *page;
2433 unsigned int i; 2562 unsigned int i;
2434 2563
2435 for (i = 0; i < npages; i++) { 2564 for (i = 0; i < nr_pages; i++) {
2436 page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); 2565 page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
2437 if (!page) { 2566 if (!page) {
2438 rc = -ENOMEM; 2567 rc = -ENOMEM;
2439 break; 2568 break;
2440 } 2569 }
2441 list_add(&page->lru, list); 2570 rdata->pages[i] = page;
2442 } 2571 }
2443 2572
2444 if (rc) { 2573 if (rc) {
2445 list_for_each_entry_safe(page, tpage, list, lru) { 2574 for (i = 0; i < nr_pages; i++) {
2446 list_del(&page->lru); 2575 put_page(rdata->pages[i]);
2447 put_page(page); 2576 rdata->pages[i] = NULL;
2448 } 2577 }
2449 } 2578 }
2450 return rc; 2579 return rc;
@@ -2453,13 +2582,13 @@ cifs_read_allocate_pages(struct list_head *list, unsigned int npages)
2453static void 2582static void
2454cifs_uncached_readdata_release(struct kref *refcount) 2583cifs_uncached_readdata_release(struct kref *refcount)
2455{ 2584{
2456 struct page *page, *tpage;
2457 struct cifs_readdata *rdata = container_of(refcount, 2585 struct cifs_readdata *rdata = container_of(refcount,
2458 struct cifs_readdata, refcount); 2586 struct cifs_readdata, refcount);
2587 unsigned int i;
2459 2588
2460 list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { 2589 for (i = 0; i < rdata->nr_pages; i++) {
2461 list_del(&page->lru); 2590 put_page(rdata->pages[i]);
2462 put_page(page); 2591 rdata->pages[i] = NULL;
2463 } 2592 }
2464 cifs_readdata_release(refcount); 2593 cifs_readdata_release(refcount);
2465} 2594}
@@ -2468,6 +2597,9 @@ static int
2468cifs_retry_async_readv(struct cifs_readdata *rdata) 2597cifs_retry_async_readv(struct cifs_readdata *rdata)
2469{ 2598{
2470 int rc; 2599 int rc;
2600 struct TCP_Server_Info *server;
2601
2602 server = tlink_tcon(rdata->cfile->tlink)->ses->server;
2471 2603
2472 do { 2604 do {
2473 if (rdata->cfile->invalidHandle) { 2605 if (rdata->cfile->invalidHandle) {
@@ -2475,7 +2607,7 @@ cifs_retry_async_readv(struct cifs_readdata *rdata)
2475 if (rc != 0) 2607 if (rc != 0)
2476 continue; 2608 continue;
2477 } 2609 }
2478 rc = cifs_async_readv(rdata); 2610 rc = server->ops->async_readv(rdata);
2479 } while (rc == -EAGAIN); 2611 } while (rc == -EAGAIN);
2480 2612
2481 return rc; 2613 return rc;
@@ -2500,17 +2632,18 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, const struct iovec *iov,
2500 int rc = 0; 2632 int rc = 0;
2501 struct iov_iter ii; 2633 struct iov_iter ii;
2502 size_t pos = rdata->offset - offset; 2634 size_t pos = rdata->offset - offset;
2503 struct page *page, *tpage;
2504 ssize_t remaining = rdata->bytes; 2635 ssize_t remaining = rdata->bytes;
2505 unsigned char *pdata; 2636 unsigned char *pdata;
2637 unsigned int i;
2506 2638
2507 /* set up iov_iter and advance to the correct offset */ 2639 /* set up iov_iter and advance to the correct offset */
2508 iov_iter_init(&ii, iov, nr_segs, iov_length(iov, nr_segs), 0); 2640 iov_iter_init(&ii, iov, nr_segs, iov_length(iov, nr_segs), 0);
2509 iov_iter_advance(&ii, pos); 2641 iov_iter_advance(&ii, pos);
2510 2642
2511 *copied = 0; 2643 *copied = 0;
2512 list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { 2644 for (i = 0; i < rdata->nr_pages; i++) {
2513 ssize_t copy; 2645 ssize_t copy;
2646 struct page *page = rdata->pages[i];
2514 2647
2515 /* copy a whole page or whatever's left */ 2648 /* copy a whole page or whatever's left */
2516 copy = min_t(ssize_t, remaining, PAGE_SIZE); 2649 copy = min_t(ssize_t, remaining, PAGE_SIZE);
@@ -2530,9 +2663,6 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, const struct iovec *iov,
2530 iov_iter_advance(&ii, copy); 2663 iov_iter_advance(&ii, copy);
2531 } 2664 }
2532 } 2665 }
2533
2534 list_del(&page->lru);
2535 put_page(page);
2536 } 2666 }
2537 2667
2538 return rc; 2668 return rc;
@@ -2544,59 +2674,56 @@ cifs_uncached_readv_complete(struct work_struct *work)
2544 struct cifs_readdata *rdata = container_of(work, 2674 struct cifs_readdata *rdata = container_of(work,
2545 struct cifs_readdata, work); 2675 struct cifs_readdata, work);
2546 2676
2547 /* if the result is non-zero then the pages weren't kmapped */
2548 if (rdata->result == 0) {
2549 struct page *page;
2550
2551 list_for_each_entry(page, &rdata->pages, lru)
2552 kunmap(page);
2553 }
2554
2555 complete(&rdata->done); 2677 complete(&rdata->done);
2556 kref_put(&rdata->refcount, cifs_uncached_readdata_release); 2678 kref_put(&rdata->refcount, cifs_uncached_readdata_release);
2557} 2679}
2558 2680
2559static int 2681static int
2560cifs_uncached_read_marshal_iov(struct cifs_readdata *rdata, 2682cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
2561 unsigned int remaining) 2683 struct cifs_readdata *rdata, unsigned int len)
2562{ 2684{
2563 int len = 0; 2685 int total_read = 0, result = 0;
2564 struct page *page, *tpage; 2686 unsigned int i;
2687 unsigned int nr_pages = rdata->nr_pages;
2688 struct kvec iov;
2689
2690 rdata->tailsz = PAGE_SIZE;
2691 for (i = 0; i < nr_pages; i++) {
2692 struct page *page = rdata->pages[i];
2565 2693
2566 rdata->nr_iov = 1; 2694 if (len >= PAGE_SIZE) {
2567 list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
2568 if (remaining >= PAGE_SIZE) {
2569 /* enough data to fill the page */ 2695 /* enough data to fill the page */
2570 rdata->iov[rdata->nr_iov].iov_base = kmap(page); 2696 iov.iov_base = kmap(page);
2571 rdata->iov[rdata->nr_iov].iov_len = PAGE_SIZE; 2697 iov.iov_len = PAGE_SIZE;
2572 cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", 2698 cFYI(1, "%u: iov_base=%p iov_len=%zu",
2573 rdata->nr_iov, page->index, 2699 i, iov.iov_base, iov.iov_len);
2574 rdata->iov[rdata->nr_iov].iov_base, 2700 len -= PAGE_SIZE;
2575 rdata->iov[rdata->nr_iov].iov_len); 2701 } else if (len > 0) {
2576 ++rdata->nr_iov;
2577 len += PAGE_SIZE;
2578 remaining -= PAGE_SIZE;
2579 } else if (remaining > 0) {
2580 /* enough for partial page, fill and zero the rest */ 2702 /* enough for partial page, fill and zero the rest */
2581 rdata->iov[rdata->nr_iov].iov_base = kmap(page); 2703 iov.iov_base = kmap(page);
2582 rdata->iov[rdata->nr_iov].iov_len = remaining; 2704 iov.iov_len = len;
2583 cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", 2705 cFYI(1, "%u: iov_base=%p iov_len=%zu",
2584 rdata->nr_iov, page->index, 2706 i, iov.iov_base, iov.iov_len);
2585 rdata->iov[rdata->nr_iov].iov_base, 2707 memset(iov.iov_base + len, '\0', PAGE_SIZE - len);
2586 rdata->iov[rdata->nr_iov].iov_len); 2708 rdata->tailsz = len;
2587 memset(rdata->iov[rdata->nr_iov].iov_base + remaining, 2709 len = 0;
2588 '\0', PAGE_SIZE - remaining);
2589 ++rdata->nr_iov;
2590 len += remaining;
2591 remaining = 0;
2592 } else { 2710 } else {
2593 /* no need to hold page hostage */ 2711 /* no need to hold page hostage */
2594 list_del(&page->lru); 2712 rdata->pages[i] = NULL;
2713 rdata->nr_pages--;
2595 put_page(page); 2714 put_page(page);
2715 continue;
2596 } 2716 }
2717
2718 result = cifs_readv_from_socket(server, &iov, 1, iov.iov_len);
2719 kunmap(page);
2720 if (result < 0)
2721 break;
2722
2723 total_read += result;
2597 } 2724 }
2598 2725
2599 return len; 2726 return total_read > 0 ? total_read : result;
2600} 2727}
2601 2728
2602static ssize_t 2729static ssize_t
@@ -2627,6 +2754,9 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
2627 open_file = file->private_data; 2754 open_file = file->private_data;
2628 tcon = tlink_tcon(open_file->tlink); 2755 tcon = tlink_tcon(open_file->tlink);
2629 2756
2757 if (!tcon->ses->server->ops->async_readv)
2758 return -ENOSYS;
2759
2630 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) 2760 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
2631 pid = open_file->pid; 2761 pid = open_file->pid;
2632 else 2762 else
@@ -2647,15 +2777,17 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
2647 goto error; 2777 goto error;
2648 } 2778 }
2649 2779
2650 rc = cifs_read_allocate_pages(&rdata->pages, npages); 2780 rc = cifs_read_allocate_pages(rdata, npages);
2651 if (rc) 2781 if (rc)
2652 goto error; 2782 goto error;
2653 2783
2654 rdata->cfile = cifsFileInfo_get(open_file); 2784 rdata->cfile = cifsFileInfo_get(open_file);
2785 rdata->nr_pages = npages;
2655 rdata->offset = offset; 2786 rdata->offset = offset;
2656 rdata->bytes = cur_len; 2787 rdata->bytes = cur_len;
2657 rdata->pid = pid; 2788 rdata->pid = pid;
2658 rdata->marshal_iov = cifs_uncached_read_marshal_iov; 2789 rdata->pagesz = PAGE_SIZE;
2790 rdata->read_into_pages = cifs_uncached_read_into_pages;
2659 2791
2660 rc = cifs_retry_async_readv(rdata); 2792 rc = cifs_retry_async_readv(rdata);
2661error: 2793error:
@@ -2706,6 +2838,10 @@ restart_loop:
2706 cifs_stats_bytes_read(tcon, total_read); 2838 cifs_stats_bytes_read(tcon, total_read);
2707 *poffset += total_read; 2839 *poffset += total_read;
2708 2840
2841 /* mask nodata case */
2842 if (rc == -ENODATA)
2843 rc = 0;
2844
2709 return total_read ? total_read : rc; 2845 return total_read ? total_read : rc;
2710} 2846}
2711 2847
@@ -2721,15 +2857,17 @@ ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
2721 return read; 2857 return read;
2722} 2858}
2723 2859
2724ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, 2860ssize_t
2725 unsigned long nr_segs, loff_t pos) 2861cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
2862 unsigned long nr_segs, loff_t pos)
2726{ 2863{
2727 struct inode *inode; 2864 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
2728 2865 struct cifsInodeInfo *cinode = CIFS_I(inode);
2729 inode = iocb->ki_filp->f_path.dentry->d_inode; 2866 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
2730 2867 struct cifsFileInfo *cfile = (struct cifsFileInfo *)
2731 if (CIFS_I(inode)->clientCanCacheRead) 2868 iocb->ki_filp->private_data;
2732 return generic_file_aio_read(iocb, iov, nr_segs, pos); 2869 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
2870 int rc = -EACCES;
2733 2871
2734 /* 2872 /*
2735 * In strict cache mode we need to read from the server all the time 2873 * In strict cache mode we need to read from the server all the time
@@ -2739,12 +2877,29 @@ ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
2739 * on pages affected by this read but not on the region from pos to 2877 * on pages affected by this read but not on the region from pos to
2740 * pos+len-1. 2878 * pos+len-1.
2741 */ 2879 */
2880 if (!cinode->clientCanCacheRead)
2881 return cifs_user_readv(iocb, iov, nr_segs, pos);
2882
2883 if (cap_unix(tcon->ses) &&
2884 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
2885 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
2886 return generic_file_aio_read(iocb, iov, nr_segs, pos);
2742 2887
2743 return cifs_user_readv(iocb, iov, nr_segs, pos); 2888 /*
2889 * We need to hold the sem to be sure nobody modifies lock list
2890 * with a brlock that prevents reading.
2891 */
2892 down_read(&cinode->lock_sem);
2893 if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs),
2894 tcon->ses->server->vals->shared_lock_type,
2895 NULL, true))
2896 rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
2897 up_read(&cinode->lock_sem);
2898 return rc;
2744} 2899}
2745 2900
2746static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, 2901static ssize_t
2747 loff_t *poffset) 2902cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset)
2748{ 2903{
2749 int rc = -EACCES; 2904 int rc = -EACCES;
2750 unsigned int bytes_read = 0; 2905 unsigned int bytes_read = 0;
@@ -2753,8 +2908,9 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
2753 unsigned int rsize; 2908 unsigned int rsize;
2754 struct cifs_sb_info *cifs_sb; 2909 struct cifs_sb_info *cifs_sb;
2755 struct cifs_tcon *tcon; 2910 struct cifs_tcon *tcon;
2911 struct TCP_Server_Info *server;
2756 unsigned int xid; 2912 unsigned int xid;
2757 char *current_offset; 2913 char *cur_offset;
2758 struct cifsFileInfo *open_file; 2914 struct cifsFileInfo *open_file;
2759 struct cifs_io_parms io_parms; 2915 struct cifs_io_parms io_parms;
2760 int buf_type = CIFS_NO_BUFFER; 2916 int buf_type = CIFS_NO_BUFFER;
@@ -2773,6 +2929,12 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
2773 } 2929 }
2774 open_file = file->private_data; 2930 open_file = file->private_data;
2775 tcon = tlink_tcon(open_file->tlink); 2931 tcon = tlink_tcon(open_file->tlink);
2932 server = tcon->ses->server;
2933
2934 if (!server->ops->sync_read) {
2935 free_xid(xid);
2936 return -ENOSYS;
2937 }
2776 2938
2777 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) 2939 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
2778 pid = open_file->pid; 2940 pid = open_file->pid;
@@ -2782,9 +2944,8 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
2782 if ((file->f_flags & O_ACCMODE) == O_WRONLY) 2944 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
2783 cFYI(1, "attempting read on write only file instance"); 2945 cFYI(1, "attempting read on write only file instance");
2784 2946
2785 for (total_read = 0, current_offset = read_data; 2947 for (total_read = 0, cur_offset = read_data; read_size > total_read;
2786 read_size > total_read; 2948 total_read += bytes_read, cur_offset += bytes_read) {
2787 total_read += bytes_read, current_offset += bytes_read) {
2788 current_read_size = min_t(uint, read_size - total_read, rsize); 2949 current_read_size = min_t(uint, read_size - total_read, rsize);
2789 /* 2950 /*
2790 * For windows me and 9x we do not want to request more than it 2951 * For windows me and 9x we do not want to request more than it
@@ -2802,13 +2963,13 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
2802 if (rc != 0) 2963 if (rc != 0)
2803 break; 2964 break;
2804 } 2965 }
2805 io_parms.netfid = open_file->netfid;
2806 io_parms.pid = pid; 2966 io_parms.pid = pid;
2807 io_parms.tcon = tcon; 2967 io_parms.tcon = tcon;
2808 io_parms.offset = *poffset; 2968 io_parms.offset = *offset;
2809 io_parms.length = current_read_size; 2969 io_parms.length = current_read_size;
2810 rc = CIFSSMBRead(xid, &io_parms, &bytes_read, 2970 rc = server->ops->sync_read(xid, open_file, &io_parms,
2811 &current_offset, &buf_type); 2971 &bytes_read, &cur_offset,
2972 &buf_type);
2812 } 2973 }
2813 if (rc || (bytes_read == 0)) { 2974 if (rc || (bytes_read == 0)) {
2814 if (total_read) { 2975 if (total_read) {
@@ -2819,7 +2980,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
2819 } 2980 }
2820 } else { 2981 } else {
2821 cifs_stats_bytes_read(tcon, total_read); 2982 cifs_stats_bytes_read(tcon, total_read);
2822 *poffset += bytes_read; 2983 *offset += bytes_read;
2823 } 2984 }
2824 } 2985 }
2825 free_xid(xid); 2986 free_xid(xid);
@@ -2842,6 +3003,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
2842static struct vm_operations_struct cifs_file_vm_ops = { 3003static struct vm_operations_struct cifs_file_vm_ops = {
2843 .fault = filemap_fault, 3004 .fault = filemap_fault,
2844 .page_mkwrite = cifs_page_mkwrite, 3005 .page_mkwrite = cifs_page_mkwrite,
3006 .remap_pages = generic_file_remap_pages,
2845}; 3007};
2846 3008
2847int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) 3009int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
@@ -2885,16 +3047,16 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
2885static void 3047static void
2886cifs_readv_complete(struct work_struct *work) 3048cifs_readv_complete(struct work_struct *work)
2887{ 3049{
3050 unsigned int i;
2888 struct cifs_readdata *rdata = container_of(work, 3051 struct cifs_readdata *rdata = container_of(work,
2889 struct cifs_readdata, work); 3052 struct cifs_readdata, work);
2890 struct page *page, *tpage;
2891 3053
2892 list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { 3054 for (i = 0; i < rdata->nr_pages; i++) {
2893 list_del(&page->lru); 3055 struct page *page = rdata->pages[i];
3056
2894 lru_cache_add_file(page); 3057 lru_cache_add_file(page);
2895 3058
2896 if (rdata->result == 0) { 3059 if (rdata->result == 0) {
2897 kunmap(page);
2898 flush_dcache_page(page); 3060 flush_dcache_page(page);
2899 SetPageUptodate(page); 3061 SetPageUptodate(page);
2900 } 3062 }
@@ -2905,49 +3067,48 @@ cifs_readv_complete(struct work_struct *work)
2905 cifs_readpage_to_fscache(rdata->mapping->host, page); 3067 cifs_readpage_to_fscache(rdata->mapping->host, page);
2906 3068
2907 page_cache_release(page); 3069 page_cache_release(page);
3070 rdata->pages[i] = NULL;
2908 } 3071 }
2909 kref_put(&rdata->refcount, cifs_readdata_release); 3072 kref_put(&rdata->refcount, cifs_readdata_release);
2910} 3073}
2911 3074
2912static int 3075static int
2913cifs_readpages_marshal_iov(struct cifs_readdata *rdata, unsigned int remaining) 3076cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
3077 struct cifs_readdata *rdata, unsigned int len)
2914{ 3078{
2915 int len = 0; 3079 int total_read = 0, result = 0;
2916 struct page *page, *tpage; 3080 unsigned int i;
2917 u64 eof; 3081 u64 eof;
2918 pgoff_t eof_index; 3082 pgoff_t eof_index;
3083 unsigned int nr_pages = rdata->nr_pages;
3084 struct kvec iov;
2919 3085
2920 /* determine the eof that the server (probably) has */ 3086 /* determine the eof that the server (probably) has */
2921 eof = CIFS_I(rdata->mapping->host)->server_eof; 3087 eof = CIFS_I(rdata->mapping->host)->server_eof;
2922 eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0; 3088 eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
2923 cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index); 3089 cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index);
2924 3090
2925 rdata->nr_iov = 1; 3091 rdata->tailsz = PAGE_CACHE_SIZE;
2926 list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { 3092 for (i = 0; i < nr_pages; i++) {
2927 if (remaining >= PAGE_CACHE_SIZE) { 3093 struct page *page = rdata->pages[i];
3094
3095 if (len >= PAGE_CACHE_SIZE) {
2928 /* enough data to fill the page */ 3096 /* enough data to fill the page */
2929 rdata->iov[rdata->nr_iov].iov_base = kmap(page); 3097 iov.iov_base = kmap(page);
2930 rdata->iov[rdata->nr_iov].iov_len = PAGE_CACHE_SIZE; 3098 iov.iov_len = PAGE_CACHE_SIZE;
2931 cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", 3099 cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
2932 rdata->nr_iov, page->index, 3100 i, page->index, iov.iov_base, iov.iov_len);
2933 rdata->iov[rdata->nr_iov].iov_base, 3101 len -= PAGE_CACHE_SIZE;
2934 rdata->iov[rdata->nr_iov].iov_len); 3102 } else if (len > 0) {
2935 ++rdata->nr_iov;
2936 len += PAGE_CACHE_SIZE;
2937 remaining -= PAGE_CACHE_SIZE;
2938 } else if (remaining > 0) {
2939 /* enough for partial page, fill and zero the rest */ 3103 /* enough for partial page, fill and zero the rest */
2940 rdata->iov[rdata->nr_iov].iov_base = kmap(page); 3104 iov.iov_base = kmap(page);
2941 rdata->iov[rdata->nr_iov].iov_len = remaining; 3105 iov.iov_len = len;
2942 cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", 3106 cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
2943 rdata->nr_iov, page->index, 3107 i, page->index, iov.iov_base, iov.iov_len);
2944 rdata->iov[rdata->nr_iov].iov_base, 3108 memset(iov.iov_base + len,
2945 rdata->iov[rdata->nr_iov].iov_len); 3109 '\0', PAGE_CACHE_SIZE - len);
2946 memset(rdata->iov[rdata->nr_iov].iov_base + remaining, 3110 rdata->tailsz = len;
2947 '\0', PAGE_CACHE_SIZE - remaining); 3111 len = 0;
2948 ++rdata->nr_iov;
2949 len += remaining;
2950 remaining = 0;
2951 } else if (page->index > eof_index) { 3112 } else if (page->index > eof_index) {
2952 /* 3113 /*
2953 * The VFS will not try to do readahead past the 3114 * The VFS will not try to do readahead past the
@@ -2958,22 +3119,33 @@ cifs_readpages_marshal_iov(struct cifs_readdata *rdata, unsigned int remaining)
2958 * fill them until the writes are flushed. 3119 * fill them until the writes are flushed.
2959 */ 3120 */
2960 zero_user(page, 0, PAGE_CACHE_SIZE); 3121 zero_user(page, 0, PAGE_CACHE_SIZE);
2961 list_del(&page->lru);
2962 lru_cache_add_file(page); 3122 lru_cache_add_file(page);
2963 flush_dcache_page(page); 3123 flush_dcache_page(page);
2964 SetPageUptodate(page); 3124 SetPageUptodate(page);
2965 unlock_page(page); 3125 unlock_page(page);
2966 page_cache_release(page); 3126 page_cache_release(page);
3127 rdata->pages[i] = NULL;
3128 rdata->nr_pages--;
3129 continue;
2967 } else { 3130 } else {
2968 /* no need to hold page hostage */ 3131 /* no need to hold page hostage */
2969 list_del(&page->lru);
2970 lru_cache_add_file(page); 3132 lru_cache_add_file(page);
2971 unlock_page(page); 3133 unlock_page(page);
2972 page_cache_release(page); 3134 page_cache_release(page);
3135 rdata->pages[i] = NULL;
3136 rdata->nr_pages--;
3137 continue;
2973 } 3138 }
3139
3140 result = cifs_readv_from_socket(server, &iov, 1, iov.iov_len);
3141 kunmap(page);
3142 if (result < 0)
3143 break;
3144
3145 total_read += result;
2974 } 3146 }
2975 3147
2976 return len; 3148 return total_read > 0 ? total_read : result;
2977} 3149}
2978 3150
2979static int cifs_readpages(struct file *file, struct address_space *mapping, 3151static int cifs_readpages(struct file *file, struct address_space *mapping,
@@ -3027,6 +3199,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
3027 * the rdata->pages, then we want them in increasing order. 3199 * the rdata->pages, then we want them in increasing order.
3028 */ 3200 */
3029 while (!list_empty(page_list)) { 3201 while (!list_empty(page_list)) {
3202 unsigned int i;
3030 unsigned int bytes = PAGE_CACHE_SIZE; 3203 unsigned int bytes = PAGE_CACHE_SIZE;
3031 unsigned int expected_index; 3204 unsigned int expected_index;
3032 unsigned int nr_pages = 1; 3205 unsigned int nr_pages = 1;
@@ -3096,14 +3269,18 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
3096 rdata->offset = offset; 3269 rdata->offset = offset;
3097 rdata->bytes = bytes; 3270 rdata->bytes = bytes;
3098 rdata->pid = pid; 3271 rdata->pid = pid;
3099 rdata->marshal_iov = cifs_readpages_marshal_iov; 3272 rdata->pagesz = PAGE_CACHE_SIZE;
3100 list_splice_init(&tmplist, &rdata->pages); 3273 rdata->read_into_pages = cifs_readpages_read_into_pages;
3274
3275 list_for_each_entry_safe(page, tpage, &tmplist, lru) {
3276 list_del(&page->lru);
3277 rdata->pages[rdata->nr_pages++] = page;
3278 }
3101 3279
3102 rc = cifs_retry_async_readv(rdata); 3280 rc = cifs_retry_async_readv(rdata);
3103 if (rc != 0) { 3281 if (rc != 0) {
3104 list_for_each_entry_safe(page, tpage, &rdata->pages, 3282 for (i = 0; i < rdata->nr_pages; i++) {
3105 lru) { 3283 page = rdata->pages[i];
3106 list_del(&page->lru);
3107 lru_cache_add_file(page); 3284 lru_cache_add_file(page);
3108 unlock_page(page); 3285 unlock_page(page);
3109 page_cache_release(page); 3286 page_cache_release(page);
@@ -3347,6 +3524,7 @@ void cifs_oplock_break(struct work_struct *work)
3347 oplock_break); 3524 oplock_break);
3348 struct inode *inode = cfile->dentry->d_inode; 3525 struct inode *inode = cfile->dentry->d_inode;
3349 struct cifsInodeInfo *cinode = CIFS_I(inode); 3526 struct cifsInodeInfo *cinode = CIFS_I(inode);
3527 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
3350 int rc = 0; 3528 int rc = 0;
3351 3529
3352 if (inode && S_ISREG(inode->i_mode)) { 3530 if (inode && S_ISREG(inode->i_mode)) {
@@ -3374,10 +3552,8 @@ void cifs_oplock_break(struct work_struct *work)
3374 * disconnected since oplock already released by the server 3552 * disconnected since oplock already released by the server
3375 */ 3553 */
3376 if (!cfile->oplock_break_cancelled) { 3554 if (!cfile->oplock_break_cancelled) {
3377 rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 3555 rc = tcon->ses->server->ops->oplock_response(tcon, &cfile->fid,
3378 current->tgid, 0, 0, 0, 0, 3556 cinode);
3379 LOCKING_ANDX_OPLOCK_RELEASE, false,
3380 cinode->clientCanCacheRead ? 1 : 0);
3381 cFYI(1, "Oplock release rc = %d", rc); 3557 cFYI(1, "Oplock release rc = %d", rc);
3382 } 3558 }
3383} 3559}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index cb79c7edecb0..afdff79651f1 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -282,7 +282,8 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
282 fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL; 282 fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
283} 283}
284 284
285int cifs_get_file_info_unix(struct file *filp) 285static int
286cifs_get_file_info_unix(struct file *filp)
286{ 287{
287 int rc; 288 int rc;
288 unsigned int xid; 289 unsigned int xid;
@@ -294,7 +295,7 @@ int cifs_get_file_info_unix(struct file *filp)
294 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 295 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
295 296
296 xid = get_xid(); 297 xid = get_xid();
297 rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data); 298 rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->fid.netfid, &find_data);
298 if (!rc) { 299 if (!rc) {
299 cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb); 300 cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
300 } else if (rc == -EREMOTE) { 301 } else if (rc == -EREMOTE) {
@@ -550,7 +551,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
550 fattr->cf_gid = cifs_sb->mnt_gid; 551 fattr->cf_gid = cifs_sb->mnt_gid;
551} 552}
552 553
553int cifs_get_file_info(struct file *filp) 554static int
555cifs_get_file_info(struct file *filp)
554{ 556{
555 int rc; 557 int rc;
556 unsigned int xid; 558 unsigned int xid;
@@ -560,9 +562,13 @@ int cifs_get_file_info(struct file *filp)
560 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 562 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
561 struct cifsFileInfo *cfile = filp->private_data; 563 struct cifsFileInfo *cfile = filp->private_data;
562 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 564 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
565 struct TCP_Server_Info *server = tcon->ses->server;
566
567 if (!server->ops->query_file_info)
568 return -ENOSYS;
563 569
564 xid = get_xid(); 570 xid = get_xid();
565 rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data); 571 rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data);
566 switch (rc) { 572 switch (rc) {
567 case 0: 573 case 0:
568 cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false); 574 cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
@@ -601,7 +607,9 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
601 FILE_ALL_INFO *data, struct super_block *sb, int xid, 607 FILE_ALL_INFO *data, struct super_block *sb, int xid,
602 const __u16 *fid) 608 const __u16 *fid)
603{ 609{
604 int rc = 0, tmprc; 610 bool validinum = false;
611 __u16 srchflgs;
612 int rc = 0, tmprc = ENOSYS;
605 struct cifs_tcon *tcon; 613 struct cifs_tcon *tcon;
606 struct TCP_Server_Info *server; 614 struct TCP_Server_Info *server;
607 struct tcon_link *tlink; 615 struct tcon_link *tlink;
@@ -609,6 +617,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
609 char *buf = NULL; 617 char *buf = NULL;
610 bool adjust_tz = false; 618 bool adjust_tz = false;
611 struct cifs_fattr fattr; 619 struct cifs_fattr fattr;
620 struct cifs_search_info *srchinf = NULL;
612 621
613 tlink = cifs_sb_tlink(cifs_sb); 622 tlink = cifs_sb_tlink(cifs_sb);
614 if (IS_ERR(tlink)) 623 if (IS_ERR(tlink))
@@ -647,9 +656,38 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
647 } else if (rc == -EREMOTE) { 656 } else if (rc == -EREMOTE) {
648 cifs_create_dfs_fattr(&fattr, sb); 657 cifs_create_dfs_fattr(&fattr, sb);
649 rc = 0; 658 rc = 0;
650 } else { 659 } else if (rc == -EACCES && backup_cred(cifs_sb)) {
660 srchinf = kzalloc(sizeof(struct cifs_search_info),
661 GFP_KERNEL);
662 if (srchinf == NULL) {
663 rc = -ENOMEM;
664 goto cgii_exit;
665 }
666
667 srchinf->endOfSearch = false;
668 srchinf->info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO;
669
670 srchflgs = CIFS_SEARCH_CLOSE_ALWAYS |
671 CIFS_SEARCH_CLOSE_AT_END |
672 CIFS_SEARCH_BACKUP_SEARCH;
673
674 rc = CIFSFindFirst(xid, tcon, full_path,
675 cifs_sb, NULL, srchflgs, srchinf, false);
676 if (!rc) {
677 data =
678 (FILE_ALL_INFO *)srchinf->srch_entries_start;
679
680 cifs_dir_info_to_fattr(&fattr,
681 (FILE_DIRECTORY_INFO *)data, cifs_sb);
682 fattr.cf_uniqueid = le64_to_cpu(
683 ((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId);
684 validinum = true;
685
686 cifs_buf_release(srchinf->ntwrk_buf_start);
687 }
688 kfree(srchinf);
689 } else
651 goto cgii_exit; 690 goto cgii_exit;
652 }
653 691
654 /* 692 /*
655 * If an inode wasn't passed in, then get the inode number 693 * If an inode wasn't passed in, then get the inode number
@@ -660,23 +698,21 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
660 */ 698 */
661 if (*inode == NULL) { 699 if (*inode == NULL) {
662 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 700 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
663 if (server->ops->get_srv_inum) 701 if (validinum == false) {
664 tmprc = server->ops->get_srv_inum(xid, tcon, 702 if (server->ops->get_srv_inum)
665 cifs_sb, full_path, &fattr.cf_uniqueid, 703 tmprc = server->ops->get_srv_inum(xid,
666 data); 704 tcon, cifs_sb, full_path,
667 else 705 &fattr.cf_uniqueid, data);
668 tmprc = -ENOSYS; 706 if (tmprc) {
669 if (tmprc || !fattr.cf_uniqueid) { 707 cFYI(1, "GetSrvInodeNum rc %d", tmprc);
670 cFYI(1, "GetSrvInodeNum rc %d", tmprc); 708 fattr.cf_uniqueid = iunique(sb, ROOT_I);
671 fattr.cf_uniqueid = iunique(sb, ROOT_I); 709 cifs_autodisable_serverino(cifs_sb);
672 cifs_autodisable_serverino(cifs_sb); 710 }
673 } 711 }
674 } else { 712 } else
675 fattr.cf_uniqueid = iunique(sb, ROOT_I); 713 fattr.cf_uniqueid = iunique(sb, ROOT_I);
676 } 714 } else
677 } else {
678 fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; 715 fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid;
679 }
680 716
681 /* query for SFU type info if supported and needed */ 717 /* query for SFU type info if supported and needed */
682 if (fattr.cf_cifsattrs & ATTR_SYSTEM && 718 if (fattr.cf_cifsattrs & ATTR_SYSTEM &&
@@ -876,25 +912,22 @@ out:
876 return inode; 912 return inode;
877} 913}
878 914
879static int 915int
880cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid, 916cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid,
881 char *full_path, __u32 dosattr) 917 char *full_path, __u32 dosattr)
882{ 918{
883 int rc;
884 int oplock = 0;
885 __u16 netfid;
886 __u32 netpid;
887 bool set_time = false; 919 bool set_time = false;
888 struct cifsFileInfo *open_file;
889 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
890 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 920 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
891 struct tcon_link *tlink = NULL; 921 struct TCP_Server_Info *server;
892 struct cifs_tcon *pTcon;
893 FILE_BASIC_INFO info_buf; 922 FILE_BASIC_INFO info_buf;
894 923
895 if (attrs == NULL) 924 if (attrs == NULL)
896 return -EINVAL; 925 return -EINVAL;
897 926
927 server = cifs_sb_master_tcon(cifs_sb)->ses->server;
928 if (!server->ops->set_file_info)
929 return -ENOSYS;
930
898 if (attrs->ia_valid & ATTR_ATIME) { 931 if (attrs->ia_valid & ATTR_ATIME) {
899 set_time = true; 932 set_time = true;
900 info_buf.LastAccessTime = 933 info_buf.LastAccessTime =
@@ -925,81 +958,17 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid,
925 info_buf.CreationTime = 0; /* don't change */ 958 info_buf.CreationTime = 0; /* don't change */
926 info_buf.Attributes = cpu_to_le32(dosattr); 959 info_buf.Attributes = cpu_to_le32(dosattr);
927 960
928 /* 961 return server->ops->set_file_info(inode, full_path, &info_buf, xid);
929 * If the file is already open for write, just use that fileid
930 */
931 open_file = find_writable_file(cifsInode, true);
932 if (open_file) {
933 netfid = open_file->netfid;
934 netpid = open_file->pid;
935 pTcon = tlink_tcon(open_file->tlink);
936 goto set_via_filehandle;
937 }
938
939 tlink = cifs_sb_tlink(cifs_sb);
940 if (IS_ERR(tlink)) {
941 rc = PTR_ERR(tlink);
942 tlink = NULL;
943 goto out;
944 }
945 pTcon = tlink_tcon(tlink);
946
947 /*
948 * NT4 apparently returns success on this call, but it doesn't
949 * really work.
950 */
951 if (!(pTcon->ses->flags & CIFS_SES_NT4)) {
952 rc = CIFSSMBSetPathInfo(xid, pTcon, full_path,
953 &info_buf, cifs_sb->local_nls,
954 cifs_sb->mnt_cifs_flags &
955 CIFS_MOUNT_MAP_SPECIAL_CHR);
956 if (rc == 0) {
957 cifsInode->cifsAttrs = dosattr;
958 goto out;
959 } else if (rc != -EOPNOTSUPP && rc != -EINVAL)
960 goto out;
961 }
962
963 cFYI(1, "calling SetFileInfo since SetPathInfo for "
964 "times not supported by this server");
965 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
966 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
967 CREATE_NOT_DIR, &netfid, &oplock,
968 NULL, cifs_sb->local_nls,
969 cifs_sb->mnt_cifs_flags &
970 CIFS_MOUNT_MAP_SPECIAL_CHR);
971
972 if (rc != 0) {
973 if (rc == -EIO)
974 rc = -EINVAL;
975 goto out;
976 }
977
978 netpid = current->tgid;
979
980set_via_filehandle:
981 rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid);
982 if (!rc)
983 cifsInode->cifsAttrs = dosattr;
984
985 if (open_file == NULL)
986 CIFSSMBClose(xid, pTcon, netfid);
987 else
988 cifsFileInfo_put(open_file);
989out:
990 if (tlink != NULL)
991 cifs_put_tlink(tlink);
992 return rc;
993} 962}
994 963
995/* 964/*
996 * open the given file (if it isn't already), set the DELETE_ON_CLOSE bit 965 * Open the given file (if it isn't already), set the DELETE_ON_CLOSE bit
997 * and rename it to a random name that hopefully won't conflict with 966 * and rename it to a random name that hopefully won't conflict with
998 * anything else. 967 * anything else.
999 */ 968 */
1000static int 969int
1001cifs_rename_pending_delete(char *full_path, struct dentry *dentry, 970cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
1002 unsigned int xid) 971 const unsigned int xid)
1003{ 972{
1004 int oplock = 0; 973 int oplock = 0;
1005 int rc; 974 int rc;
@@ -1136,6 +1105,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
1136 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 1105 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
1137 struct tcon_link *tlink; 1106 struct tcon_link *tlink;
1138 struct cifs_tcon *tcon; 1107 struct cifs_tcon *tcon;
1108 struct TCP_Server_Info *server;
1139 struct iattr *attrs = NULL; 1109 struct iattr *attrs = NULL;
1140 __u32 dosattr = 0, origattr = 0; 1110 __u32 dosattr = 0, origattr = 0;
1141 1111
@@ -1145,6 +1115,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
1145 if (IS_ERR(tlink)) 1115 if (IS_ERR(tlink))
1146 return PTR_ERR(tlink); 1116 return PTR_ERR(tlink);
1147 tcon = tlink_tcon(tlink); 1117 tcon = tlink_tcon(tlink);
1118 server = tcon->ses->server;
1148 1119
1149 xid = get_xid(); 1120 xid = get_xid();
1150 1121
@@ -1167,8 +1138,12 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
1167 } 1138 }
1168 1139
1169retry_std_delete: 1140retry_std_delete:
1170 rc = CIFSSMBDelFile(xid, tcon, full_path, cifs_sb->local_nls, 1141 if (!server->ops->unlink) {
1171 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 1142 rc = -ENOSYS;
1143 goto psx_del_no_retry;
1144 }
1145
1146 rc = server->ops->unlink(xid, tcon, full_path, cifs_sb);
1172 1147
1173psx_del_no_retry: 1148psx_del_no_retry:
1174 if (!rc) { 1149 if (!rc) {
@@ -1177,9 +1152,14 @@ psx_del_no_retry:
1177 } else if (rc == -ENOENT) { 1152 } else if (rc == -ENOENT) {
1178 d_drop(dentry); 1153 d_drop(dentry);
1179 } else if (rc == -ETXTBSY) { 1154 } else if (rc == -ETXTBSY) {
1180 rc = cifs_rename_pending_delete(full_path, dentry, xid); 1155 if (server->ops->rename_pending_delete) {
1181 if (rc == 0) 1156 rc = server->ops->rename_pending_delete(full_path,
1182 cifs_drop_nlink(inode); 1157 dentry, xid);
1158 if (rc == 0)
1159 cifs_drop_nlink(inode);
1160 }
1161 if (rc == -ETXTBSY)
1162 rc = -EBUSY;
1183 } else if ((rc == -EACCES) && (dosattr == 0) && inode) { 1163 } else if ((rc == -EACCES) && (dosattr == 0) && inode) {
1184 attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); 1164 attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
1185 if (attrs == NULL) { 1165 if (attrs == NULL) {
@@ -1227,34 +1207,33 @@ unlink_out:
1227} 1207}
1228 1208
1229static int 1209static int
1230cifs_mkdir_qinfo(struct inode *inode, struct dentry *dentry, umode_t mode, 1210cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
1231 const char *full_path, struct cifs_sb_info *cifs_sb, 1211 const char *full_path, struct cifs_sb_info *cifs_sb,
1232 struct cifs_tcon *tcon, const unsigned int xid) 1212 struct cifs_tcon *tcon, const unsigned int xid)
1233{ 1213{
1234 int rc = 0; 1214 int rc = 0;
1235 struct inode *newinode = NULL; 1215 struct inode *inode = NULL;
1236 1216
1237 if (tcon->unix_ext) 1217 if (tcon->unix_ext)
1238 rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb, 1218 rc = cifs_get_inode_info_unix(&inode, full_path, parent->i_sb,
1239 xid); 1219 xid);
1240 else 1220 else
1241 rc = cifs_get_inode_info(&newinode, full_path, NULL, 1221 rc = cifs_get_inode_info(&inode, full_path, NULL, parent->i_sb,
1242 inode->i_sb, xid, NULL); 1222 xid, NULL);
1223
1243 if (rc) 1224 if (rc)
1244 return rc; 1225 return rc;
1245 1226
1246 d_instantiate(dentry, newinode);
1247 /* 1227 /*
1248 * setting nlink not necessary except in cases where we failed to get it 1228 * setting nlink not necessary except in cases where we failed to get it
1249 * from the server or was set bogus 1229 * from the server or was set bogus. Also, since this is a brand new
1230 * inode, no need to grab the i_lock before setting the i_nlink.
1250 */ 1231 */
1251 spin_lock(&dentry->d_inode->i_lock); 1232 if (inode->i_nlink < 2)
1252 if ((dentry->d_inode) && (dentry->d_inode->i_nlink < 2)) 1233 set_nlink(inode, 2);
1253 set_nlink(dentry->d_inode, 2);
1254 spin_unlock(&dentry->d_inode->i_lock);
1255 mode &= ~current_umask(); 1234 mode &= ~current_umask();
1256 /* must turn on setgid bit if parent dir has it */ 1235 /* must turn on setgid bit if parent dir has it */
1257 if (inode->i_mode & S_ISGID) 1236 if (parent->i_mode & S_ISGID)
1258 mode |= S_ISGID; 1237 mode |= S_ISGID;
1259 1238
1260 if (tcon->unix_ext) { 1239 if (tcon->unix_ext) {
@@ -1267,8 +1246,8 @@ cifs_mkdir_qinfo(struct inode *inode, struct dentry *dentry, umode_t mode,
1267 }; 1246 };
1268 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { 1247 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
1269 args.uid = (__u64)current_fsuid(); 1248 args.uid = (__u64)current_fsuid();
1270 if (inode->i_mode & S_ISGID) 1249 if (parent->i_mode & S_ISGID)
1271 args.gid = (__u64)inode->i_gid; 1250 args.gid = (__u64)parent->i_gid;
1272 else 1251 else
1273 args.gid = (__u64)current_fsgid(); 1252 args.gid = (__u64)current_fsgid();
1274 } else { 1253 } else {
@@ -1283,22 +1262,20 @@ cifs_mkdir_qinfo(struct inode *inode, struct dentry *dentry, umode_t mode,
1283 struct TCP_Server_Info *server = tcon->ses->server; 1262 struct TCP_Server_Info *server = tcon->ses->server;
1284 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && 1263 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
1285 (mode & S_IWUGO) == 0 && server->ops->mkdir_setinfo) 1264 (mode & S_IWUGO) == 0 && server->ops->mkdir_setinfo)
1286 server->ops->mkdir_setinfo(newinode, full_path, cifs_sb, 1265 server->ops->mkdir_setinfo(inode, full_path, cifs_sb,
1287 tcon, xid); 1266 tcon, xid);
1288 if (dentry->d_inode) { 1267 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
1289 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) 1268 inode->i_mode = (mode | S_IFDIR);
1290 dentry->d_inode->i_mode = (mode | S_IFDIR); 1269
1291 1270 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
1292 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { 1271 inode->i_uid = current_fsuid();
1293 dentry->d_inode->i_uid = current_fsuid(); 1272 if (inode->i_mode & S_ISGID)
1294 if (inode->i_mode & S_ISGID) 1273 inode->i_gid = parent->i_gid;
1295 dentry->d_inode->i_gid = inode->i_gid; 1274 else
1296 else 1275 inode->i_gid = current_fsgid();
1297 dentry->d_inode->i_gid =
1298 current_fsgid();
1299 }
1300 } 1276 }
1301 } 1277 }
1278 d_instantiate(dentry, inode);
1302 return rc; 1279 return rc;
1303} 1280}
1304 1281
@@ -1495,29 +1472,32 @@ rmdir_exit:
1495} 1472}
1496 1473
1497static int 1474static int
1498cifs_do_rename(unsigned int xid, struct dentry *from_dentry, 1475cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
1499 const char *fromPath, struct dentry *to_dentry, 1476 const char *from_path, struct dentry *to_dentry,
1500 const char *toPath) 1477 const char *to_path)
1501{ 1478{
1502 struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb); 1479 struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb);
1503 struct tcon_link *tlink; 1480 struct tcon_link *tlink;
1504 struct cifs_tcon *pTcon; 1481 struct cifs_tcon *tcon;
1482 struct TCP_Server_Info *server;
1505 __u16 srcfid; 1483 __u16 srcfid;
1506 int oplock, rc; 1484 int oplock, rc;
1507 1485
1508 tlink = cifs_sb_tlink(cifs_sb); 1486 tlink = cifs_sb_tlink(cifs_sb);
1509 if (IS_ERR(tlink)) 1487 if (IS_ERR(tlink))
1510 return PTR_ERR(tlink); 1488 return PTR_ERR(tlink);
1511 pTcon = tlink_tcon(tlink); 1489 tcon = tlink_tcon(tlink);
1490 server = tcon->ses->server;
1491
1492 if (!server->ops->rename)
1493 return -ENOSYS;
1512 1494
1513 /* try path-based rename first */ 1495 /* try path-based rename first */
1514 rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls, 1496 rc = server->ops->rename(xid, tcon, from_path, to_path, cifs_sb);
1515 cifs_sb->mnt_cifs_flags &
1516 CIFS_MOUNT_MAP_SPECIAL_CHR);
1517 1497
1518 /* 1498 /*
1519 * don't bother with rename by filehandle unless file is busy and 1499 * Don't bother with rename by filehandle unless file is busy and
1520 * source Note that cross directory moves do not work with 1500 * source. Note that cross directory moves do not work with
1521 * rename by filehandle to various Windows servers. 1501 * rename by filehandle to various Windows servers.
1522 */ 1502 */
1523 if (rc == 0 || rc != -ETXTBSY) 1503 if (rc == 0 || rc != -ETXTBSY)
@@ -1528,29 +1508,28 @@ cifs_do_rename(unsigned int xid, struct dentry *from_dentry,
1528 goto do_rename_exit; 1508 goto do_rename_exit;
1529 1509
1530 /* open the file to be renamed -- we need DELETE perms */ 1510 /* open the file to be renamed -- we need DELETE perms */
1531 rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE, 1511 rc = CIFSSMBOpen(xid, tcon, from_path, FILE_OPEN, DELETE,
1532 CREATE_NOT_DIR, &srcfid, &oplock, NULL, 1512 CREATE_NOT_DIR, &srcfid, &oplock, NULL,
1533 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 1513 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
1534 CIFS_MOUNT_MAP_SPECIAL_CHR); 1514 CIFS_MOUNT_MAP_SPECIAL_CHR);
1535
1536 if (rc == 0) { 1515 if (rc == 0) {
1537 rc = CIFSSMBRenameOpenFile(xid, pTcon, srcfid, 1516 rc = CIFSSMBRenameOpenFile(xid, tcon, srcfid,
1538 (const char *) to_dentry->d_name.name, 1517 (const char *) to_dentry->d_name.name,
1539 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 1518 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
1540 CIFS_MOUNT_MAP_SPECIAL_CHR); 1519 CIFS_MOUNT_MAP_SPECIAL_CHR);
1541 1520 CIFSSMBClose(xid, tcon, srcfid);
1542 CIFSSMBClose(xid, pTcon, srcfid);
1543 } 1521 }
1544do_rename_exit: 1522do_rename_exit:
1545 cifs_put_tlink(tlink); 1523 cifs_put_tlink(tlink);
1546 return rc; 1524 return rc;
1547} 1525}
1548 1526
1549int cifs_rename(struct inode *source_dir, struct dentry *source_dentry, 1527int
1550 struct inode *target_dir, struct dentry *target_dentry) 1528cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1529 struct inode *target_dir, struct dentry *target_dentry)
1551{ 1530{
1552 char *fromName = NULL; 1531 char *from_name = NULL;
1553 char *toName = NULL; 1532 char *to_name = NULL;
1554 struct cifs_sb_info *cifs_sb; 1533 struct cifs_sb_info *cifs_sb;
1555 struct tcon_link *tlink; 1534 struct tcon_link *tlink;
1556 struct cifs_tcon *tcon; 1535 struct cifs_tcon *tcon;
@@ -1571,25 +1550,25 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1571 * we already have the rename sem so we do not need to 1550 * we already have the rename sem so we do not need to
1572 * grab it again here to protect the path integrity 1551 * grab it again here to protect the path integrity
1573 */ 1552 */
1574 fromName = build_path_from_dentry(source_dentry); 1553 from_name = build_path_from_dentry(source_dentry);
1575 if (fromName == NULL) { 1554 if (from_name == NULL) {
1576 rc = -ENOMEM; 1555 rc = -ENOMEM;
1577 goto cifs_rename_exit; 1556 goto cifs_rename_exit;
1578 } 1557 }
1579 1558
1580 toName = build_path_from_dentry(target_dentry); 1559 to_name = build_path_from_dentry(target_dentry);
1581 if (toName == NULL) { 1560 if (to_name == NULL) {
1582 rc = -ENOMEM; 1561 rc = -ENOMEM;
1583 goto cifs_rename_exit; 1562 goto cifs_rename_exit;
1584 } 1563 }
1585 1564
1586 rc = cifs_do_rename(xid, source_dentry, fromName, 1565 rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
1587 target_dentry, toName); 1566 to_name);
1588 1567
1589 if (rc == -EEXIST && tcon->unix_ext) { 1568 if (rc == -EEXIST && tcon->unix_ext) {
1590 /* 1569 /*
1591 * Are src and dst hardlinks of same inode? We can 1570 * Are src and dst hardlinks of same inode? We can only tell
1592 * only tell with unix extensions enabled 1571 * with unix extensions enabled.
1593 */ 1572 */
1594 info_buf_source = 1573 info_buf_source =
1595 kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO), 1574 kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO),
@@ -1600,19 +1579,19 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1600 } 1579 }
1601 1580
1602 info_buf_target = info_buf_source + 1; 1581 info_buf_target = info_buf_source + 1;
1603 tmprc = CIFSSMBUnixQPathInfo(xid, tcon, fromName, 1582 tmprc = CIFSSMBUnixQPathInfo(xid, tcon, from_name,
1604 info_buf_source, 1583 info_buf_source,
1605 cifs_sb->local_nls, 1584 cifs_sb->local_nls,
1606 cifs_sb->mnt_cifs_flags & 1585 cifs_sb->mnt_cifs_flags &
1607 CIFS_MOUNT_MAP_SPECIAL_CHR); 1586 CIFS_MOUNT_MAP_SPECIAL_CHR);
1608 if (tmprc != 0) 1587 if (tmprc != 0)
1609 goto unlink_target; 1588 goto unlink_target;
1610 1589
1611 tmprc = CIFSSMBUnixQPathInfo(xid, tcon, toName, 1590 tmprc = CIFSSMBUnixQPathInfo(xid, tcon, to_name,
1612 info_buf_target, 1591 info_buf_target,
1613 cifs_sb->local_nls, 1592 cifs_sb->local_nls,
1614 cifs_sb->mnt_cifs_flags & 1593 cifs_sb->mnt_cifs_flags &
1615 CIFS_MOUNT_MAP_SPECIAL_CHR); 1594 CIFS_MOUNT_MAP_SPECIAL_CHR);
1616 1595
1617 if (tmprc == 0 && (info_buf_source->UniqueId == 1596 if (tmprc == 0 && (info_buf_source->UniqueId ==
1618 info_buf_target->UniqueId)) { 1597 info_buf_target->UniqueId)) {
@@ -1620,8 +1599,11 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1620 rc = 0; 1599 rc = 0;
1621 goto cifs_rename_exit; 1600 goto cifs_rename_exit;
1622 } 1601 }
1623 } /* else ... BB we could add the same check for Windows by 1602 }
1624 checking the UniqueId via FILE_INTERNAL_INFO */ 1603 /*
1604 * else ... BB we could add the same check for Windows by
1605 * checking the UniqueId via FILE_INTERNAL_INFO
1606 */
1625 1607
1626unlink_target: 1608unlink_target:
1627 /* Try unlinking the target dentry if it's not negative */ 1609 /* Try unlinking the target dentry if it's not negative */
@@ -1629,15 +1611,14 @@ unlink_target:
1629 tmprc = cifs_unlink(target_dir, target_dentry); 1611 tmprc = cifs_unlink(target_dir, target_dentry);
1630 if (tmprc) 1612 if (tmprc)
1631 goto cifs_rename_exit; 1613 goto cifs_rename_exit;
1632 1614 rc = cifs_do_rename(xid, source_dentry, from_name,
1633 rc = cifs_do_rename(xid, source_dentry, fromName, 1615 target_dentry, to_name);
1634 target_dentry, toName);
1635 } 1616 }
1636 1617
1637cifs_rename_exit: 1618cifs_rename_exit:
1638 kfree(info_buf_source); 1619 kfree(info_buf_source);
1639 kfree(fromName); 1620 kfree(from_name);
1640 kfree(toName); 1621 kfree(to_name);
1641 free_xid(xid); 1622 free_xid(xid);
1642 cifs_put_tlink(tlink); 1623 cifs_put_tlink(tlink);
1643 return rc; 1624 return rc;
@@ -1862,7 +1843,8 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1862 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 1843 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
1863 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 1844 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1864 struct tcon_link *tlink = NULL; 1845 struct tcon_link *tlink = NULL;
1865 struct cifs_tcon *pTcon = NULL; 1846 struct cifs_tcon *tcon = NULL;
1847 struct TCP_Server_Info *server;
1866 struct cifs_io_parms io_parms; 1848 struct cifs_io_parms io_parms;
1867 1849
1868 /* 1850 /*
@@ -1876,19 +1858,21 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1876 */ 1858 */
1877 open_file = find_writable_file(cifsInode, true); 1859 open_file = find_writable_file(cifsInode, true);
1878 if (open_file) { 1860 if (open_file) {
1879 __u16 nfid = open_file->netfid; 1861 tcon = tlink_tcon(open_file->tlink);
1880 __u32 npid = open_file->pid; 1862 server = tcon->ses->server;
1881 pTcon = tlink_tcon(open_file->tlink); 1863 if (server->ops->set_file_size)
1882 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, 1864 rc = server->ops->set_file_size(xid, tcon, open_file,
1883 npid, false); 1865 attrs->ia_size, false);
1866 else
1867 rc = -ENOSYS;
1884 cifsFileInfo_put(open_file); 1868 cifsFileInfo_put(open_file);
1885 cFYI(1, "SetFSize for attrs rc = %d", rc); 1869 cFYI(1, "SetFSize for attrs rc = %d", rc);
1886 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 1870 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
1887 unsigned int bytes_written; 1871 unsigned int bytes_written;
1888 1872
1889 io_parms.netfid = nfid; 1873 io_parms.netfid = open_file->fid.netfid;
1890 io_parms.pid = npid; 1874 io_parms.pid = open_file->pid;
1891 io_parms.tcon = pTcon; 1875 io_parms.tcon = tcon;
1892 io_parms.offset = 0; 1876 io_parms.offset = 0;
1893 io_parms.length = attrs->ia_size; 1877 io_parms.length = attrs->ia_size;
1894 rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, 1878 rc = CIFSSMBWrite(xid, &io_parms, &bytes_written,
@@ -1898,52 +1882,55 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1898 } else 1882 } else
1899 rc = -EINVAL; 1883 rc = -EINVAL;
1900 1884
1901 if (rc != 0) { 1885 if (!rc)
1902 if (pTcon == NULL) { 1886 goto set_size_out;
1903 tlink = cifs_sb_tlink(cifs_sb);
1904 if (IS_ERR(tlink))
1905 return PTR_ERR(tlink);
1906 pTcon = tlink_tcon(tlink);
1907 }
1908 1887
1909 /* Set file size by pathname rather than by handle 1888 if (tcon == NULL) {
1910 either because no valid, writeable file handle for 1889 tlink = cifs_sb_tlink(cifs_sb);
1911 it was found or because there was an error setting 1890 if (IS_ERR(tlink))
1912 it by handle */ 1891 return PTR_ERR(tlink);
1913 rc = CIFSSMBSetEOF(xid, pTcon, full_path, attrs->ia_size, 1892 tcon = tlink_tcon(tlink);
1914 false, cifs_sb->local_nls, 1893 server = tcon->ses->server;
1894 }
1895
1896 /*
1897 * Set file size by pathname rather than by handle either because no
1898 * valid, writeable file handle for it was found or because there was
1899 * an error setting it by handle.
1900 */
1901 if (server->ops->set_path_size)
1902 rc = server->ops->set_path_size(xid, tcon, full_path,
1903 attrs->ia_size, cifs_sb, false);
1904 else
1905 rc = -ENOSYS;
1906 cFYI(1, "SetEOF by path (setattrs) rc = %d", rc);
1907 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
1908 __u16 netfid;
1909 int oplock = 0;
1910
1911 rc = SMBLegacyOpen(xid, tcon, full_path, FILE_OPEN,
1912 GENERIC_WRITE, CREATE_NOT_DIR, &netfid,
1913 &oplock, NULL, cifs_sb->local_nls,
1915 cifs_sb->mnt_cifs_flags & 1914 cifs_sb->mnt_cifs_flags &
1916 CIFS_MOUNT_MAP_SPECIAL_CHR); 1915 CIFS_MOUNT_MAP_SPECIAL_CHR);
1917 cFYI(1, "SetEOF by path (setattrs) rc = %d", rc); 1916 if (rc == 0) {
1918 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 1917 unsigned int bytes_written;
1919 __u16 netfid; 1918
1920 int oplock = 0; 1919 io_parms.netfid = netfid;
1921 1920 io_parms.pid = current->tgid;
1922 rc = SMBLegacyOpen(xid, pTcon, full_path, 1921 io_parms.tcon = tcon;
1923 FILE_OPEN, GENERIC_WRITE, 1922 io_parms.offset = 0;
1924 CREATE_NOT_DIR, &netfid, &oplock, NULL, 1923 io_parms.length = attrs->ia_size;
1925 cifs_sb->local_nls, 1924 rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, NULL,
1926 cifs_sb->mnt_cifs_flags & 1925 NULL, 1);
1927 CIFS_MOUNT_MAP_SPECIAL_CHR); 1926 cFYI(1, "wrt seteof rc %d", rc);
1928 if (rc == 0) { 1927 CIFSSMBClose(xid, tcon, netfid);
1929 unsigned int bytes_written;
1930
1931 io_parms.netfid = netfid;
1932 io_parms.pid = current->tgid;
1933 io_parms.tcon = pTcon;
1934 io_parms.offset = 0;
1935 io_parms.length = attrs->ia_size;
1936 rc = CIFSSMBWrite(xid, &io_parms,
1937 &bytes_written,
1938 NULL, NULL, 1);
1939 cFYI(1, "wrt seteof rc %d", rc);
1940 CIFSSMBClose(xid, pTcon, netfid);
1941 }
1942 } 1928 }
1943 if (tlink)
1944 cifs_put_tlink(tlink);
1945 } 1929 }
1930 if (tlink)
1931 cifs_put_tlink(tlink);
1946 1932
1933set_size_out:
1947 if (rc == 0) { 1934 if (rc == 0) {
1948 cifsInode->server_eof = attrs->ia_size; 1935 cifsInode->server_eof = attrs->ia_size;
1949 cifs_setsize(inode, attrs->ia_size); 1936 cifs_setsize(inode, attrs->ia_size);
@@ -2050,7 +2037,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
2050 args->device = 0; 2037 args->device = 0;
2051 open_file = find_writable_file(cifsInode, true); 2038 open_file = find_writable_file(cifsInode, true);
2052 if (open_file) { 2039 if (open_file) {
2053 u16 nfid = open_file->netfid; 2040 u16 nfid = open_file->fid.netfid;
2054 u32 npid = open_file->pid; 2041 u32 npid = open_file->pid;
2055 pTcon = tlink_tcon(open_file->tlink); 2042 pTcon = tlink_tcon(open_file->tlink);
2056 rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid); 2043 rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index ae082a66de2f..fd5009d56f9f 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -28,8 +28,6 @@
28#include "cifs_debug.h" 28#include "cifs_debug.h"
29#include "cifsfs.h" 29#include "cifsfs.h"
30 30
31#define CIFS_IOC_CHECKUMOUNT _IO(0xCF, 2)
32
33long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) 31long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
34{ 32{
35 struct inode *inode = filep->f_dentry->d_inode; 33 struct inode *inode = filep->f_dentry->d_inode;
@@ -51,23 +49,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
51 cifs_sb = CIFS_SB(inode->i_sb); 49 cifs_sb = CIFS_SB(inode->i_sb);
52 50
53 switch (command) { 51 switch (command) {
54 static bool warned = false;
55 case CIFS_IOC_CHECKUMOUNT:
56 if (!warned) {
57 warned = true;
58 cERROR(1, "the CIFS_IOC_CHECKMOUNT ioctl will "
59 "be deprecated in 3.7. Please "
60 "migrate away from the use of "
61 "umount.cifs");
62 }
63 cFYI(1, "User unmount attempted");
64 if (cifs_sb->mnt_uid == current_uid())
65 rc = 0;
66 else {
67 rc = -EACCES;
68 cFYI(1, "uids do not match");
69 }
70 break;
71#ifdef CONFIG_CIFS_POSIX 52#ifdef CONFIG_CIFS_POSIX
72 case FS_IOC_GETFLAGS: 53 case FS_IOC_GETFLAGS:
73 if (pSMBFile == NULL) 54 if (pSMBFile == NULL)
@@ -75,8 +56,9 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
75 tcon = tlink_tcon(pSMBFile->tlink); 56 tcon = tlink_tcon(pSMBFile->tlink);
76 caps = le64_to_cpu(tcon->fsUnixInfo.Capability); 57 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
77 if (CIFS_UNIX_EXTATTR_CAP & caps) { 58 if (CIFS_UNIX_EXTATTR_CAP & caps) {
78 rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid, 59 rc = CIFSGetExtAttr(xid, tcon,
79 &ExtAttrBits, &ExtAttrMask); 60 pSMBFile->fid.netfid,
61 &ExtAttrBits, &ExtAttrMask);
80 if (rc == 0) 62 if (rc == 0)
81 rc = put_user(ExtAttrBits & 63 rc = put_user(ExtAttrBits &
82 FS_FL_USER_VISIBLE, 64 FS_FL_USER_VISIBLE,
@@ -94,8 +76,12 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
94 rc = -EFAULT; 76 rc = -EFAULT;
95 break; 77 break;
96 } 78 }
97 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid, 79 /*
98 extAttrBits, &ExtAttrMask);*/ 80 * rc = CIFSGetExtAttr(xid, tcon,
81 * pSMBFile->fid.netfid,
82 * extAttrBits,
83 * &ExtAttrMask);
84 */
99 } 85 }
100 cFYI(1, "set flags not implemented yet"); 86 cFYI(1, "set flags not implemented yet");
101 break; 87 break;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index e6ce3b112875..51dc2fb6e854 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -391,72 +391,86 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
391{ 391{
392 int rc = -EACCES; 392 int rc = -EACCES;
393 unsigned int xid; 393 unsigned int xid;
394 char *fromName = NULL; 394 char *from_name = NULL;
395 char *toName = NULL; 395 char *to_name = NULL;
396 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 396 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
397 struct tcon_link *tlink; 397 struct tcon_link *tlink;
398 struct cifs_tcon *pTcon; 398 struct cifs_tcon *tcon;
399 struct TCP_Server_Info *server;
399 struct cifsInodeInfo *cifsInode; 400 struct cifsInodeInfo *cifsInode;
400 401
401 tlink = cifs_sb_tlink(cifs_sb); 402 tlink = cifs_sb_tlink(cifs_sb);
402 if (IS_ERR(tlink)) 403 if (IS_ERR(tlink))
403 return PTR_ERR(tlink); 404 return PTR_ERR(tlink);
404 pTcon = tlink_tcon(tlink); 405 tcon = tlink_tcon(tlink);
405 406
406 xid = get_xid(); 407 xid = get_xid();
407 408
408 fromName = build_path_from_dentry(old_file); 409 from_name = build_path_from_dentry(old_file);
409 toName = build_path_from_dentry(direntry); 410 to_name = build_path_from_dentry(direntry);
410 if ((fromName == NULL) || (toName == NULL)) { 411 if ((from_name == NULL) || (to_name == NULL)) {
411 rc = -ENOMEM; 412 rc = -ENOMEM;
412 goto cifs_hl_exit; 413 goto cifs_hl_exit;
413 } 414 }
414 415
415 if (pTcon->unix_ext) 416 if (tcon->unix_ext)
416 rc = CIFSUnixCreateHardLink(xid, pTcon, fromName, toName, 417 rc = CIFSUnixCreateHardLink(xid, tcon, from_name, to_name,
417 cifs_sb->local_nls, 418 cifs_sb->local_nls,
418 cifs_sb->mnt_cifs_flags & 419 cifs_sb->mnt_cifs_flags &
419 CIFS_MOUNT_MAP_SPECIAL_CHR); 420 CIFS_MOUNT_MAP_SPECIAL_CHR);
420 else { 421 else {
421 rc = CIFSCreateHardLink(xid, pTcon, fromName, toName, 422 server = tcon->ses->server;
422 cifs_sb->local_nls, 423 if (!server->ops->create_hardlink)
423 cifs_sb->mnt_cifs_flags & 424 return -ENOSYS;
424 CIFS_MOUNT_MAP_SPECIAL_CHR); 425 rc = server->ops->create_hardlink(xid, tcon, from_name, to_name,
426 cifs_sb);
425 if ((rc == -EIO) || (rc == -EINVAL)) 427 if ((rc == -EIO) || (rc == -EINVAL))
426 rc = -EOPNOTSUPP; 428 rc = -EOPNOTSUPP;
427 } 429 }
428 430
429 d_drop(direntry); /* force new lookup from server of target */ 431 d_drop(direntry); /* force new lookup from server of target */
430 432
431 /* if source file is cached (oplocked) revalidate will not go to server 433 /*
432 until the file is closed or oplock broken so update nlinks locally */ 434 * if source file is cached (oplocked) revalidate will not go to server
435 * until the file is closed or oplock broken so update nlinks locally
436 */
433 if (old_file->d_inode) { 437 if (old_file->d_inode) {
434 cifsInode = CIFS_I(old_file->d_inode); 438 cifsInode = CIFS_I(old_file->d_inode);
435 if (rc == 0) { 439 if (rc == 0) {
436 spin_lock(&old_file->d_inode->i_lock); 440 spin_lock(&old_file->d_inode->i_lock);
437 inc_nlink(old_file->d_inode); 441 inc_nlink(old_file->d_inode);
438 spin_unlock(&old_file->d_inode->i_lock); 442 spin_unlock(&old_file->d_inode->i_lock);
439/* BB should we make this contingent on superblock flag NOATIME? */ 443 /*
440/* old_file->d_inode->i_ctime = CURRENT_TIME;*/ 444 * BB should we make this contingent on superblock flag
441 /* parent dir timestamps will update from srv 445 * NOATIME?
442 within a second, would it really be worth it 446 */
443 to set the parent dir cifs inode time to zero 447 /* old_file->d_inode->i_ctime = CURRENT_TIME; */
444 to force revalidate (faster) for it too? */ 448 /*
449 * parent dir timestamps will update from srv within a
450 * second, would it really be worth it to set the parent
451 * dir cifs inode time to zero to force revalidate
452 * (faster) for it too?
453 */
445 } 454 }
446 /* if not oplocked will force revalidate to get info 455 /*
447 on source file from srv */ 456 * if not oplocked will force revalidate to get info on source
457 * file from srv
458 */
448 cifsInode->time = 0; 459 cifsInode->time = 0;
449 460
450 /* Will update parent dir timestamps from srv within a second. 461 /*
451 Would it really be worth it to set the parent dir (cifs 462 * Will update parent dir timestamps from srv within a second.
452 inode) time field to zero to force revalidate on parent 463 * Would it really be worth it to set the parent dir (cifs
453 directory faster ie 464 * inode) time field to zero to force revalidate on parent
454 CIFS_I(inode)->time = 0; */ 465 * directory faster ie
466 *
467 * CIFS_I(inode)->time = 0;
468 */
455 } 469 }
456 470
457cifs_hl_exit: 471cifs_hl_exit:
458 kfree(fromName); 472 kfree(from_name);
459 kfree(toName); 473 kfree(to_name);
460 free_xid(xid); 474 free_xid(xid);
461 cifs_put_tlink(tlink); 475 cifs_put_tlink(tlink);
462 return rc; 476 return rc;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index ce41fee07e5b..3a00c0d0cead 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -466,7 +466,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
466 list_for_each(tmp2, &tcon->openFileList) { 466 list_for_each(tmp2, &tcon->openFileList) {
467 netfile = list_entry(tmp2, struct cifsFileInfo, 467 netfile = list_entry(tmp2, struct cifsFileInfo,
468 tlist); 468 tlist);
469 if (pSMB->Fid != netfile->netfid) 469 if (pSMB->Fid != netfile->fid.netfid)
470 continue; 470 continue;
471 471
472 cFYI(1, "file id match, oplock break"); 472 cFYI(1, "file id match, oplock break");
@@ -579,3 +579,33 @@ backup_cred(struct cifs_sb_info *cifs_sb)
579 579
580 return false; 580 return false;
581} 581}
582
583void
584cifs_del_pending_open(struct cifs_pending_open *open)
585{
586 spin_lock(&cifs_file_list_lock);
587 list_del(&open->olist);
588 spin_unlock(&cifs_file_list_lock);
589}
590
591void
592cifs_add_pending_open_locked(struct cifs_fid *fid, struct tcon_link *tlink,
593 struct cifs_pending_open *open)
594{
595#ifdef CONFIG_CIFS_SMB2
596 memcpy(open->lease_key, fid->lease_key, SMB2_LEASE_KEY_SIZE);
597#endif
598 open->oplock = CIFS_OPLOCK_NO_CHANGE;
599 open->tlink = tlink;
600 fid->pending_open = open;
601 list_add_tail(&open->olist, &tlink_tcon(tlink)->pending_opens);
602}
603
604void
605cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink,
606 struct cifs_pending_open *open)
607{
608 spin_lock(&cifs_file_list_lock);
609 cifs_add_pending_open_locked(fid, tlink, open);
610 spin_unlock(&cifs_file_list_lock);
611}
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 581c225f7f50..d5ce9e26696c 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -110,7 +110,7 @@ static const struct smb_to_posix_error mapping_table_ERRSRV[] = {
110 {ERRnoroom, -ENOSPC}, 110 {ERRnoroom, -ENOSPC},
111 {ERRrmuns, -EUSERS}, 111 {ERRrmuns, -EUSERS},
112 {ERRtimeout, -ETIME}, 112 {ERRtimeout, -ETIME},
113 {ERRnoresource, -ENOBUFS}, 113 {ERRnoresource, -EREMOTEIO},
114 {ERRtoomanyuids, -EUSERS}, 114 {ERRtoomanyuids, -EUSERS},
115 {ERRbaduid, -EACCES}, 115 {ERRbaduid, -EACCES},
116 {ERRusempx, -EIO}, 116 {ERRusempx, -EIO},
@@ -412,7 +412,7 @@ static const struct {
412 from NT_STATUS_INSUFFICIENT_RESOURCES to 412 from NT_STATUS_INSUFFICIENT_RESOURCES to
413 NT_STATUS_INSUFF_SERVER_RESOURCES during the session setup } */ 413 NT_STATUS_INSUFF_SERVER_RESOURCES during the session setup } */
414 { 414 {
415 ERRDOS, ERRnomem, NT_STATUS_INSUFFICIENT_RESOURCES}, { 415 ERRDOS, ERRnoresource, NT_STATUS_INSUFFICIENT_RESOURCES}, {
416 ERRDOS, ERRbadpath, NT_STATUS_DFS_EXIT_PATH_FOUND}, { 416 ERRDOS, ERRbadpath, NT_STATUS_DFS_EXIT_PATH_FOUND}, {
417 ERRDOS, 23, NT_STATUS_DEVICE_DATA_ERROR}, { 417 ERRDOS, 23, NT_STATUS_DEVICE_DATA_ERROR}, {
418 ERRHRD, ERRgeneral, NT_STATUS_DEVICE_NOT_CONNECTED}, { 418 ERRHRD, ERRgeneral, NT_STATUS_DEVICE_NOT_CONNECTED}, {
@@ -682,7 +682,7 @@ static const struct {
682 ERRHRD, ERRgeneral, NT_STATUS_NO_USER_SESSION_KEY}, { 682 ERRHRD, ERRgeneral, NT_STATUS_NO_USER_SESSION_KEY}, {
683 ERRDOS, 59, NT_STATUS_USER_SESSION_DELETED}, { 683 ERRDOS, 59, NT_STATUS_USER_SESSION_DELETED}, {
684 ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_LANG_NOT_FOUND}, { 684 ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_LANG_NOT_FOUND}, {
685 ERRDOS, ERRnomem, NT_STATUS_INSUFF_SERVER_RESOURCES}, { 685 ERRDOS, ERRnoresource, NT_STATUS_INSUFF_SERVER_RESOURCES}, {
686 ERRHRD, ERRgeneral, NT_STATUS_INVALID_BUFFER_SIZE}, { 686 ERRHRD, ERRgeneral, NT_STATUS_INVALID_BUFFER_SIZE}, {
687 ERRHRD, ERRgeneral, NT_STATUS_INVALID_ADDRESS_COMPONENT}, { 687 ERRHRD, ERRgeneral, NT_STATUS_INVALID_ADDRESS_COMPONENT}, {
688 ERRHRD, ERRgeneral, NT_STATUS_INVALID_ADDRESS_WILDCARD}, { 688 ERRHRD, ERRgeneral, NT_STATUS_INVALID_ADDRESS_WILDCARD}, {
@@ -913,8 +913,9 @@ map_smb_to_linux_error(char *buf, bool logErr)
913 * portion, the number of word parameters and the data portion of the message 913 * portion, the number of word parameters and the data portion of the message
914 */ 914 */
915unsigned int 915unsigned int
916smbCalcSize(struct smb_hdr *ptr) 916smbCalcSize(void *buf)
917{ 917{
918 struct smb_hdr *ptr = (struct smb_hdr *)buf;
918 return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + 919 return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
919 2 /* size of the bcc field */ + get_bcc(ptr)); 920 2 /* size of the bcc field */ + get_bcc(ptr));
920} 921}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index d87f82678bc7..f9b5d3d6cf33 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -151,7 +151,7 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
151 } 151 }
152} 152}
153 153
154static void 154void
155cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info, 155cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
156 struct cifs_sb_info *cifs_sb) 156 struct cifs_sb_info *cifs_sb)
157{ 157{
@@ -220,7 +220,8 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
220} 220}
221 */ 221 */
222 222
223static int initiate_cifs_search(const unsigned int xid, struct file *file) 223static int
224initiate_cifs_search(const unsigned int xid, struct file *file)
224{ 225{
225 __u16 search_flags; 226 __u16 search_flags;
226 int rc = 0; 227 int rc = 0;
@@ -229,6 +230,7 @@ static int initiate_cifs_search(const unsigned int xid, struct file *file)
229 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 230 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
230 struct tcon_link *tlink = NULL; 231 struct tcon_link *tlink = NULL;
231 struct cifs_tcon *tcon; 232 struct cifs_tcon *tcon;
233 struct TCP_Server_Info *server;
232 234
233 if (file->private_data == NULL) { 235 if (file->private_data == NULL) {
234 tlink = cifs_sb_tlink(cifs_sb); 236 tlink = cifs_sb_tlink(cifs_sb);
@@ -248,6 +250,13 @@ static int initiate_cifs_search(const unsigned int xid, struct file *file)
248 tcon = tlink_tcon(cifsFile->tlink); 250 tcon = tlink_tcon(cifsFile->tlink);
249 } 251 }
250 252
253 server = tcon->ses->server;
254
255 if (!server->ops->query_dir_first) {
256 rc = -ENOSYS;
257 goto error_exit;
258 }
259
251 cifsFile->invalidHandle = true; 260 cifsFile->invalidHandle = true;
252 cifsFile->srch_inf.endOfSearch = false; 261 cifsFile->srch_inf.endOfSearch = false;
253 262
@@ -278,10 +287,10 @@ ffirst_retry:
278 if (backup_cred(cifs_sb)) 287 if (backup_cred(cifs_sb))
279 search_flags |= CIFS_SEARCH_BACKUP_SEARCH; 288 search_flags |= CIFS_SEARCH_BACKUP_SEARCH;
280 289
281 rc = CIFSFindFirst(xid, tcon, full_path, cifs_sb->local_nls, 290 rc = server->ops->query_dir_first(xid, tcon, full_path, cifs_sb,
282 &cifsFile->netfid, search_flags, &cifsFile->srch_inf, 291 &cifsFile->fid, search_flags,
283 cifs_sb->mnt_cifs_flags & 292 &cifsFile->srch_inf);
284 CIFS_MOUNT_MAP_SPECIAL_CHR, CIFS_DIR_SEP(cifs_sb)); 293
285 if (rc == 0) 294 if (rc == 0)
286 cifsFile->invalidHandle = false; 295 cifsFile->invalidHandle = false;
287 /* BB add following call to handle readdir on new NTFS symlink errors 296 /* BB add following call to handle readdir on new NTFS symlink errors
@@ -501,62 +510,67 @@ static int cifs_save_resume_key(const char *current_entry,
501 return rc; 510 return rc;
502} 511}
503 512
504/* find the corresponding entry in the search */ 513/*
505/* Note that the SMB server returns search entries for . and .. which 514 * Find the corresponding entry in the search. Note that the SMB server returns
506 complicates logic here if we choose to parse for them and we do not 515 * search entries for . and .. which complicates logic here if we choose to
507 assume that they are located in the findfirst return buffer.*/ 516 * parse for them and we do not assume that they are located in the findfirst
508/* We start counting in the buffer with entry 2 and increment for every 517 * return buffer. We start counting in the buffer with entry 2 and increment for
509 entry (do not increment for . or .. entry) */ 518 * every entry (do not increment for . or .. entry).
510static int find_cifs_entry(const unsigned int xid, struct cifs_tcon *pTcon, 519 */
511 struct file *file, char **ppCurrentEntry, int *num_to_ret) 520static int
521find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
522 struct file *file, char **current_entry, int *num_to_ret)
512{ 523{
513 __u16 search_flags; 524 __u16 search_flags;
514 int rc = 0; 525 int rc = 0;
515 int pos_in_buf = 0; 526 int pos_in_buf = 0;
516 loff_t first_entry_in_buffer; 527 loff_t first_entry_in_buffer;
517 loff_t index_to_find = file->f_pos; 528 loff_t index_to_find = file->f_pos;
518 struct cifsFileInfo *cifsFile = file->private_data; 529 struct cifsFileInfo *cfile = file->private_data;
519 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 530 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
531 struct TCP_Server_Info *server = tcon->ses->server;
520 /* check if index in the buffer */ 532 /* check if index in the buffer */
521 533
522 if ((cifsFile == NULL) || (ppCurrentEntry == NULL) || 534 if (!server->ops->query_dir_first || !server->ops->query_dir_next)
523 (num_to_ret == NULL)) 535 return -ENOSYS;
536
537 if ((cfile == NULL) || (current_entry == NULL) || (num_to_ret == NULL))
524 return -ENOENT; 538 return -ENOENT;
525 539
526 *ppCurrentEntry = NULL; 540 *current_entry = NULL;
527 first_entry_in_buffer = 541 first_entry_in_buffer = cfile->srch_inf.index_of_last_entry -
528 cifsFile->srch_inf.index_of_last_entry - 542 cfile->srch_inf.entries_in_buffer;
529 cifsFile->srch_inf.entries_in_buffer;
530 543
531 /* if first entry in buf is zero then is first buffer 544 /*
532 in search response data which means it is likely . and .. 545 * If first entry in buf is zero then is first buffer
533 will be in this buffer, although some servers do not return 546 * in search response data which means it is likely . and ..
534 . and .. for the root of a drive and for those we need 547 * will be in this buffer, although some servers do not return
535 to start two entries earlier */ 548 * . and .. for the root of a drive and for those we need
549 * to start two entries earlier.
550 */
536 551
537 dump_cifs_file_struct(file, "In fce "); 552 dump_cifs_file_struct(file, "In fce ");
538 if (((index_to_find < cifsFile->srch_inf.index_of_last_entry) && 553 if (((index_to_find < cfile->srch_inf.index_of_last_entry) &&
539 is_dir_changed(file)) || 554 is_dir_changed(file)) || (index_to_find < first_entry_in_buffer)) {
540 (index_to_find < first_entry_in_buffer)) {
541 /* close and restart search */ 555 /* close and restart search */
542 cFYI(1, "search backing up - close and restart search"); 556 cFYI(1, "search backing up - close and restart search");
543 spin_lock(&cifs_file_list_lock); 557 spin_lock(&cifs_file_list_lock);
544 if (!cifsFile->srch_inf.endOfSearch && 558 if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) {
545 !cifsFile->invalidHandle) { 559 cfile->invalidHandle = true;
546 cifsFile->invalidHandle = true;
547 spin_unlock(&cifs_file_list_lock); 560 spin_unlock(&cifs_file_list_lock);
548 CIFSFindClose(xid, pTcon, cifsFile->netfid); 561 if (server->ops->close)
562 server->ops->close(xid, tcon, &cfile->fid);
549 } else 563 } else
550 spin_unlock(&cifs_file_list_lock); 564 spin_unlock(&cifs_file_list_lock);
551 if (cifsFile->srch_inf.ntwrk_buf_start) { 565 if (cfile->srch_inf.ntwrk_buf_start) {
552 cFYI(1, "freeing SMB ff cache buf on search rewind"); 566 cFYI(1, "freeing SMB ff cache buf on search rewind");
553 if (cifsFile->srch_inf.smallBuf) 567 if (cfile->srch_inf.smallBuf)
554 cifs_small_buf_release(cifsFile->srch_inf. 568 cifs_small_buf_release(cfile->srch_inf.
555 ntwrk_buf_start); 569 ntwrk_buf_start);
556 else 570 else
557 cifs_buf_release(cifsFile->srch_inf. 571 cifs_buf_release(cfile->srch_inf.
558 ntwrk_buf_start); 572 ntwrk_buf_start);
559 cifsFile->srch_inf.ntwrk_buf_start = NULL; 573 cfile->srch_inf.ntwrk_buf_start = NULL;
560 } 574 }
561 rc = initiate_cifs_search(xid, file); 575 rc = initiate_cifs_search(xid, file);
562 if (rc) { 576 if (rc) {
@@ -565,65 +579,64 @@ static int find_cifs_entry(const unsigned int xid, struct cifs_tcon *pTcon,
565 return rc; 579 return rc;
566 } 580 }
567 /* FindFirst/Next set last_entry to NULL on malformed reply */ 581 /* FindFirst/Next set last_entry to NULL on malformed reply */
568 if (cifsFile->srch_inf.last_entry) 582 if (cfile->srch_inf.last_entry)
569 cifs_save_resume_key(cifsFile->srch_inf.last_entry, 583 cifs_save_resume_key(cfile->srch_inf.last_entry, cfile);
570 cifsFile);
571 } 584 }
572 585
573 search_flags = CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME; 586 search_flags = CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME;
574 if (backup_cred(cifs_sb)) 587 if (backup_cred(cifs_sb))
575 search_flags |= CIFS_SEARCH_BACKUP_SEARCH; 588 search_flags |= CIFS_SEARCH_BACKUP_SEARCH;
576 589
577 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && 590 while ((index_to_find >= cfile->srch_inf.index_of_last_entry) &&
578 (rc == 0) && !cifsFile->srch_inf.endOfSearch) { 591 (rc == 0) && !cfile->srch_inf.endOfSearch) {
579 cFYI(1, "calling findnext2"); 592 cFYI(1, "calling findnext2");
580 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, search_flags, 593 rc = server->ops->query_dir_next(xid, tcon, &cfile->fid,
581 &cifsFile->srch_inf); 594 search_flags,
595 &cfile->srch_inf);
582 /* FindFirst/Next set last_entry to NULL on malformed reply */ 596 /* FindFirst/Next set last_entry to NULL on malformed reply */
583 if (cifsFile->srch_inf.last_entry) 597 if (cfile->srch_inf.last_entry)
584 cifs_save_resume_key(cifsFile->srch_inf.last_entry, 598 cifs_save_resume_key(cfile->srch_inf.last_entry, cfile);
585 cifsFile);
586 if (rc) 599 if (rc)
587 return -ENOENT; 600 return -ENOENT;
588 } 601 }
589 if (index_to_find < cifsFile->srch_inf.index_of_last_entry) { 602 if (index_to_find < cfile->srch_inf.index_of_last_entry) {
590 /* we found the buffer that contains the entry */ 603 /* we found the buffer that contains the entry */
591 /* scan and find it */ 604 /* scan and find it */
592 int i; 605 int i;
593 char *current_entry; 606 char *cur_ent;
594 char *end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + 607 char *end_of_smb = cfile->srch_inf.ntwrk_buf_start +
595 smbCalcSize((struct smb_hdr *) 608 server->ops->calc_smb_size(
596 cifsFile->srch_inf.ntwrk_buf_start); 609 cfile->srch_inf.ntwrk_buf_start);
597 610
598 current_entry = cifsFile->srch_inf.srch_entries_start; 611 cur_ent = cfile->srch_inf.srch_entries_start;
599 first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry 612 first_entry_in_buffer = cfile->srch_inf.index_of_last_entry
600 - cifsFile->srch_inf.entries_in_buffer; 613 - cfile->srch_inf.entries_in_buffer;
601 pos_in_buf = index_to_find - first_entry_in_buffer; 614 pos_in_buf = index_to_find - first_entry_in_buffer;
602 cFYI(1, "found entry - pos_in_buf %d", pos_in_buf); 615 cFYI(1, "found entry - pos_in_buf %d", pos_in_buf);
603 616
604 for (i = 0; (i < (pos_in_buf)) && (current_entry != NULL); i++) { 617 for (i = 0; (i < (pos_in_buf)) && (cur_ent != NULL); i++) {
605 /* go entry by entry figuring out which is first */ 618 /* go entry by entry figuring out which is first */
606 current_entry = nxt_dir_entry(current_entry, end_of_smb, 619 cur_ent = nxt_dir_entry(cur_ent, end_of_smb,
607 cifsFile->srch_inf.info_level); 620 cfile->srch_inf.info_level);
608 } 621 }
609 if ((current_entry == NULL) && (i < pos_in_buf)) { 622 if ((cur_ent == NULL) && (i < pos_in_buf)) {
610 /* BB fixme - check if we should flag this error */ 623 /* BB fixme - check if we should flag this error */
611 cERROR(1, "reached end of buf searching for pos in buf" 624 cERROR(1, "reached end of buf searching for pos in buf"
612 " %d index to find %lld rc %d", 625 " %d index to find %lld rc %d", pos_in_buf,
613 pos_in_buf, index_to_find, rc); 626 index_to_find, rc);
614 } 627 }
615 rc = 0; 628 rc = 0;
616 *ppCurrentEntry = current_entry; 629 *current_entry = cur_ent;
617 } else { 630 } else {
618 cFYI(1, "index not in buffer - could not findnext into it"); 631 cFYI(1, "index not in buffer - could not findnext into it");
619 return 0; 632 return 0;
620 } 633 }
621 634
622 if (pos_in_buf >= cifsFile->srch_inf.entries_in_buffer) { 635 if (pos_in_buf >= cfile->srch_inf.entries_in_buffer) {
623 cFYI(1, "can not return entries pos_in_buf beyond last"); 636 cFYI(1, "can not return entries pos_in_buf beyond last");
624 *num_to_ret = 0; 637 *num_to_ret = 0;
625 } else 638 } else
626 *num_to_ret = cifsFile->srch_inf.entries_in_buffer - pos_in_buf; 639 *num_to_ret = cfile->srch_inf.entries_in_buffer - pos_in_buf;
627 640
628 return rc; 641 return rc;
629} 642}
@@ -723,7 +736,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
723 int rc = 0; 736 int rc = 0;
724 unsigned int xid; 737 unsigned int xid;
725 int i; 738 int i;
726 struct cifs_tcon *pTcon; 739 struct cifs_tcon *tcon;
727 struct cifsFileInfo *cifsFile = NULL; 740 struct cifsFileInfo *cifsFile = NULL;
728 char *current_entry; 741 char *current_entry;
729 int num_to_fill = 0; 742 int num_to_fill = 0;
@@ -781,12 +794,12 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
781 } 794 }
782 } /* else { 795 } /* else {
783 cifsFile->invalidHandle = true; 796 cifsFile->invalidHandle = true;
784 CIFSFindClose(xid, pTcon, cifsFile->netfid); 797 tcon->ses->server->close(xid, tcon, &cifsFile->fid);
785 } */ 798 } */
786 799
787 pTcon = tlink_tcon(cifsFile->tlink); 800 tcon = tlink_tcon(cifsFile->tlink);
788 rc = find_cifs_entry(xid, pTcon, file, 801 rc = find_cifs_entry(xid, tcon, file, &current_entry,
789 &current_entry, &num_to_fill); 802 &num_to_fill);
790 if (rc) { 803 if (rc) {
791 cFYI(1, "fce error %d", rc); 804 cFYI(1, "fce error %d", rc);
792 goto rddir2_exit; 805 goto rddir2_exit;
@@ -798,7 +811,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
798 } 811 }
799 cFYI(1, "loop through %d times filling dir for net buf %p", 812 cFYI(1, "loop through %d times filling dir for net buf %p",
800 num_to_fill, cifsFile->srch_inf.ntwrk_buf_start); 813 num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
801 max_len = smbCalcSize((struct smb_hdr *) 814 max_len = tcon->ses->server->ops->calc_smb_size(
802 cifsFile->srch_inf.ntwrk_buf_start); 815 cifsFile->srch_inf.ntwrk_buf_start);
803 end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; 816 end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
804 817
@@ -815,10 +828,12 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
815 num_to_fill, i); 828 num_to_fill, i);
816 break; 829 break;
817 } 830 }
818 /* if buggy server returns . and .. late do 831 /*
819 we want to check for that here? */ 832 * if buggy server returns . and .. late do we want to
820 rc = cifs_filldir(current_entry, file, 833 * check for that here?
821 filldir, direntry, tmp_buf, max_len); 834 */
835 rc = cifs_filldir(current_entry, file, filldir,
836 direntry, tmp_buf, max_len);
822 if (rc == -EOVERFLOW) { 837 if (rc == -EOVERFLOW) {
823 rc = 0; 838 rc = 0;
824 break; 839 break;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 382c06d01b38..76809f4d3428 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -876,7 +876,8 @@ ssetup_ntlmssp_authenticate:
876 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; 876 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
877 smb_buf = (struct smb_hdr *)iov[0].iov_base; 877 smb_buf = (struct smb_hdr *)iov[0].iov_base;
878 878
879 if ((type == RawNTLMSSP) && (smb_buf->Status.CifsError == 879 if ((type == RawNTLMSSP) && (resp_buf_type != CIFS_NO_BUFFER) &&
880 (smb_buf->Status.CifsError ==
880 cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) { 881 cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) {
881 if (phase != NtLmNegotiate) { 882 if (phase != NtLmNegotiate) {
882 cERROR(1, "Unexpected more processing error"); 883 cERROR(1, "Unexpected more processing error");
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 3129ac74b819..56cc4be87807 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -17,6 +17,8 @@
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */ 18 */
19 19
20#include <linux/pagemap.h>
21#include <linux/vfs.h>
20#include "cifsglob.h" 22#include "cifsglob.h"
21#include "cifsproto.h" 23#include "cifsproto.h"
22#include "cifs_debug.h" 24#include "cifs_debug.h"
@@ -63,7 +65,7 @@ send_nt_cancel(struct TCP_Server_Info *server, void *buf,
63static bool 65static bool
64cifs_compare_fids(struct cifsFileInfo *ob1, struct cifsFileInfo *ob2) 66cifs_compare_fids(struct cifsFileInfo *ob1, struct cifsFileInfo *ob2)
65{ 67{
66 return ob1->netfid == ob2->netfid; 68 return ob1->fid.netfid == ob2->fid.netfid;
67} 69}
68 70
69static unsigned int 71static unsigned int
@@ -410,6 +412,83 @@ cifs_negotiate(const unsigned int xid, struct cifs_ses *ses)
410 return rc; 412 return rc;
411} 413}
412 414
415static unsigned int
416cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
417{
418 __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
419 struct TCP_Server_Info *server = tcon->ses->server;
420 unsigned int wsize;
421
422 /* start with specified wsize, or default */
423 if (volume_info->wsize)
424 wsize = volume_info->wsize;
425 else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
426 wsize = CIFS_DEFAULT_IOSIZE;
427 else
428 wsize = CIFS_DEFAULT_NON_POSIX_WSIZE;
429
430 /* can server support 24-bit write sizes? (via UNIX extensions) */
431 if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
432 wsize = min_t(unsigned int, wsize, CIFS_MAX_RFC1002_WSIZE);
433
434 /*
435 * no CAP_LARGE_WRITE_X or is signing enabled without CAP_UNIX set?
436 * Limit it to max buffer offered by the server, minus the size of the
437 * WRITEX header, not including the 4 byte RFC1001 length.
438 */
439 if (!(server->capabilities & CAP_LARGE_WRITE_X) ||
440 (!(server->capabilities & CAP_UNIX) &&
441 (server->sec_mode & (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED))))
442 wsize = min_t(unsigned int, wsize,
443 server->maxBuf - sizeof(WRITE_REQ) + 4);
444
445 /* hard limit of CIFS_MAX_WSIZE */
446 wsize = min_t(unsigned int, wsize, CIFS_MAX_WSIZE);
447
448 return wsize;
449}
450
451static unsigned int
452cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
453{
454 __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
455 struct TCP_Server_Info *server = tcon->ses->server;
456 unsigned int rsize, defsize;
457
458 /*
459 * Set default value...
460 *
461 * HACK alert! Ancient servers have very small buffers. Even though
462 * MS-CIFS indicates that servers are only limited by the client's
463 * bufsize for reads, testing against win98se shows that it throws
464 * INVALID_PARAMETER errors if you try to request too large a read.
465 * OS/2 just sends back short reads.
466 *
467 * If the server doesn't advertise CAP_LARGE_READ_X, then assume that
468 * it can't handle a read request larger than its MaxBufferSize either.
469 */
470 if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_READ_CAP))
471 defsize = CIFS_DEFAULT_IOSIZE;
472 else if (server->capabilities & CAP_LARGE_READ_X)
473 defsize = CIFS_DEFAULT_NON_POSIX_RSIZE;
474 else
475 defsize = server->maxBuf - sizeof(READ_RSP);
476
477 rsize = volume_info->rsize ? volume_info->rsize : defsize;
478
479 /*
480 * no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to
481 * the client's MaxBufferSize.
482 */
483 if (!(server->capabilities & CAP_LARGE_READ_X))
484 rsize = min_t(unsigned int, CIFSMaxBufSize, rsize);
485
486 /* hard limit of CIFS_MAX_RSIZE */
487 rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE);
488
489 return rsize;
490}
491
413static void 492static void
414cifs_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) 493cifs_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
415{ 494{
@@ -489,6 +568,13 @@ cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon,
489 CIFS_MOUNT_MAP_SPECIAL_CHR); 568 CIFS_MOUNT_MAP_SPECIAL_CHR);
490} 569}
491 570
571static int
572cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
573 struct cifs_fid *fid, FILE_ALL_INFO *data)
574{
575 return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data);
576}
577
492static char * 578static char *
493cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, 579cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
494 struct cifs_tcon *tcon) 580 struct cifs_tcon *tcon)
@@ -607,6 +693,219 @@ cifs_mkdir_setinfo(struct inode *inode, const char *full_path,
607 cifsInode->cifsAttrs = dosattrs; 693 cifsInode->cifsAttrs = dosattrs;
608} 694}
609 695
696static int
697cifs_open_file(const unsigned int xid, struct cifs_tcon *tcon, const char *path,
698 int disposition, int desired_access, int create_options,
699 struct cifs_fid *fid, __u32 *oplock, FILE_ALL_INFO *buf,
700 struct cifs_sb_info *cifs_sb)
701{
702 if (!(tcon->ses->capabilities & CAP_NT_SMBS))
703 return SMBLegacyOpen(xid, tcon, path, disposition,
704 desired_access, create_options,
705 &fid->netfid, oplock, buf,
706 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
707 & CIFS_MOUNT_MAP_SPECIAL_CHR);
708 return CIFSSMBOpen(xid, tcon, path, disposition, desired_access,
709 create_options, &fid->netfid, oplock, buf,
710 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
711 CIFS_MOUNT_MAP_SPECIAL_CHR);
712}
713
714static void
715cifs_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock)
716{
717 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
718 cfile->fid.netfid = fid->netfid;
719 cifs_set_oplock_level(cinode, oplock);
720 cinode->can_cache_brlcks = cinode->clientCanCacheAll;
721}
722
723static void
724cifs_close_file(const unsigned int xid, struct cifs_tcon *tcon,
725 struct cifs_fid *fid)
726{
727 CIFSSMBClose(xid, tcon, fid->netfid);
728}
729
730static int
731cifs_flush_file(const unsigned int xid, struct cifs_tcon *tcon,
732 struct cifs_fid *fid)
733{
734 return CIFSSMBFlush(xid, tcon, fid->netfid);
735}
736
737static int
738cifs_sync_read(const unsigned int xid, struct cifsFileInfo *cfile,
739 struct cifs_io_parms *parms, unsigned int *bytes_read,
740 char **buf, int *buf_type)
741{
742 parms->netfid = cfile->fid.netfid;
743 return CIFSSMBRead(xid, parms, bytes_read, buf, buf_type);
744}
745
746static int
747cifs_sync_write(const unsigned int xid, struct cifsFileInfo *cfile,
748 struct cifs_io_parms *parms, unsigned int *written,
749 struct kvec *iov, unsigned long nr_segs)
750{
751
752 parms->netfid = cfile->fid.netfid;
753 return CIFSSMBWrite2(xid, parms, written, iov, nr_segs);
754}
755
756static int
757smb_set_file_info(struct inode *inode, const char *full_path,
758 FILE_BASIC_INFO *buf, const unsigned int xid)
759{
760 int oplock = 0;
761 int rc;
762 __u16 netfid;
763 __u32 netpid;
764 struct cifsFileInfo *open_file;
765 struct cifsInodeInfo *cinode = CIFS_I(inode);
766 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
767 struct tcon_link *tlink = NULL;
768 struct cifs_tcon *tcon;
769 FILE_BASIC_INFO info_buf;
770
771 /* if the file is already open for write, just use that fileid */
772 open_file = find_writable_file(cinode, true);
773 if (open_file) {
774 netfid = open_file->fid.netfid;
775 netpid = open_file->pid;
776 tcon = tlink_tcon(open_file->tlink);
777 goto set_via_filehandle;
778 }
779
780 tlink = cifs_sb_tlink(cifs_sb);
781 if (IS_ERR(tlink)) {
782 rc = PTR_ERR(tlink);
783 tlink = NULL;
784 goto out;
785 }
786 tcon = tlink_tcon(tlink);
787
788 /*
789 * NT4 apparently returns success on this call, but it doesn't really
790 * work.
791 */
792 if (!(tcon->ses->flags & CIFS_SES_NT4)) {
793 rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf,
794 cifs_sb->local_nls,
795 cifs_sb->mnt_cifs_flags &
796 CIFS_MOUNT_MAP_SPECIAL_CHR);
797 if (rc == 0) {
798 cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
799 goto out;
800 } else if (rc != -EOPNOTSUPP && rc != -EINVAL)
801 goto out;
802 }
803
804 cFYI(1, "calling SetFileInfo since SetPathInfo for times not supported "
805 "by this server");
806 rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
807 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR,
808 &netfid, &oplock, NULL, cifs_sb->local_nls,
809 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
810
811 if (rc != 0) {
812 if (rc == -EIO)
813 rc = -EINVAL;
814 goto out;
815 }
816
817 netpid = current->tgid;
818
819set_via_filehandle:
820 rc = CIFSSMBSetFileInfo(xid, tcon, &info_buf, netfid, netpid);
821 if (!rc)
822 cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
823
824 if (open_file == NULL)
825 CIFSSMBClose(xid, tcon, netfid);
826 else
827 cifsFileInfo_put(open_file);
828out:
829 if (tlink != NULL)
830 cifs_put_tlink(tlink);
831 return rc;
832}
833
834static int
835cifs_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
836 const char *path, struct cifs_sb_info *cifs_sb,
837 struct cifs_fid *fid, __u16 search_flags,
838 struct cifs_search_info *srch_inf)
839{
840 return CIFSFindFirst(xid, tcon, path, cifs_sb,
841 &fid->netfid, search_flags, srch_inf, true);
842}
843
844static int
845cifs_query_dir_next(const unsigned int xid, struct cifs_tcon *tcon,
846 struct cifs_fid *fid, __u16 search_flags,
847 struct cifs_search_info *srch_inf)
848{
849 return CIFSFindNext(xid, tcon, fid->netfid, search_flags, srch_inf);
850}
851
852static int
853cifs_close_dir(const unsigned int xid, struct cifs_tcon *tcon,
854 struct cifs_fid *fid)
855{
856 return CIFSFindClose(xid, tcon, fid->netfid);
857}
858
859static int
860cifs_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid,
861 struct cifsInodeInfo *cinode)
862{
863 return CIFSSMBLock(0, tcon, fid->netfid, current->tgid, 0, 0, 0, 0,
864 LOCKING_ANDX_OPLOCK_RELEASE, false,
865 cinode->clientCanCacheRead ? 1 : 0);
866}
867
868static int
869cifs_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
870 struct kstatfs *buf)
871{
872 int rc = -EOPNOTSUPP;
873
874 buf->f_type = CIFS_MAGIC_NUMBER;
875
876 /*
877 * We could add a second check for a QFS Unix capability bit
878 */
879 if ((tcon->ses->capabilities & CAP_UNIX) &&
880 (CIFS_POSIX_EXTENSIONS & le64_to_cpu(tcon->fsUnixInfo.Capability)))
881 rc = CIFSSMBQFSPosixInfo(xid, tcon, buf);
882
883 /*
884 * Only need to call the old QFSInfo if failed on newer one,
885 * e.g. by OS/2.
886 **/
887 if (rc && (tcon->ses->capabilities & CAP_NT_SMBS))
888 rc = CIFSSMBQFSInfo(xid, tcon, buf);
889
890 /*
891 * Some old Windows servers also do not support level 103, retry with
892 * older level one if old server failed the previous call or we
893 * bypassed it because we detected that this was an older LANMAN sess
894 */
895 if (rc)
896 rc = SMBOldQFSInfo(xid, tcon, buf);
897 return rc;
898}
899
900static int
901cifs_mand_lock(const unsigned int xid, struct cifsFileInfo *cfile, __u64 offset,
902 __u64 length, __u32 type, int lock, int unlock, bool wait)
903{
904 return CIFSSMBLock(xid, tlink_tcon(cfile->tlink), cfile->fid.netfid,
905 current->tgid, length, offset, unlock, lock,
906 (__u8)type, wait, 0);
907}
908
610struct smb_version_operations smb1_operations = { 909struct smb_version_operations smb1_operations = {
611 .send_cancel = send_nt_cancel, 910 .send_cancel = send_nt_cancel,
612 .compare_fids = cifs_compare_fids, 911 .compare_fids = cifs_compare_fids,
@@ -630,6 +929,8 @@ struct smb_version_operations smb1_operations = {
630 .check_trans2 = cifs_check_trans2, 929 .check_trans2 = cifs_check_trans2,
631 .need_neg = cifs_need_neg, 930 .need_neg = cifs_need_neg,
632 .negotiate = cifs_negotiate, 931 .negotiate = cifs_negotiate,
932 .negotiate_wsize = cifs_negotiate_wsize,
933 .negotiate_rsize = cifs_negotiate_rsize,
633 .sess_setup = CIFS_SessSetup, 934 .sess_setup = CIFS_SessSetup,
634 .logoff = CIFSSMBLogoff, 935 .logoff = CIFSSMBLogoff,
635 .tree_connect = CIFSTCon, 936 .tree_connect = CIFSTCon,
@@ -638,12 +939,37 @@ struct smb_version_operations smb1_operations = {
638 .qfs_tcon = cifs_qfs_tcon, 939 .qfs_tcon = cifs_qfs_tcon,
639 .is_path_accessible = cifs_is_path_accessible, 940 .is_path_accessible = cifs_is_path_accessible,
640 .query_path_info = cifs_query_path_info, 941 .query_path_info = cifs_query_path_info,
942 .query_file_info = cifs_query_file_info,
641 .get_srv_inum = cifs_get_srv_inum, 943 .get_srv_inum = cifs_get_srv_inum,
944 .set_path_size = CIFSSMBSetEOF,
945 .set_file_size = CIFSSMBSetFileSize,
946 .set_file_info = smb_set_file_info,
642 .build_path_to_root = cifs_build_path_to_root, 947 .build_path_to_root = cifs_build_path_to_root,
643 .echo = CIFSSMBEcho, 948 .echo = CIFSSMBEcho,
644 .mkdir = CIFSSMBMkDir, 949 .mkdir = CIFSSMBMkDir,
645 .mkdir_setinfo = cifs_mkdir_setinfo, 950 .mkdir_setinfo = cifs_mkdir_setinfo,
646 .rmdir = CIFSSMBRmDir, 951 .rmdir = CIFSSMBRmDir,
952 .unlink = CIFSSMBDelFile,
953 .rename_pending_delete = cifs_rename_pending_delete,
954 .rename = CIFSSMBRename,
955 .create_hardlink = CIFSCreateHardLink,
956 .open = cifs_open_file,
957 .set_fid = cifs_set_fid,
958 .close = cifs_close_file,
959 .flush = cifs_flush_file,
960 .async_readv = cifs_async_readv,
961 .async_writev = cifs_async_writev,
962 .sync_read = cifs_sync_read,
963 .sync_write = cifs_sync_write,
964 .query_dir_first = cifs_query_dir_first,
965 .query_dir_next = cifs_query_dir_next,
966 .close_dir = cifs_close_dir,
967 .calc_smb_size = smbCalcSize,
968 .oplock_response = cifs_oplock_response,
969 .queryfs = cifs_queryfs,
970 .mand_lock = cifs_mand_lock,
971 .mand_unlock_range = cifs_unlock_range,
972 .push_mand_locks = cifs_push_mandatory_locks,
647}; 973};
648 974
649struct smb_version_values smb1_values = { 975struct smb_version_values smb1_values = {
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
new file mode 100644
index 000000000000..a93eec30a50d
--- /dev/null
+++ b/fs/cifs/smb2file.c
@@ -0,0 +1,302 @@
1/*
2 * fs/cifs/smb2file.c
3 *
4 * Copyright (C) International Business Machines Corp., 2002, 2011
5 * Author(s): Steve French (sfrench@us.ibm.com),
6 * Pavel Shilovsky ((pshilovsky@samba.org) 2012
7 *
8 * This library is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU Lesser General Public License as published
10 * by the Free Software Foundation; either version 2.1 of the License, or
11 * (at your option) any later version.
12 *
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
16 * the GNU Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public License
19 * along with this library; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22#include <linux/fs.h>
23#include <linux/stat.h>
24#include <linux/slab.h>
25#include <linux/pagemap.h>
26#include <asm/div64.h>
27#include "cifsfs.h"
28#include "cifspdu.h"
29#include "cifsglob.h"
30#include "cifsproto.h"
31#include "cifs_debug.h"
32#include "cifs_fs_sb.h"
33#include "cifs_unicode.h"
34#include "fscache.h"
35#include "smb2proto.h"
36
37void
38smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
39{
40 oplock &= 0xFF;
41 if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
42 return;
43 if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) {
44 cinode->clientCanCacheAll = true;
45 cinode->clientCanCacheRead = true;
46 cFYI(1, "Exclusive Oplock granted on inode %p",
47 &cinode->vfs_inode);
48 } else if (oplock == SMB2_OPLOCK_LEVEL_II) {
49 cinode->clientCanCacheAll = false;
50 cinode->clientCanCacheRead = true;
51 cFYI(1, "Level II Oplock granted on inode %p",
52 &cinode->vfs_inode);
53 } else {
54 cinode->clientCanCacheAll = false;
55 cinode->clientCanCacheRead = false;
56 }
57}
58
59int
60smb2_open_file(const unsigned int xid, struct cifs_tcon *tcon, const char *path,
61 int disposition, int desired_access, int create_options,
62 struct cifs_fid *fid, __u32 *oplock, FILE_ALL_INFO *buf,
63 struct cifs_sb_info *cifs_sb)
64{
65 int rc;
66 __le16 *smb2_path;
67 struct smb2_file_all_info *smb2_data = NULL;
68 __u8 smb2_oplock[17];
69
70 smb2_path = cifs_convert_path_to_utf16(path, cifs_sb);
71 if (smb2_path == NULL) {
72 rc = -ENOMEM;
73 goto out;
74 }
75
76 smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2,
77 GFP_KERNEL);
78 if (smb2_data == NULL) {
79 rc = -ENOMEM;
80 goto out;
81 }
82
83 desired_access |= FILE_READ_ATTRIBUTES;
84 *smb2_oplock = SMB2_OPLOCK_LEVEL_EXCLUSIVE;
85
86 if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING)
87 memcpy(smb2_oplock + 1, fid->lease_key, SMB2_LEASE_KEY_SIZE);
88
89 rc = SMB2_open(xid, tcon, smb2_path, &fid->persistent_fid,
90 &fid->volatile_fid, desired_access, disposition,
91 0, 0, smb2_oplock, smb2_data);
92 if (rc)
93 goto out;
94
95 if (buf) {
96 /* open response does not have IndexNumber field - get it */
97 rc = SMB2_get_srv_num(xid, tcon, fid->persistent_fid,
98 fid->volatile_fid,
99 &smb2_data->IndexNumber);
100 if (rc) {
101 /* let get_inode_info disable server inode numbers */
102 smb2_data->IndexNumber = 0;
103 rc = 0;
104 }
105 move_smb2_info_to_cifs(buf, smb2_data);
106 }
107
108 *oplock = *smb2_oplock;
109out:
110 kfree(smb2_data);
111 kfree(smb2_path);
112 return rc;
113}
114
115int
116smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
117 const unsigned int xid)
118{
119 int rc = 0, stored_rc;
120 unsigned int max_num, num = 0, max_buf;
121 struct smb2_lock_element *buf, *cur;
122 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
123 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
124 struct cifsLockInfo *li, *tmp;
125 __u64 length = 1 + flock->fl_end - flock->fl_start;
126 struct list_head tmp_llist;
127
128 INIT_LIST_HEAD(&tmp_llist);
129
130 /*
131 * Accessing maxBuf is racy with cifs_reconnect - need to store value
132 * and check it for zero before using.
133 */
134 max_buf = tcon->ses->server->maxBuf;
135 if (!max_buf)
136 return -EINVAL;
137
138 max_num = max_buf / sizeof(struct smb2_lock_element);
139 buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL);
140 if (!buf)
141 return -ENOMEM;
142
143 cur = buf;
144
145 down_write(&cinode->lock_sem);
146 list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) {
147 if (flock->fl_start > li->offset ||
148 (flock->fl_start + length) <
149 (li->offset + li->length))
150 continue;
151 if (current->tgid != li->pid)
152 continue;
153 if (cinode->can_cache_brlcks) {
154 /*
155 * We can cache brlock requests - simply remove a lock
156 * from the file's list.
157 */
158 list_del(&li->llist);
159 cifs_del_lock_waiters(li);
160 kfree(li);
161 continue;
162 }
163 cur->Length = cpu_to_le64(li->length);
164 cur->Offset = cpu_to_le64(li->offset);
165 cur->Flags = cpu_to_le32(SMB2_LOCKFLAG_UNLOCK);
166 /*
167 * We need to save a lock here to let us add it again to the
168 * file's list if the unlock range request fails on the server.
169 */
170 list_move(&li->llist, &tmp_llist);
171 if (++num == max_num) {
172 stored_rc = smb2_lockv(xid, tcon,
173 cfile->fid.persistent_fid,
174 cfile->fid.volatile_fid,
175 current->tgid, num, buf);
176 if (stored_rc) {
177 /*
178 * We failed on the unlock range request - add
179 * all locks from the tmp list to the head of
180 * the file's list.
181 */
182 cifs_move_llist(&tmp_llist,
183 &cfile->llist->locks);
184 rc = stored_rc;
185 } else
186 /*
187 * The unlock range request succeed - free the
188 * tmp list.
189 */
190 cifs_free_llist(&tmp_llist);
191 cur = buf;
192 num = 0;
193 } else
194 cur++;
195 }
196 if (num) {
197 stored_rc = smb2_lockv(xid, tcon, cfile->fid.persistent_fid,
198 cfile->fid.volatile_fid, current->tgid,
199 num, buf);
200 if (stored_rc) {
201 cifs_move_llist(&tmp_llist, &cfile->llist->locks);
202 rc = stored_rc;
203 } else
204 cifs_free_llist(&tmp_llist);
205 }
206 up_write(&cinode->lock_sem);
207
208 kfree(buf);
209 return rc;
210}
211
212static int
213smb2_push_mand_fdlocks(struct cifs_fid_locks *fdlocks, const unsigned int xid,
214 struct smb2_lock_element *buf, unsigned int max_num)
215{
216 int rc = 0, stored_rc;
217 struct cifsFileInfo *cfile = fdlocks->cfile;
218 struct cifsLockInfo *li;
219 unsigned int num = 0;
220 struct smb2_lock_element *cur = buf;
221 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
222
223 list_for_each_entry(li, &fdlocks->locks, llist) {
224 cur->Length = cpu_to_le64(li->length);
225 cur->Offset = cpu_to_le64(li->offset);
226 cur->Flags = cpu_to_le32(li->type |
227 SMB2_LOCKFLAG_FAIL_IMMEDIATELY);
228 if (++num == max_num) {
229 stored_rc = smb2_lockv(xid, tcon,
230 cfile->fid.persistent_fid,
231 cfile->fid.volatile_fid,
232 current->tgid, num, buf);
233 if (stored_rc)
234 rc = stored_rc;
235 cur = buf;
236 num = 0;
237 } else
238 cur++;
239 }
240 if (num) {
241 stored_rc = smb2_lockv(xid, tcon,
242 cfile->fid.persistent_fid,
243 cfile->fid.volatile_fid,
244 current->tgid, num, buf);
245 if (stored_rc)
246 rc = stored_rc;
247 }
248
249 return rc;
250}
251
252int
253smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
254{
255 int rc = 0, stored_rc;
256 unsigned int xid;
257 unsigned int max_num, max_buf;
258 struct smb2_lock_element *buf;
259 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
260 struct cifs_fid_locks *fdlocks;
261
262 xid = get_xid();
263 /* we are going to update can_cache_brlcks here - need a write access */
264 down_write(&cinode->lock_sem);
265 if (!cinode->can_cache_brlcks) {
266 up_write(&cinode->lock_sem);
267 free_xid(xid);
268 return rc;
269 }
270
271 /*
272 * Accessing maxBuf is racy with cifs_reconnect - need to store value
273 * and check it for zero before using.
274 */
275 max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf;
276 if (!max_buf) {
277 up_write(&cinode->lock_sem);
278 free_xid(xid);
279 return -EINVAL;
280 }
281
282 max_num = max_buf / sizeof(struct smb2_lock_element);
283 buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL);
284 if (!buf) {
285 up_write(&cinode->lock_sem);
286 free_xid(xid);
287 return -ENOMEM;
288 }
289
290 list_for_each_entry(fdlocks, &cinode->llist, llist) {
291 stored_rc = smb2_push_mand_fdlocks(fdlocks, xid, buf, max_num);
292 if (stored_rc)
293 rc = stored_rc;
294 }
295
296 cinode->can_cache_brlcks = false;
297 kfree(buf);
298
299 up_write(&cinode->lock_sem);
300 free_xid(xid);
301 return rc;
302}
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index 33c1d89090c0..7c0e2143e775 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -23,6 +23,8 @@
23#ifndef _SMB2_GLOB_H 23#ifndef _SMB2_GLOB_H
24#define _SMB2_GLOB_H 24#define _SMB2_GLOB_H
25 25
26#define SMB2_MAGIC_NUMBER 0xFE534D42
27
26/* 28/*
27 ***************************************************************** 29 *****************************************************************
28 * Constants go here 30 * Constants go here
@@ -40,5 +42,17 @@
40#define SMB2_OP_MKDIR 5 42#define SMB2_OP_MKDIR 5
41#define SMB2_OP_RENAME 6 43#define SMB2_OP_RENAME 6
42#define SMB2_OP_DELETE 7 44#define SMB2_OP_DELETE 7
45#define SMB2_OP_HARDLINK 8
46#define SMB2_OP_SET_EOF 9
47
48/* Used when constructing chained read requests. */
49#define CHAINED_REQUEST 1
50#define START_OF_CHAIN 2
51#define END_OF_CHAIN 4
52#define RELATED_REQUEST 8
53
54#define SMB2_SIGNATURE_SIZE (16)
55#define SMB2_NTLMV2_SESSKEY_SIZE (16)
56#define SMB2_HMACSHA256_SIZE (32)
43 57
44#endif /* _SMB2_GLOB_H */ 58#endif /* _SMB2_GLOB_H */
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 2aa5cb08c526..706482452df4 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -47,6 +47,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
47 int rc, tmprc = 0; 47 int rc, tmprc = 0;
48 u64 persistent_fid, volatile_fid; 48 u64 persistent_fid, volatile_fid;
49 __le16 *utf16_path; 49 __le16 *utf16_path;
50 __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
50 51
51 utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); 52 utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
52 if (!utf16_path) 53 if (!utf16_path)
@@ -54,7 +55,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
54 55
55 rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid, 56 rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid,
56 desired_access, create_disposition, file_attributes, 57 desired_access, create_disposition, file_attributes,
57 create_options); 58 create_options, &oplock, NULL);
58 if (rc) { 59 if (rc) {
59 kfree(utf16_path); 60 kfree(utf16_path);
60 return rc; 61 return rc;
@@ -74,6 +75,22 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
74 * SMB2_open() call. 75 * SMB2_open() call.
75 */ 76 */
76 break; 77 break;
78 case SMB2_OP_RENAME:
79 tmprc = SMB2_rename(xid, tcon, persistent_fid, volatile_fid,
80 (__le16 *)data);
81 break;
82 case SMB2_OP_HARDLINK:
83 tmprc = SMB2_set_hardlink(xid, tcon, persistent_fid,
84 volatile_fid, (__le16 *)data);
85 break;
86 case SMB2_OP_SET_EOF:
87 tmprc = SMB2_set_eof(xid, tcon, persistent_fid, volatile_fid,
88 current->tgid, (__le64 *)data);
89 break;
90 case SMB2_OP_SET_INFO:
91 tmprc = SMB2_set_info(xid, tcon, persistent_fid, volatile_fid,
92 (FILE_BASIC_INFO *)data);
93 break;
77 default: 94 default:
78 cERROR(1, "Invalid command"); 95 cERROR(1, "Invalid command");
79 break; 96 break;
@@ -86,7 +103,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
86 return rc; 103 return rc;
87} 104}
88 105
89static void 106void
90move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src) 107move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src)
91{ 108{
92 memcpy(dst, src, (size_t)(&src->CurrentByteOffset) - (size_t)src); 109 memcpy(dst, src, (size_t)(&src->CurrentByteOffset) - (size_t)src);
@@ -161,3 +178,80 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
161 0, CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE, 178 0, CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE,
162 NULL, SMB2_OP_DELETE); 179 NULL, SMB2_OP_DELETE);
163} 180}
181
182int
183smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
184 struct cifs_sb_info *cifs_sb)
185{
186 return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
187 0, CREATE_DELETE_ON_CLOSE, NULL,
188 SMB2_OP_DELETE);
189}
190
191static int
192smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
193 const char *from_name, const char *to_name,
194 struct cifs_sb_info *cifs_sb, __u32 access, int command)
195{
196 __le16 *smb2_to_name = NULL;
197 int rc;
198
199 smb2_to_name = cifs_convert_path_to_utf16(to_name, cifs_sb);
200 if (smb2_to_name == NULL) {
201 rc = -ENOMEM;
202 goto smb2_rename_path;
203 }
204
205 rc = smb2_open_op_close(xid, tcon, cifs_sb, from_name, access,
206 FILE_OPEN, 0, 0, smb2_to_name, command);
207smb2_rename_path:
208 kfree(smb2_to_name);
209 return rc;
210}
211
212int
213smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon,
214 const char *from_name, const char *to_name,
215 struct cifs_sb_info *cifs_sb)
216{
217 return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb,
218 DELETE, SMB2_OP_RENAME);
219}
220
221int
222smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
223 const char *from_name, const char *to_name,
224 struct cifs_sb_info *cifs_sb)
225{
226 return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb,
227 FILE_READ_ATTRIBUTES, SMB2_OP_HARDLINK);
228}
229
230int
231smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
232 const char *full_path, __u64 size,
233 struct cifs_sb_info *cifs_sb, bool set_alloc)
234{
235 __le64 eof = cpu_to_le64(size);
236 return smb2_open_op_close(xid, tcon, cifs_sb, full_path,
237 FILE_WRITE_DATA, FILE_OPEN, 0, 0, &eof,
238 SMB2_OP_SET_EOF);
239}
240
241int
242smb2_set_file_info(struct inode *inode, const char *full_path,
243 FILE_BASIC_INFO *buf, const unsigned int xid)
244{
245 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
246 struct tcon_link *tlink;
247 int rc;
248
249 tlink = cifs_sb_tlink(cifs_sb);
250 if (IS_ERR(tlink))
251 return PTR_ERR(tlink);
252 rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path,
253 FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, 0, buf,
254 SMB2_OP_SET_INFO);
255 cifs_put_tlink(tlink);
256 return rc;
257}
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index be41478acc05..494c912c76fe 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -453,7 +453,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
453 {STATUS_FILE_INVALID, -EIO, "STATUS_FILE_INVALID"}, 453 {STATUS_FILE_INVALID, -EIO, "STATUS_FILE_INVALID"},
454 {STATUS_ALLOTTED_SPACE_EXCEEDED, -EIO, 454 {STATUS_ALLOTTED_SPACE_EXCEEDED, -EIO,
455 "STATUS_ALLOTTED_SPACE_EXCEEDED"}, 455 "STATUS_ALLOTTED_SPACE_EXCEEDED"},
456 {STATUS_INSUFFICIENT_RESOURCES, -EIO, "STATUS_INSUFFICIENT_RESOURCES"}, 456 {STATUS_INSUFFICIENT_RESOURCES, -EREMOTEIO,
457 "STATUS_INSUFFICIENT_RESOURCES"},
457 {STATUS_DFS_EXIT_PATH_FOUND, -EIO, "STATUS_DFS_EXIT_PATH_FOUND"}, 458 {STATUS_DFS_EXIT_PATH_FOUND, -EIO, "STATUS_DFS_EXIT_PATH_FOUND"},
458 {STATUS_DEVICE_DATA_ERROR, -EIO, "STATUS_DEVICE_DATA_ERROR"}, 459 {STATUS_DEVICE_DATA_ERROR, -EIO, "STATUS_DEVICE_DATA_ERROR"},
459 {STATUS_DEVICE_NOT_CONNECTED, -EIO, "STATUS_DEVICE_NOT_CONNECTED"}, 460 {STATUS_DEVICE_NOT_CONNECTED, -EIO, "STATUS_DEVICE_NOT_CONNECTED"},
@@ -2455,7 +2456,8 @@ map_smb2_to_linux_error(char *buf, bool log_err)
2455 return 0; 2456 return 0;
2456 2457
2457 /* mask facility */ 2458 /* mask facility */
2458 if (log_err && (smb2err != (STATUS_MORE_PROCESSING_REQUIRED))) 2459 if (log_err && (smb2err != STATUS_MORE_PROCESSING_REQUIRED) &&
2460 (smb2err != STATUS_END_OF_FILE))
2459 smb2_print_status(smb2err); 2461 smb2_print_status(smb2err);
2460 else if (cifsFYI & CIFS_RC) 2462 else if (cifsFYI & CIFS_RC)
2461 smb2_print_status(smb2err); 2463 smb2_print_status(smb2err);
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index e4d3b9964167..7b1c5e3287fb 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -142,12 +142,19 @@ smb2_check_message(char *buf, unsigned int length)
142 } 142 }
143 143
144 if (smb2_rsp_struct_sizes[command] != pdu->StructureSize2) { 144 if (smb2_rsp_struct_sizes[command] != pdu->StructureSize2) {
145 if (hdr->Status == 0 || 145 if (command != SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0 ||
146 pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2) { 146 pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2)) {
147 /* error packets have 9 byte structure size */ 147 /* error packets have 9 byte structure size */
148 cERROR(1, "Illegal response size %u for command %d", 148 cERROR(1, "Illegal response size %u for command %d",
149 le16_to_cpu(pdu->StructureSize2), command); 149 le16_to_cpu(pdu->StructureSize2), command);
150 return 1; 150 return 1;
151 } else if (command == SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0)
152 && (le16_to_cpu(pdu->StructureSize2) != 44)
153 && (le16_to_cpu(pdu->StructureSize2) != 36)) {
154 /* special case for SMB2.1 lease break message */
155 cERROR(1, "Illegal response size %d for oplock break",
156 le16_to_cpu(pdu->StructureSize2));
157 return 1;
151 } 158 }
152 } 159 }
153 160
@@ -162,6 +169,9 @@ smb2_check_message(char *buf, unsigned int length)
162 if (4 + len != clc_len) { 169 if (4 + len != clc_len) {
163 cFYI(1, "Calculated size %u length %u mismatch mid %llu", 170 cFYI(1, "Calculated size %u length %u mismatch mid %llu",
164 clc_len, 4 + len, mid); 171 clc_len, 4 + len, mid);
172 /* Windows 7 server returns 24 bytes more */
173 if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE)
174 return 0;
165 /* server can return one byte more */ 175 /* server can return one byte more */
166 if (clc_len == 4 + len + 1) 176 if (clc_len == 4 + len + 1)
167 return 0; 177 return 0;
@@ -244,7 +254,15 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
244 ((struct smb2_query_info_rsp *)hdr)->OutputBufferLength); 254 ((struct smb2_query_info_rsp *)hdr)->OutputBufferLength);
245 break; 255 break;
246 case SMB2_READ: 256 case SMB2_READ:
257 *off = ((struct smb2_read_rsp *)hdr)->DataOffset;
258 *len = le32_to_cpu(((struct smb2_read_rsp *)hdr)->DataLength);
259 break;
247 case SMB2_QUERY_DIRECTORY: 260 case SMB2_QUERY_DIRECTORY:
261 *off = le16_to_cpu(
262 ((struct smb2_query_directory_rsp *)hdr)->OutputBufferOffset);
263 *len = le32_to_cpu(
264 ((struct smb2_query_directory_rsp *)hdr)->OutputBufferLength);
265 break;
248 case SMB2_IOCTL: 266 case SMB2_IOCTL:
249 case SMB2_CHANGE_NOTIFY: 267 case SMB2_CHANGE_NOTIFY:
250 default: 268 default:
@@ -287,8 +305,9 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
287 * portion, the number of word parameters and the data portion of the message. 305 * portion, the number of word parameters and the data portion of the message.
288 */ 306 */
289unsigned int 307unsigned int
290smb2_calc_size(struct smb2_hdr *hdr) 308smb2_calc_size(void *buf)
291{ 309{
310 struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
292 struct smb2_pdu *pdu = (struct smb2_pdu *)hdr; 311 struct smb2_pdu *pdu = (struct smb2_pdu *)hdr;
293 int offset; /* the offset from the beginning of SMB to data area */ 312 int offset; /* the offset from the beginning of SMB to data area */
294 int data_length; /* the length of the variable length data area */ 313 int data_length; /* the length of the variable length data area */
@@ -347,3 +366,218 @@ cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb)
347 CIFS_MOUNT_MAP_SPECIAL_CHR); 366 CIFS_MOUNT_MAP_SPECIAL_CHR);
348 return to; 367 return to;
349} 368}
369
370__le32
371smb2_get_lease_state(struct cifsInodeInfo *cinode)
372{
373 if (cinode->clientCanCacheAll)
374 return SMB2_LEASE_WRITE_CACHING | SMB2_LEASE_READ_CACHING;
375 else if (cinode->clientCanCacheRead)
376 return SMB2_LEASE_READ_CACHING;
377 return 0;
378}
379
380__u8 smb2_map_lease_to_oplock(__le32 lease_state)
381{
382 if (lease_state & SMB2_LEASE_WRITE_CACHING) {
383 if (lease_state & SMB2_LEASE_HANDLE_CACHING)
384 return SMB2_OPLOCK_LEVEL_BATCH;
385 else
386 return SMB2_OPLOCK_LEVEL_EXCLUSIVE;
387 } else if (lease_state & SMB2_LEASE_READ_CACHING)
388 return SMB2_OPLOCK_LEVEL_II;
389 return 0;
390}
391
392struct smb2_lease_break_work {
393 struct work_struct lease_break;
394 struct tcon_link *tlink;
395 __u8 lease_key[16];
396 __le32 lease_state;
397};
398
399static void
400cifs_ses_oplock_break(struct work_struct *work)
401{
402 struct smb2_lease_break_work *lw = container_of(work,
403 struct smb2_lease_break_work, lease_break);
404 int rc;
405
406 rc = SMB2_lease_break(0, tlink_tcon(lw->tlink), lw->lease_key,
407 lw->lease_state);
408 cFYI(1, "Lease release rc %d", rc);
409 cifs_put_tlink(lw->tlink);
410 kfree(lw);
411}
412
413static bool
414smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
415{
416 struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer;
417 struct list_head *tmp, *tmp1, *tmp2;
418 struct cifs_ses *ses;
419 struct cifs_tcon *tcon;
420 struct cifsInodeInfo *cinode;
421 struct cifsFileInfo *cfile;
422 struct cifs_pending_open *open;
423 struct smb2_lease_break_work *lw;
424 bool found;
425 int ack_req = le32_to_cpu(rsp->Flags &
426 SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED);
427
428 lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL);
429 if (!lw) {
430 cERROR(1, "Memory allocation failed during lease break check");
431 return false;
432 }
433
434 INIT_WORK(&lw->lease_break, cifs_ses_oplock_break);
435 lw->lease_state = rsp->NewLeaseState;
436
437 cFYI(1, "Checking for lease break");
438
439 /* look up tcon based on tid & uid */
440 spin_lock(&cifs_tcp_ses_lock);
441 list_for_each(tmp, &server->smb_ses_list) {
442 ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
443
444 spin_lock(&cifs_file_list_lock);
445 list_for_each(tmp1, &ses->tcon_list) {
446 tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
447
448 cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks);
449 list_for_each(tmp2, &tcon->openFileList) {
450 cfile = list_entry(tmp2, struct cifsFileInfo,
451 tlist);
452 cinode = CIFS_I(cfile->dentry->d_inode);
453
454 if (memcmp(cinode->lease_key, rsp->LeaseKey,
455 SMB2_LEASE_KEY_SIZE))
456 continue;
457
458 cFYI(1, "found in the open list");
459 cFYI(1, "lease key match, lease break 0x%d",
460 le32_to_cpu(rsp->NewLeaseState));
461
462 smb2_set_oplock_level(cinode,
463 smb2_map_lease_to_oplock(rsp->NewLeaseState));
464
465 if (ack_req)
466 cfile->oplock_break_cancelled = false;
467 else
468 cfile->oplock_break_cancelled = true;
469
470 queue_work(cifsiod_wq, &cfile->oplock_break);
471
472 spin_unlock(&cifs_file_list_lock);
473 spin_unlock(&cifs_tcp_ses_lock);
474 return true;
475 }
476
477 found = false;
478 list_for_each_entry(open, &tcon->pending_opens, olist) {
479 if (memcmp(open->lease_key, rsp->LeaseKey,
480 SMB2_LEASE_KEY_SIZE))
481 continue;
482
483 if (!found && ack_req) {
484 found = true;
485 memcpy(lw->lease_key, open->lease_key,
486 SMB2_LEASE_KEY_SIZE);
487 lw->tlink = cifs_get_tlink(open->tlink);
488 queue_work(cifsiod_wq,
489 &lw->lease_break);
490 }
491
492 cFYI(1, "found in the pending open list");
493 cFYI(1, "lease key match, lease break 0x%d",
494 le32_to_cpu(rsp->NewLeaseState));
495
496 open->oplock =
497 smb2_map_lease_to_oplock(rsp->NewLeaseState);
498 }
499 if (found) {
500 spin_unlock(&cifs_file_list_lock);
501 spin_unlock(&cifs_tcp_ses_lock);
502 return true;
503 }
504 }
505 spin_unlock(&cifs_file_list_lock);
506 }
507 spin_unlock(&cifs_tcp_ses_lock);
508 kfree(lw);
509 cFYI(1, "Can not process lease break - no lease matched");
510 return false;
511}
512
513bool
514smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
515{
516 struct smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer;
517 struct list_head *tmp, *tmp1, *tmp2;
518 struct cifs_ses *ses;
519 struct cifs_tcon *tcon;
520 struct cifsInodeInfo *cinode;
521 struct cifsFileInfo *cfile;
522
523 cFYI(1, "Checking for oplock break");
524
525 if (rsp->hdr.Command != SMB2_OPLOCK_BREAK)
526 return false;
527
528 if (rsp->StructureSize !=
529 smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) {
530 if (le16_to_cpu(rsp->StructureSize) == 44)
531 return smb2_is_valid_lease_break(buffer, server);
532 else
533 return false;
534 }
535
536 cFYI(1, "oplock level 0x%d", rsp->OplockLevel);
537
538 /* look up tcon based on tid & uid */
539 spin_lock(&cifs_tcp_ses_lock);
540 list_for_each(tmp, &server->smb_ses_list) {
541 ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
542 list_for_each(tmp1, &ses->tcon_list) {
543 tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
544
545 cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks);
546 spin_lock(&cifs_file_list_lock);
547 list_for_each(tmp2, &tcon->openFileList) {
548 cfile = list_entry(tmp2, struct cifsFileInfo,
549 tlist);
550 if (rsp->PersistentFid !=
551 cfile->fid.persistent_fid ||
552 rsp->VolatileFid !=
553 cfile->fid.volatile_fid)
554 continue;
555
556 cFYI(1, "file id match, oplock break");
557 cinode = CIFS_I(cfile->dentry->d_inode);
558
559 if (!cinode->clientCanCacheAll &&
560 rsp->OplockLevel == SMB2_OPLOCK_LEVEL_NONE)
561 cfile->oplock_break_cancelled = true;
562 else
563 cfile->oplock_break_cancelled = false;
564
565 smb2_set_oplock_level(cinode,
566 rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0);
567
568 queue_work(cifsiod_wq, &cfile->oplock_break);
569
570 spin_unlock(&cifs_file_list_lock);
571 spin_unlock(&cifs_tcp_ses_lock);
572 return true;
573 }
574 spin_unlock(&cifs_file_list_lock);
575 spin_unlock(&cifs_tcp_ses_lock);
576 cFYI(1, "No matching file for oplock break");
577 return true;
578 }
579 }
580 spin_unlock(&cifs_tcp_ses_lock);
581 cFYI(1, "Can not process oplock break for non-existent connection");
582 return false;
583}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 826209bf3684..4d9dbe0b7385 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -17,11 +17,15 @@
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */ 18 */
19 19
20#include <linux/pagemap.h>
21#include <linux/vfs.h>
20#include "cifsglob.h" 22#include "cifsglob.h"
21#include "smb2pdu.h" 23#include "smb2pdu.h"
22#include "smb2proto.h" 24#include "smb2proto.h"
23#include "cifsproto.h" 25#include "cifsproto.h"
24#include "cifs_debug.h" 26#include "cifs_debug.h"
27#include "smb2status.h"
28#include "smb2glob.h"
25 29
26static int 30static int
27change_conf(struct TCP_Server_Info *server) 31change_conf(struct TCP_Server_Info *server)
@@ -63,6 +67,17 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
63 server->in_flight--; 67 server->in_flight--;
64 if (server->in_flight == 0 && (optype & CIFS_OP_MASK) != CIFS_NEG_OP) 68 if (server->in_flight == 0 && (optype & CIFS_OP_MASK) != CIFS_NEG_OP)
65 rc = change_conf(server); 69 rc = change_conf(server);
70 /*
71 * Sometimes server returns 0 credits on oplock break ack - we need to
72 * rebalance credits in this case.
73 */
74 else if (server->in_flight > 0 && server->oplock_credits == 0 &&
75 server->oplocks) {
76 if (server->credits > 1) {
77 server->credits--;
78 server->oplock_credits++;
79 }
80 }
66 spin_unlock(&server->req_lock); 81 spin_unlock(&server->req_lock);
67 wake_up(&server->request_q); 82 wake_up(&server->request_q);
68 if (rc) 83 if (rc)
@@ -157,6 +172,42 @@ smb2_negotiate(const unsigned int xid, struct cifs_ses *ses)
157 return rc; 172 return rc;
158} 173}
159 174
175static unsigned int
176smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
177{
178 struct TCP_Server_Info *server = tcon->ses->server;
179 unsigned int wsize;
180
181 /* start with specified wsize, or default */
182 wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE;
183 wsize = min_t(unsigned int, wsize, server->max_write);
184 /*
185 * limit write size to 2 ** 16, because we don't support multicredit
186 * requests now.
187 */
188 wsize = min_t(unsigned int, wsize, 2 << 15);
189
190 return wsize;
191}
192
193static unsigned int
194smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
195{
196 struct TCP_Server_Info *server = tcon->ses->server;
197 unsigned int rsize;
198
199 /* start with specified rsize, or default */
200 rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE;
201 rsize = min_t(unsigned int, rsize, server->max_read);
202 /*
203 * limit write size to 2 ** 16, because we don't support multicredit
204 * requests now.
205 */
206 rsize = min_t(unsigned int, rsize, 2 << 15);
207
208 return rsize;
209}
210
160static int 211static int
161smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, 212smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
162 struct cifs_sb_info *cifs_sb, const char *full_path) 213 struct cifs_sb_info *cifs_sb, const char *full_path)
@@ -164,13 +215,14 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
164 int rc; 215 int rc;
165 __u64 persistent_fid, volatile_fid; 216 __u64 persistent_fid, volatile_fid;
166 __le16 *utf16_path; 217 __le16 *utf16_path;
218 __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
167 219
168 utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); 220 utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
169 if (!utf16_path) 221 if (!utf16_path)
170 return -ENOMEM; 222 return -ENOMEM;
171 223
172 rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid, 224 rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid,
173 FILE_READ_ATTRIBUTES, FILE_OPEN, 0, 0); 225 FILE_READ_ATTRIBUTES, FILE_OPEN, 0, 0, &oplock, NULL);
174 if (rc) { 226 if (rc) {
175 kfree(utf16_path); 227 kfree(utf16_path);
176 return rc; 228 return rc;
@@ -190,6 +242,26 @@ smb2_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon,
190 return 0; 242 return 0;
191} 243}
192 244
245static int
246smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
247 struct cifs_fid *fid, FILE_ALL_INFO *data)
248{
249 int rc;
250 struct smb2_file_all_info *smb2_data;
251
252 smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2,
253 GFP_KERNEL);
254 if (smb2_data == NULL)
255 return -ENOMEM;
256
257 rc = SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid,
258 smb2_data);
259 if (!rc)
260 move_smb2_info_to_cifs(data, smb2_data);
261 kfree(smb2_data);
262 return rc;
263}
264
193static char * 265static char *
194smb2_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, 266smb2_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
195 struct cifs_tcon *tcon) 267 struct cifs_tcon *tcon)
@@ -292,7 +364,221 @@ smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
292#endif 364#endif
293} 365}
294 366
367static void
368smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock)
369{
370 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
371 cfile->fid.persistent_fid = fid->persistent_fid;
372 cfile->fid.volatile_fid = fid->volatile_fid;
373 smb2_set_oplock_level(cinode, oplock);
374 cinode->can_cache_brlcks = cinode->clientCanCacheAll;
375}
376
377static void
378smb2_close_file(const unsigned int xid, struct cifs_tcon *tcon,
379 struct cifs_fid *fid)
380{
381 SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
382}
383
384static int
385smb2_flush_file(const unsigned int xid, struct cifs_tcon *tcon,
386 struct cifs_fid *fid)
387{
388 return SMB2_flush(xid, tcon, fid->persistent_fid, fid->volatile_fid);
389}
390
391static unsigned int
392smb2_read_data_offset(char *buf)
393{
394 struct smb2_read_rsp *rsp = (struct smb2_read_rsp *)buf;
395 return rsp->DataOffset;
396}
397
398static unsigned int
399smb2_read_data_length(char *buf)
400{
401 struct smb2_read_rsp *rsp = (struct smb2_read_rsp *)buf;
402 return le32_to_cpu(rsp->DataLength);
403}
404
405
406static int
407smb2_sync_read(const unsigned int xid, struct cifsFileInfo *cfile,
408 struct cifs_io_parms *parms, unsigned int *bytes_read,
409 char **buf, int *buf_type)
410{
411 parms->persistent_fid = cfile->fid.persistent_fid;
412 parms->volatile_fid = cfile->fid.volatile_fid;
413 return SMB2_read(xid, parms, bytes_read, buf, buf_type);
414}
415
416static int
417smb2_sync_write(const unsigned int xid, struct cifsFileInfo *cfile,
418 struct cifs_io_parms *parms, unsigned int *written,
419 struct kvec *iov, unsigned long nr_segs)
420{
421
422 parms->persistent_fid = cfile->fid.persistent_fid;
423 parms->volatile_fid = cfile->fid.volatile_fid;
424 return SMB2_write(xid, parms, written, iov, nr_segs);
425}
426
427static int
428smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
429 struct cifsFileInfo *cfile, __u64 size, bool set_alloc)
430{
431 __le64 eof = cpu_to_le64(size);
432 return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
433 cfile->fid.volatile_fid, cfile->pid, &eof);
434}
435
436static int
437smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
438 const char *path, struct cifs_sb_info *cifs_sb,
439 struct cifs_fid *fid, __u16 search_flags,
440 struct cifs_search_info *srch_inf)
441{
442 __le16 *utf16_path;
443 int rc;
444 __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
445 __u64 persistent_fid, volatile_fid;
446
447 utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
448 if (!utf16_path)
449 return -ENOMEM;
450
451 rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid,
452 FILE_READ_ATTRIBUTES | FILE_READ_DATA, FILE_OPEN, 0, 0,
453 &oplock, NULL);
454 kfree(utf16_path);
455 if (rc) {
456 cERROR(1, "open dir failed");
457 return rc;
458 }
459
460 srch_inf->entries_in_buffer = 0;
461 srch_inf->index_of_last_entry = 0;
462 fid->persistent_fid = persistent_fid;
463 fid->volatile_fid = volatile_fid;
464
465 rc = SMB2_query_directory(xid, tcon, persistent_fid, volatile_fid, 0,
466 srch_inf);
467 if (rc) {
468 cERROR(1, "query directory failed");
469 SMB2_close(xid, tcon, persistent_fid, volatile_fid);
470 }
471 return rc;
472}
473
474static int
475smb2_query_dir_next(const unsigned int xid, struct cifs_tcon *tcon,
476 struct cifs_fid *fid, __u16 search_flags,
477 struct cifs_search_info *srch_inf)
478{
479 return SMB2_query_directory(xid, tcon, fid->persistent_fid,
480 fid->volatile_fid, 0, srch_inf);
481}
482
483static int
484smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon,
485 struct cifs_fid *fid)
486{
487 return SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
488}
489
490/*
491* If we negotiate SMB2 protocol and get STATUS_PENDING - update
492* the number of credits and return true. Otherwise - return false.
493*/
494static bool
495smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length)
496{
497 struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
498
499 if (hdr->Status != STATUS_PENDING)
500 return false;
501
502 if (!length) {
503 spin_lock(&server->req_lock);
504 server->credits += le16_to_cpu(hdr->CreditRequest);
505 spin_unlock(&server->req_lock);
506 wake_up(&server->request_q);
507 }
508
509 return true;
510}
511
512static int
513smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid,
514 struct cifsInodeInfo *cinode)
515{
516 if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING)
517 return SMB2_lease_break(0, tcon, cinode->lease_key,
518 smb2_get_lease_state(cinode));
519
520 return SMB2_oplock_break(0, tcon, fid->persistent_fid,
521 fid->volatile_fid,
522 cinode->clientCanCacheRead ? 1 : 0);
523}
524
525static int
526smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
527 struct kstatfs *buf)
528{
529 int rc;
530 u64 persistent_fid, volatile_fid;
531 __le16 srch_path = 0; /* Null - open root of share */
532 u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
533
534 rc = SMB2_open(xid, tcon, &srch_path, &persistent_fid, &volatile_fid,
535 FILE_READ_ATTRIBUTES, FILE_OPEN, 0, 0, &oplock, NULL);
536 if (rc)
537 return rc;
538 buf->f_type = SMB2_MAGIC_NUMBER;
539 rc = SMB2_QFS_info(xid, tcon, persistent_fid, volatile_fid, buf);
540 SMB2_close(xid, tcon, persistent_fid, volatile_fid);
541 return rc;
542}
543
544static bool
545smb2_compare_fids(struct cifsFileInfo *ob1, struct cifsFileInfo *ob2)
546{
547 return ob1->fid.persistent_fid == ob2->fid.persistent_fid &&
548 ob1->fid.volatile_fid == ob2->fid.volatile_fid;
549}
550
551static int
552smb2_mand_lock(const unsigned int xid, struct cifsFileInfo *cfile, __u64 offset,
553 __u64 length, __u32 type, int lock, int unlock, bool wait)
554{
555 if (unlock && !lock)
556 type = SMB2_LOCKFLAG_UNLOCK;
557 return SMB2_lock(xid, tlink_tcon(cfile->tlink),
558 cfile->fid.persistent_fid, cfile->fid.volatile_fid,
559 current->tgid, length, offset, type, wait);
560}
561
562static void
563smb2_get_lease_key(struct inode *inode, struct cifs_fid *fid)
564{
565 memcpy(fid->lease_key, CIFS_I(inode)->lease_key, SMB2_LEASE_KEY_SIZE);
566}
567
568static void
569smb2_set_lease_key(struct inode *inode, struct cifs_fid *fid)
570{
571 memcpy(CIFS_I(inode)->lease_key, fid->lease_key, SMB2_LEASE_KEY_SIZE);
572}
573
574static void
575smb2_new_lease_key(struct cifs_fid *fid)
576{
577 get_random_bytes(fid->lease_key, SMB2_LEASE_KEY_SIZE);
578}
579
295struct smb_version_operations smb21_operations = { 580struct smb_version_operations smb21_operations = {
581 .compare_fids = smb2_compare_fids,
296 .setup_request = smb2_setup_request, 582 .setup_request = smb2_setup_request,
297 .setup_async_request = smb2_setup_async_request, 583 .setup_async_request = smb2_setup_async_request,
298 .check_receive = smb2_check_receive, 584 .check_receive = smb2_check_receive,
@@ -301,13 +587,19 @@ struct smb_version_operations smb21_operations = {
301 .get_credits_field = smb2_get_credits_field, 587 .get_credits_field = smb2_get_credits_field,
302 .get_credits = smb2_get_credits, 588 .get_credits = smb2_get_credits,
303 .get_next_mid = smb2_get_next_mid, 589 .get_next_mid = smb2_get_next_mid,
590 .read_data_offset = smb2_read_data_offset,
591 .read_data_length = smb2_read_data_length,
592 .map_error = map_smb2_to_linux_error,
304 .find_mid = smb2_find_mid, 593 .find_mid = smb2_find_mid,
305 .check_message = smb2_check_message, 594 .check_message = smb2_check_message,
306 .dump_detail = smb2_dump_detail, 595 .dump_detail = smb2_dump_detail,
307 .clear_stats = smb2_clear_stats, 596 .clear_stats = smb2_clear_stats,
308 .print_stats = smb2_print_stats, 597 .print_stats = smb2_print_stats,
598 .is_oplock_break = smb2_is_valid_oplock_break,
309 .need_neg = smb2_need_neg, 599 .need_neg = smb2_need_neg,
310 .negotiate = smb2_negotiate, 600 .negotiate = smb2_negotiate,
601 .negotiate_wsize = smb2_negotiate_wsize,
602 .negotiate_rsize = smb2_negotiate_rsize,
311 .sess_setup = SMB2_sess_setup, 603 .sess_setup = SMB2_sess_setup,
312 .logoff = SMB2_logoff, 604 .logoff = SMB2_logoff,
313 .tree_connect = SMB2_tcon, 605 .tree_connect = SMB2_tcon,
@@ -317,16 +609,68 @@ struct smb_version_operations smb21_operations = {
317 .echo = SMB2_echo, 609 .echo = SMB2_echo,
318 .query_path_info = smb2_query_path_info, 610 .query_path_info = smb2_query_path_info,
319 .get_srv_inum = smb2_get_srv_inum, 611 .get_srv_inum = smb2_get_srv_inum,
612 .query_file_info = smb2_query_file_info,
613 .set_path_size = smb2_set_path_size,
614 .set_file_size = smb2_set_file_size,
615 .set_file_info = smb2_set_file_info,
320 .build_path_to_root = smb2_build_path_to_root, 616 .build_path_to_root = smb2_build_path_to_root,
321 .mkdir = smb2_mkdir, 617 .mkdir = smb2_mkdir,
322 .mkdir_setinfo = smb2_mkdir_setinfo, 618 .mkdir_setinfo = smb2_mkdir_setinfo,
323 .rmdir = smb2_rmdir, 619 .rmdir = smb2_rmdir,
620 .unlink = smb2_unlink,
621 .rename = smb2_rename_path,
622 .create_hardlink = smb2_create_hardlink,
623 .open = smb2_open_file,
624 .set_fid = smb2_set_fid,
625 .close = smb2_close_file,
626 .flush = smb2_flush_file,
627 .async_readv = smb2_async_readv,
628 .async_writev = smb2_async_writev,
629 .sync_read = smb2_sync_read,
630 .sync_write = smb2_sync_write,
631 .query_dir_first = smb2_query_dir_first,
632 .query_dir_next = smb2_query_dir_next,
633 .close_dir = smb2_close_dir,
634 .calc_smb_size = smb2_calc_size,
635 .is_status_pending = smb2_is_status_pending,
636 .oplock_response = smb2_oplock_response,
637 .queryfs = smb2_queryfs,
638 .mand_lock = smb2_mand_lock,
639 .mand_unlock_range = smb2_unlock_range,
640 .push_mand_locks = smb2_push_mandatory_locks,
641 .get_lease_key = smb2_get_lease_key,
642 .set_lease_key = smb2_set_lease_key,
643 .new_lease_key = smb2_new_lease_key,
324}; 644};
325 645
326struct smb_version_values smb21_values = { 646struct smb_version_values smb21_values = {
327 .version_string = SMB21_VERSION_STRING, 647 .version_string = SMB21_VERSION_STRING,
648 .protocol_id = SMB21_PROT_ID,
649 .req_capabilities = 0, /* MBZ on negotiate req until SMB3 dialect */
650 .large_lock_type = 0,
651 .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
652 .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
653 .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
654 .header_size = sizeof(struct smb2_hdr),
655 .max_header_size = MAX_SMB2_HDR_SIZE,
656 .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
657 .lock_cmd = SMB2_LOCK,
658 .cap_unix = 0,
659 .cap_nt_find = SMB2_NT_FIND,
660 .cap_large_files = SMB2_LARGE_FILES,
661};
662
663struct smb_version_values smb30_values = {
664 .version_string = SMB30_VERSION_STRING,
665 .protocol_id = SMB30_PROT_ID,
666 .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
667 .large_lock_type = 0,
668 .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
669 .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
670 .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
328 .header_size = sizeof(struct smb2_hdr), 671 .header_size = sizeof(struct smb2_hdr),
329 .max_header_size = MAX_SMB2_HDR_SIZE, 672 .max_header_size = MAX_SMB2_HDR_SIZE,
673 .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
330 .lock_cmd = SMB2_LOCK, 674 .lock_cmd = SMB2_LOCK,
331 .cap_unix = 0, 675 .cap_unix = 0,
332 .cap_nt_find = SMB2_NT_FIND, 676 .cap_nt_find = SMB2_NT_FIND,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 62b3f17d0613..cf33622cdac8 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/smb2pdu.c 2 * fs/cifs/smb2pdu.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2009, 2011 4 * Copyright (C) International Business Machines Corp., 2009, 2012
5 * Etersoft, 2012 5 * Etersoft, 2012
6 * Author(s): Steve French (sfrench@us.ibm.com) 6 * Author(s): Steve French (sfrench@us.ibm.com)
7 * Pavel Shilovsky (pshilovsky@samba.org) 2012 7 * Pavel Shilovsky (pshilovsky@samba.org) 2012
@@ -31,7 +31,9 @@
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/kernel.h> 32#include <linux/kernel.h>
33#include <linux/vfs.h> 33#include <linux/vfs.h>
34#include <linux/task_io_accounting_ops.h>
34#include <linux/uaccess.h> 35#include <linux/uaccess.h>
36#include <linux/pagemap.h>
35#include <linux/xattr.h> 37#include <linux/xattr.h>
36#include "smb2pdu.h" 38#include "smb2pdu.h"
37#include "cifsglob.h" 39#include "cifsglob.h"
@@ -42,6 +44,8 @@
42#include "cifs_debug.h" 44#include "cifs_debug.h"
43#include "ntlmssp.h" 45#include "ntlmssp.h"
44#include "smb2status.h" 46#include "smb2status.h"
47#include "smb2glob.h"
48#include "cifspdu.h"
45 49
46/* 50/*
47 * The following table defines the expected "StructureSize" of SMB2 requests 51 * The following table defines the expected "StructureSize" of SMB2 requests
@@ -115,9 +119,9 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ ,
115 /* BB how does SMB2 do case sensitive? */ 119 /* BB how does SMB2 do case sensitive? */
116 /* if (tcon->nocase) 120 /* if (tcon->nocase)
117 hdr->Flags |= SMBFLG_CASELESS; */ 121 hdr->Flags |= SMBFLG_CASELESS; */
118 /* if (tcon->ses && tcon->ses->server && 122 if (tcon->ses && tcon->ses->server &&
119 (tcon->ses->server->sec_mode & SECMODE_SIGN_REQUIRED)) 123 (tcon->ses->server->sec_mode & SECMODE_SIGN_REQUIRED))
120 hdr->Flags |= SMB2_FLAGS_SIGNED; */ 124 hdr->Flags |= SMB2_FLAGS_SIGNED;
121out: 125out:
122 pdu->StructureSize2 = cpu_to_le16(parmsize); 126 pdu->StructureSize2 = cpu_to_le16(parmsize);
123 return; 127 return;
@@ -300,24 +304,6 @@ free_rsp_buf(int resp_buftype, void *rsp)
300 cifs_buf_release(rsp); 304 cifs_buf_release(rsp);
301} 305}
302 306
303#define SMB2_NUM_PROT 1
304
305#define SMB2_PROT 0
306#define SMB21_PROT 1
307#define BAD_PROT 0xFFFF
308
309#define SMB2_PROT_ID 0x0202
310#define SMB21_PROT_ID 0x0210
311#define BAD_PROT_ID 0xFFFF
312
313static struct {
314 int index;
315 __le16 name;
316} smb2protocols[] = {
317 {SMB2_PROT, cpu_to_le16(SMB2_PROT_ID)},
318 {SMB21_PROT, cpu_to_le16(SMB21_PROT_ID)},
319 {BAD_PROT, cpu_to_le16(BAD_PROT_ID)}
320};
321 307
322/* 308/*
323 * 309 *
@@ -344,7 +330,6 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
344 int resp_buftype; 330 int resp_buftype;
345 struct TCP_Server_Info *server; 331 struct TCP_Server_Info *server;
346 unsigned int sec_flags; 332 unsigned int sec_flags;
347 u16 i;
348 u16 temp = 0; 333 u16 temp = 0;
349 int blob_offset, blob_length; 334 int blob_offset, blob_length;
350 char *security_blob; 335 char *security_blob;
@@ -373,11 +358,10 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
373 358
374 req->hdr.SessionId = 0; 359 req->hdr.SessionId = 0;
375 360
376 for (i = 0; i < SMB2_NUM_PROT; i++) 361 req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id);
377 req->Dialects[i] = smb2protocols[i].name;
378 362
379 req->DialectCount = cpu_to_le16(i); 363 req->DialectCount = cpu_to_le16(1); /* One vers= at a time for now */
380 inc_rfc1001_len(req, i * 2); 364 inc_rfc1001_len(req, 2);
381 365
382 /* only one of SMB2 signing flags may be set in SMB2 request */ 366 /* only one of SMB2 signing flags may be set in SMB2 request */
383 if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) 367 if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN)
@@ -387,7 +371,9 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
387 371
388 req->SecurityMode = cpu_to_le16(temp); 372 req->SecurityMode = cpu_to_le16(temp);
389 373
390 req->Capabilities = cpu_to_le32(SMB2_GLOBAL_CAP_DFS); 374 req->Capabilities = cpu_to_le32(ses->server->vals->req_capabilities);
375
376 memcpy(req->ClientGUID, cifs_client_guid, SMB2_CLIENT_GUID_SIZE);
391 377
392 iov[0].iov_base = (char *)req; 378 iov[0].iov_base = (char *)req;
393 /* 4 for rfc1002 length field */ 379 /* 4 for rfc1002 length field */
@@ -403,17 +389,16 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
403 if (rc != 0) 389 if (rc != 0)
404 goto neg_exit; 390 goto neg_exit;
405 391
406 if (rsp == NULL) {
407 rc = -EIO;
408 goto neg_exit;
409 }
410
411 cFYI(1, "mode 0x%x", rsp->SecurityMode); 392 cFYI(1, "mode 0x%x", rsp->SecurityMode);
412 393
413 if (rsp->DialectRevision == smb2protocols[SMB21_PROT].name) 394 /* BB we may eventually want to match the negotiated vs. requested
395 dialect, even though we are only requesting one at a time */
396 if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID))
397 cFYI(1, "negotiated smb2.0 dialect");
398 else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID))
414 cFYI(1, "negotiated smb2.1 dialect"); 399 cFYI(1, "negotiated smb2.1 dialect");
415 else if (rsp->DialectRevision == smb2protocols[SMB2_PROT].name) 400 else if (rsp->DialectRevision == cpu_to_le16(SMB30_PROT_ID))
416 cFYI(1, "negotiated smb2 dialect"); 401 cFYI(1, "negotiated smb3.0 dialect");
417 else { 402 else {
418 cERROR(1, "Illegal dialect returned by server %d", 403 cERROR(1, "Illegal dialect returned by server %d",
419 le16_to_cpu(rsp->DialectRevision)); 404 le16_to_cpu(rsp->DialectRevision));
@@ -438,6 +423,38 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
438 rc = -EIO; 423 rc = -EIO;
439 goto neg_exit; 424 goto neg_exit;
440 } 425 }
426
427 cFYI(1, "sec_flags 0x%x", sec_flags);
428 if (sec_flags & CIFSSEC_MUST_SIGN) {
429 cFYI(1, "Signing required");
430 if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED |
431 SMB2_NEGOTIATE_SIGNING_ENABLED))) {
432 cERROR(1, "signing required but server lacks support");
433 rc = -EOPNOTSUPP;
434 goto neg_exit;
435 }
436 server->sec_mode |= SECMODE_SIGN_REQUIRED;
437 } else if (sec_flags & CIFSSEC_MAY_SIGN) {
438 cFYI(1, "Signing optional");
439 if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) {
440 cFYI(1, "Server requires signing");
441 server->sec_mode |= SECMODE_SIGN_REQUIRED;
442 } else {
443 server->sec_mode &=
444 ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
445 }
446 } else {
447 cFYI(1, "Signing disabled");
448 if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) {
449 cERROR(1, "Server requires packet signing to be enabled"
450 " in /proc/fs/cifs/SecurityFlags.");
451 rc = -EOPNOTSUPP;
452 goto neg_exit;
453 }
454 server->sec_mode &=
455 ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
456 }
457
441#ifdef CONFIG_SMB2_ASN1 /* BB REMOVEME when updated asn1.c ready */ 458#ifdef CONFIG_SMB2_ASN1 /* BB REMOVEME when updated asn1.c ready */
442 rc = decode_neg_token_init(security_blob, blob_length, 459 rc = decode_neg_token_init(security_blob, blob_length,
443 &server->sec_type); 460 &server->sec_type);
@@ -599,13 +616,14 @@ ssetup_ntlmssp_authenticate:
599 616
600 kfree(security_blob); 617 kfree(security_blob);
601 rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base; 618 rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base;
602 if (rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) { 619 if (resp_buftype != CIFS_NO_BUFFER &&
620 rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) {
603 if (phase != NtLmNegotiate) { 621 if (phase != NtLmNegotiate) {
604 cERROR(1, "Unexpected more processing error"); 622 cERROR(1, "Unexpected more processing error");
605 goto ssetup_exit; 623 goto ssetup_exit;
606 } 624 }
607 if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 != 625 if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 !=
608 le16_to_cpu(rsp->SecurityBufferOffset)) { 626 le16_to_cpu(rsp->SecurityBufferOffset)) {
609 cERROR(1, "Invalid security buffer offset %d", 627 cERROR(1, "Invalid security buffer offset %d",
610 le16_to_cpu(rsp->SecurityBufferOffset)); 628 le16_to_cpu(rsp->SecurityBufferOffset));
611 rc = -EIO; 629 rc = -EIO;
@@ -631,11 +649,6 @@ ssetup_ntlmssp_authenticate:
631 if (rc != 0) 649 if (rc != 0)
632 goto ssetup_exit; 650 goto ssetup_exit;
633 651
634 if (rsp == NULL) {
635 rc = -EIO;
636 goto ssetup_exit;
637 }
638
639 ses->session_flags = le16_to_cpu(rsp->SessionFlags); 652 ses->session_flags = le16_to_cpu(rsp->SessionFlags);
640ssetup_exit: 653ssetup_exit:
641 free_rsp_buf(resp_buftype, rsp); 654 free_rsp_buf(resp_buftype, rsp);
@@ -666,6 +679,8 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
666 679
667 /* since no tcon, smb2_init can not do this, so do here */ 680 /* since no tcon, smb2_init can not do this, so do here */
668 req->hdr.SessionId = ses->Suid; 681 req->hdr.SessionId = ses->Suid;
682 if (server->sec_mode & SECMODE_SIGN_REQUIRED)
683 req->hdr.Flags |= SMB2_FLAGS_SIGNED;
669 684
670 rc = SendReceiveNoRsp(xid, ses, (char *) &req->hdr, 0); 685 rc = SendReceiveNoRsp(xid, ses, (char *) &req->hdr, 0);
671 /* 686 /*
@@ -753,11 +768,6 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
753 goto tcon_error_exit; 768 goto tcon_error_exit;
754 } 769 }
755 770
756 if (rsp == NULL) {
757 rc = -EIO;
758 goto tcon_exit;
759 }
760
761 if (tcon == NULL) { 771 if (tcon == NULL) {
762 ses->ipc_tid = rsp->hdr.TreeId; 772 ses->ipc_tid = rsp->hdr.TreeId;
763 goto tcon_exit; 773 goto tcon_exit;
@@ -830,18 +840,87 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
830 return rc; 840 return rc;
831} 841}
832 842
843static struct create_lease *
844create_lease_buf(u8 *lease_key, u8 oplock)
845{
846 struct create_lease *buf;
847
848 buf = kmalloc(sizeof(struct create_lease), GFP_KERNEL);
849 if (!buf)
850 return NULL;
851
852 memset(buf, 0, sizeof(struct create_lease));
853
854 buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key));
855 buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8)));
856 if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE)
857 buf->lcontext.LeaseState = SMB2_LEASE_WRITE_CACHING |
858 SMB2_LEASE_READ_CACHING;
859 else if (oplock == SMB2_OPLOCK_LEVEL_II)
860 buf->lcontext.LeaseState = SMB2_LEASE_READ_CACHING;
861 else if (oplock == SMB2_OPLOCK_LEVEL_BATCH)
862 buf->lcontext.LeaseState = SMB2_LEASE_HANDLE_CACHING |
863 SMB2_LEASE_READ_CACHING |
864 SMB2_LEASE_WRITE_CACHING;
865
866 buf->ccontext.DataOffset = cpu_to_le16(offsetof
867 (struct create_lease, lcontext));
868 buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context));
869 buf->ccontext.NameOffset = cpu_to_le16(offsetof
870 (struct create_lease, Name));
871 buf->ccontext.NameLength = cpu_to_le16(4);
872 buf->Name[0] = 'R';
873 buf->Name[1] = 'q';
874 buf->Name[2] = 'L';
875 buf->Name[3] = 's';
876 return buf;
877}
878
879static __u8
880parse_lease_state(struct smb2_create_rsp *rsp)
881{
882 char *data_offset;
883 struct create_lease *lc;
884 bool found = false;
885
886 data_offset = (char *)rsp;
887 data_offset += 4 + le32_to_cpu(rsp->CreateContextsOffset);
888 lc = (struct create_lease *)data_offset;
889 do {
890 char *name = le16_to_cpu(lc->ccontext.NameOffset) + (char *)lc;
891 if (le16_to_cpu(lc->ccontext.NameLength) != 4 ||
892 strncmp(name, "RqLs", 4)) {
893 lc = (struct create_lease *)((char *)lc
894 + le32_to_cpu(lc->ccontext.Next));
895 continue;
896 }
897 if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
898 return SMB2_OPLOCK_LEVEL_NOCHANGE;
899 found = true;
900 break;
901 } while (le32_to_cpu(lc->ccontext.Next) != 0);
902
903 if (!found)
904 return 0;
905
906 return smb2_map_lease_to_oplock(lc->lcontext.LeaseState);
907}
908
833int 909int
834SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path, 910SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
835 u64 *persistent_fid, u64 *volatile_fid, __u32 desired_access, 911 u64 *persistent_fid, u64 *volatile_fid, __u32 desired_access,
836 __u32 create_disposition, __u32 file_attributes, __u32 create_options) 912 __u32 create_disposition, __u32 file_attributes, __u32 create_options,
913 __u8 *oplock, struct smb2_file_all_info *buf)
837{ 914{
838 struct smb2_create_req *req; 915 struct smb2_create_req *req;
839 struct smb2_create_rsp *rsp; 916 struct smb2_create_rsp *rsp;
840 struct TCP_Server_Info *server; 917 struct TCP_Server_Info *server;
841 struct cifs_ses *ses = tcon->ses; 918 struct cifs_ses *ses = tcon->ses;
842 struct kvec iov[2]; 919 struct kvec iov[3];
843 int resp_buftype; 920 int resp_buftype;
844 int uni_path_len; 921 int uni_path_len;
922 __le16 *copy_path = NULL;
923 int copy_size;
845 int rc = 0; 924 int rc = 0;
846 int num_iovecs = 2; 925 int num_iovecs = 2;
847 926
@@ -856,10 +935,6 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
856 if (rc) 935 if (rc)
857 return rc; 936 return rc;
858 937
859 if (enable_oplocks)
860 req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_BATCH;
861 else
862 req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_NONE;
863 req->ImpersonationLevel = IL_IMPERSONATION; 938 req->ImpersonationLevel = IL_IMPERSONATION;
864 req->DesiredAccess = cpu_to_le32(desired_access); 939 req->DesiredAccess = cpu_to_le32(desired_access);
865 /* File attributes ignored on open (used in create though) */ 940 /* File attributes ignored on open (used in create though) */
@@ -869,7 +944,7 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
869 req->CreateOptions = cpu_to_le32(create_options); 944 req->CreateOptions = cpu_to_le32(create_options);
870 uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2; 945 uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2;
871 req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req) 946 req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req)
872 - 1 /* pad */ - 4 /* do not count rfc1001 len field */); 947 - 8 /* pad */ - 4 /* do not count rfc1001 len field */);
873 948
874 iov[0].iov_base = (char *)req; 949 iov[0].iov_base = (char *)req;
875 /* 4 for rfc1002 length field */ 950 /* 4 for rfc1002 length field */
@@ -880,6 +955,20 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
880 req->NameLength = cpu_to_le16(uni_path_len - 2); 955 req->NameLength = cpu_to_le16(uni_path_len - 2);
881 /* -1 since last byte is buf[0] which is sent below (path) */ 956 /* -1 since last byte is buf[0] which is sent below (path) */
882 iov[0].iov_len--; 957 iov[0].iov_len--;
958 if (uni_path_len % 8 != 0) {
959 copy_size = uni_path_len / 8 * 8;
960 if (copy_size < uni_path_len)
961 copy_size += 8;
962
963 copy_path = kzalloc(copy_size, GFP_KERNEL);
964 if (!copy_path)
965 return -ENOMEM;
966 memcpy((char *)copy_path, (const char *)path,
967 uni_path_len);
968 uni_path_len = copy_size;
969 path = copy_path;
970 }
971
883 iov[1].iov_len = uni_path_len; 972 iov[1].iov_len = uni_path_len;
884 iov[1].iov_base = path; 973 iov[1].iov_base = path;
885 /* 974 /*
@@ -888,10 +977,37 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
888 */ 977 */
889 inc_rfc1001_len(req, uni_path_len - 1); 978 inc_rfc1001_len(req, uni_path_len - 1);
890 } else { 979 } else {
980 iov[0].iov_len += 7;
981 req->hdr.smb2_buf_length = cpu_to_be32(be32_to_cpu(
982 req->hdr.smb2_buf_length) + 8 - 1);
891 num_iovecs = 1; 983 num_iovecs = 1;
892 req->NameLength = 0; 984 req->NameLength = 0;
893 } 985 }
894 986
987 if (!server->oplocks)
988 *oplock = SMB2_OPLOCK_LEVEL_NONE;
989
990 if (!(tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) ||
991 *oplock == SMB2_OPLOCK_LEVEL_NONE)
992 req->RequestedOplockLevel = *oplock;
993 else {
994 iov[num_iovecs].iov_base = create_lease_buf(oplock+1, *oplock);
995 if (iov[num_iovecs].iov_base == NULL) {
996 cifs_small_buf_release(req);
997 kfree(copy_path);
998 return -ENOMEM;
999 }
1000 iov[num_iovecs].iov_len = sizeof(struct create_lease);
1001 req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE;
1002 req->CreateContextsOffset = cpu_to_le32(
1003 sizeof(struct smb2_create_req) - 4 - 8 +
1004 iov[num_iovecs-1].iov_len);
1005 req->CreateContextsLength = cpu_to_le32(
1006 sizeof(struct create_lease));
1007 inc_rfc1001_len(&req->hdr, sizeof(struct create_lease));
1008 num_iovecs++;
1009 }
1010
895 rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0); 1011 rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
896 rsp = (struct smb2_create_rsp *)iov[0].iov_base; 1012 rsp = (struct smb2_create_rsp *)iov[0].iov_base;
897 1013
@@ -900,13 +1016,24 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
900 goto creat_exit; 1016 goto creat_exit;
901 } 1017 }
902 1018
903 if (rsp == NULL) {
904 rc = -EIO;
905 goto creat_exit;
906 }
907 *persistent_fid = rsp->PersistentFileId; 1019 *persistent_fid = rsp->PersistentFileId;
908 *volatile_fid = rsp->VolatileFileId; 1020 *volatile_fid = rsp->VolatileFileId;
1021
1022 if (buf) {
1023 memcpy(buf, &rsp->CreationTime, 32);
1024 buf->AllocationSize = rsp->AllocationSize;
1025 buf->EndOfFile = rsp->EndofFile;
1026 buf->Attributes = rsp->FileAttributes;
1027 buf->NumberOfLinks = cpu_to_le32(1);
1028 buf->DeletePending = 0;
1029 }
1030
1031 if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE)
1032 *oplock = parse_lease_state(rsp);
1033 else
1034 *oplock = rsp->OplockLevel;
909creat_exit: 1035creat_exit:
1036 kfree(copy_path);
910 free_rsp_buf(resp_buftype, rsp); 1037 free_rsp_buf(resp_buftype, rsp);
911 return rc; 1038 return rc;
912} 1039}
@@ -950,11 +1077,6 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
950 goto close_exit; 1077 goto close_exit;
951 } 1078 }
952 1079
953 if (rsp == NULL) {
954 rc = -EIO;
955 goto close_exit;
956 }
957
958 /* BB FIXME - decode close response, update inode for caching */ 1080 /* BB FIXME - decode close response, update inode for caching */
959 1081
960close_exit: 1082close_exit:
@@ -1019,10 +1141,10 @@ validate_and_copy_buf(unsigned int offset, unsigned int buffer_length,
1019 return 0; 1141 return 0;
1020} 1142}
1021 1143
1022int 1144static int
1023SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, 1145query_info(const unsigned int xid, struct cifs_tcon *tcon,
1024 u64 persistent_fid, u64 volatile_fid, 1146 u64 persistent_fid, u64 volatile_fid, u8 info_class,
1025 struct smb2_file_all_info *data) 1147 size_t output_len, size_t min_len, void *data)
1026{ 1148{
1027 struct smb2_query_info_req *req; 1149 struct smb2_query_info_req *req;
1028 struct smb2_query_info_rsp *rsp = NULL; 1150 struct smb2_query_info_rsp *rsp = NULL;
@@ -1044,37 +1166,56 @@ SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
1044 return rc; 1166 return rc;
1045 1167
1046 req->InfoType = SMB2_O_INFO_FILE; 1168 req->InfoType = SMB2_O_INFO_FILE;
1047 req->FileInfoClass = FILE_ALL_INFORMATION; 1169 req->FileInfoClass = info_class;
1048 req->PersistentFileId = persistent_fid; 1170 req->PersistentFileId = persistent_fid;
1049 req->VolatileFileId = volatile_fid; 1171 req->VolatileFileId = volatile_fid;
1050 /* 4 for rfc1002 length field and 1 for Buffer */ 1172 /* 4 for rfc1002 length field and 1 for Buffer */
1051 req->InputBufferOffset = 1173 req->InputBufferOffset =
1052 cpu_to_le16(sizeof(struct smb2_query_info_req) - 1 - 4); 1174 cpu_to_le16(sizeof(struct smb2_query_info_req) - 1 - 4);
1053 req->OutputBufferLength = 1175 req->OutputBufferLength = cpu_to_le32(output_len);
1054 cpu_to_le32(sizeof(struct smb2_file_all_info) + MAX_NAME * 2);
1055 1176
1056 iov[0].iov_base = (char *)req; 1177 iov[0].iov_base = (char *)req;
1057 /* 4 for rfc1002 length field */ 1178 /* 4 for rfc1002 length field */
1058 iov[0].iov_len = get_rfc1002_length(req) + 4; 1179 iov[0].iov_len = get_rfc1002_length(req) + 4;
1059 1180
1060 rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0); 1181 rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0);
1182 rsp = (struct smb2_query_info_rsp *)iov[0].iov_base;
1183
1061 if (rc) { 1184 if (rc) {
1062 cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); 1185 cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
1063 goto qinf_exit; 1186 goto qinf_exit;
1064 } 1187 }
1065 1188
1066 rsp = (struct smb2_query_info_rsp *)iov[0].iov_base;
1067
1068 rc = validate_and_copy_buf(le16_to_cpu(rsp->OutputBufferOffset), 1189 rc = validate_and_copy_buf(le16_to_cpu(rsp->OutputBufferOffset),
1069 le32_to_cpu(rsp->OutputBufferLength), 1190 le32_to_cpu(rsp->OutputBufferLength),
1070 &rsp->hdr, sizeof(struct smb2_file_all_info), 1191 &rsp->hdr, min_len, data);
1071 (char *)data);
1072 1192
1073qinf_exit: 1193qinf_exit:
1074 free_rsp_buf(resp_buftype, rsp); 1194 free_rsp_buf(resp_buftype, rsp);
1075 return rc; 1195 return rc;
1076} 1196}
1077 1197
1198int
1199SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
1200 u64 persistent_fid, u64 volatile_fid,
1201 struct smb2_file_all_info *data)
1202{
1203 return query_info(xid, tcon, persistent_fid, volatile_fid,
1204 FILE_ALL_INFORMATION,
1205 sizeof(struct smb2_file_all_info) + MAX_NAME * 2,
1206 sizeof(struct smb2_file_all_info), data);
1207}
1208
1209int
1210SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon,
1211 u64 persistent_fid, u64 volatile_fid, __le64 *uniqueid)
1212{
1213 return query_info(xid, tcon, persistent_fid, volatile_fid,
1214 FILE_INTERNAL_INFORMATION,
1215 sizeof(struct smb2_file_internal_info),
1216 sizeof(struct smb2_file_internal_info), uniqueid);
1217}
1218
1078/* 1219/*
1079 * This is a no-op for now. We're not really interested in the reply, but 1220 * This is a no-op for now. We're not really interested in the reply, but
1080 * rather in the fact that the server sent one and that server->lstrp 1221 * rather in the fact that the server sent one and that server->lstrp
@@ -1102,6 +1243,8 @@ SMB2_echo(struct TCP_Server_Info *server)
1102 struct smb2_echo_req *req; 1243 struct smb2_echo_req *req;
1103 int rc = 0; 1244 int rc = 0;
1104 struct kvec iov; 1245 struct kvec iov;
1246 struct smb_rqst rqst = { .rq_iov = &iov,
1247 .rq_nvec = 1 };
1105 1248
1106 cFYI(1, "In echo request"); 1249 cFYI(1, "In echo request");
1107 1250
@@ -1115,7 +1258,7 @@ SMB2_echo(struct TCP_Server_Info *server)
1115 /* 4 for rfc1002 length field */ 1258 /* 4 for rfc1002 length field */
1116 iov.iov_len = get_rfc1002_length(req) + 4; 1259 iov.iov_len = get_rfc1002_length(req) + 4;
1117 1260
1118 rc = cifs_call_async(server, &iov, 1, NULL, smb2_echo_callback, server, 1261 rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, server,
1119 CIFS_ECHO_OP); 1262 CIFS_ECHO_OP);
1120 if (rc) 1263 if (rc)
1121 cFYI(1, "Echo request failed: %d", rc); 1264 cFYI(1, "Echo request failed: %d", rc);
@@ -1123,3 +1266,945 @@ SMB2_echo(struct TCP_Server_Info *server)
1123 cifs_small_buf_release(req); 1266 cifs_small_buf_release(req);
1124 return rc; 1267 return rc;
1125} 1268}
1269
1270int
1271SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
1272 u64 volatile_fid)
1273{
1274 struct smb2_flush_req *req;
1275 struct TCP_Server_Info *server;
1276 struct cifs_ses *ses = tcon->ses;
1277 struct kvec iov[1];
1278 int resp_buftype;
1279 int rc = 0;
1280
1281 cFYI(1, "Flush");
1282
1283 if (ses && (ses->server))
1284 server = ses->server;
1285 else
1286 return -EIO;
1287
1288 rc = small_smb2_init(SMB2_FLUSH, tcon, (void **) &req);
1289 if (rc)
1290 return rc;
1291
1292 req->PersistentFileId = persistent_fid;
1293 req->VolatileFileId = volatile_fid;
1294
1295 iov[0].iov_base = (char *)req;
1296 /* 4 for rfc1002 length field */
1297 iov[0].iov_len = get_rfc1002_length(req) + 4;
1298
1299 rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0);
1300
1301 if ((rc != 0) && tcon)
1302 cifs_stats_fail_inc(tcon, SMB2_FLUSH_HE);
1303
1304 free_rsp_buf(resp_buftype, iov[0].iov_base);
1305 return rc;
1306}
1307
1308/*
1309 * To form a chain of read requests, any read requests after the first should
1310 * have the end_of_chain boolean set to true.
1311 */
1312static int
1313smb2_new_read_req(struct kvec *iov, struct cifs_io_parms *io_parms,
1314 unsigned int remaining_bytes, int request_type)
1315{
1316 int rc = -EACCES;
1317 struct smb2_read_req *req = NULL;
1318
1319 rc = small_smb2_init(SMB2_READ, io_parms->tcon, (void **) &req);
1320 if (rc)
1321 return rc;
1322 if (io_parms->tcon->ses->server == NULL)
1323 return -ECONNABORTED;
1324
1325 req->hdr.ProcessId = cpu_to_le32(io_parms->pid);
1326
1327 req->PersistentFileId = io_parms->persistent_fid;
1328 req->VolatileFileId = io_parms->volatile_fid;
1329 req->ReadChannelInfoOffset = 0; /* reserved */
1330 req->ReadChannelInfoLength = 0; /* reserved */
1331 req->Channel = 0; /* reserved */
1332 req->MinimumCount = 0;
1333 req->Length = cpu_to_le32(io_parms->length);
1334 req->Offset = cpu_to_le64(io_parms->offset);
1335
1336 if (request_type & CHAINED_REQUEST) {
1337 if (!(request_type & END_OF_CHAIN)) {
1338 /* 4 for rfc1002 length field */
1339 req->hdr.NextCommand =
1340 cpu_to_le32(get_rfc1002_length(req) + 4);
1341 } else /* END_OF_CHAIN */
1342 req->hdr.NextCommand = 0;
1343 if (request_type & RELATED_REQUEST) {
1344 req->hdr.Flags |= SMB2_FLAGS_RELATED_OPERATIONS;
1345 /*
1346 * Related requests use info from previous read request
1347 * in chain.
1348 */
1349 req->hdr.SessionId = 0xFFFFFFFF;
1350 req->hdr.TreeId = 0xFFFFFFFF;
1351 req->PersistentFileId = 0xFFFFFFFF;
1352 req->VolatileFileId = 0xFFFFFFFF;
1353 }
1354 }
1355 if (remaining_bytes > io_parms->length)
1356 req->RemainingBytes = cpu_to_le32(remaining_bytes);
1357 else
1358 req->RemainingBytes = 0;
1359
1360 iov[0].iov_base = (char *)req;
1361 /* 4 for rfc1002 length field */
1362 iov[0].iov_len = get_rfc1002_length(req) + 4;
1363 return rc;
1364}
1365
1366static void
1367smb2_readv_callback(struct mid_q_entry *mid)
1368{
1369 struct cifs_readdata *rdata = mid->callback_data;
1370 struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
1371 struct TCP_Server_Info *server = tcon->ses->server;
1372 struct smb2_hdr *buf = (struct smb2_hdr *)rdata->iov.iov_base;
1373 unsigned int credits_received = 1;
1374 struct smb_rqst rqst = { .rq_iov = &rdata->iov,
1375 .rq_nvec = 1,
1376 .rq_pages = rdata->pages,
1377 .rq_npages = rdata->nr_pages,
1378 .rq_pagesz = rdata->pagesz,
1379 .rq_tailsz = rdata->tailsz };
1380
1381 cFYI(1, "%s: mid=%llu state=%d result=%d bytes=%u", __func__,
1382 mid->mid, mid->mid_state, rdata->result, rdata->bytes);
1383
1384 switch (mid->mid_state) {
1385 case MID_RESPONSE_RECEIVED:
1386 credits_received = le16_to_cpu(buf->CreditRequest);
1387 /* result already set, check signature */
1388 if (server->sec_mode &
1389 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
1390 int rc;
1391
1392 rc = smb2_verify_signature(&rqst, server);
1393 if (rc)
1394 cERROR(1, "SMB signature verification returned "
1395 "error = %d", rc);
1396 }
1397 /* FIXME: should this be counted toward the initiating task? */
1398 task_io_account_read(rdata->bytes);
1399 cifs_stats_bytes_read(tcon, rdata->bytes);
1400 break;
1401 case MID_REQUEST_SUBMITTED:
1402 case MID_RETRY_NEEDED:
1403 rdata->result = -EAGAIN;
1404 break;
1405 default:
1406 if (rdata->result != -ENODATA)
1407 rdata->result = -EIO;
1408 }
1409
1410 if (rdata->result)
1411 cifs_stats_fail_inc(tcon, SMB2_READ_HE);
1412
1413 queue_work(cifsiod_wq, &rdata->work);
1414 DeleteMidQEntry(mid);
1415 add_credits(server, credits_received, 0);
1416}
1417
1418/* smb2_async_readv - send an async write, and set up mid to handle result */
1419int
1420smb2_async_readv(struct cifs_readdata *rdata)
1421{
1422 int rc;
1423 struct smb2_hdr *buf;
1424 struct cifs_io_parms io_parms;
1425 struct smb_rqst rqst = { .rq_iov = &rdata->iov,
1426 .rq_nvec = 1 };
1427
1428 cFYI(1, "%s: offset=%llu bytes=%u", __func__,
1429 rdata->offset, rdata->bytes);
1430
1431 io_parms.tcon = tlink_tcon(rdata->cfile->tlink);
1432 io_parms.offset = rdata->offset;
1433 io_parms.length = rdata->bytes;
1434 io_parms.persistent_fid = rdata->cfile->fid.persistent_fid;
1435 io_parms.volatile_fid = rdata->cfile->fid.volatile_fid;
1436 io_parms.pid = rdata->pid;
1437 rc = smb2_new_read_req(&rdata->iov, &io_parms, 0, 0);
1438 if (rc)
1439 return rc;
1440
1441 buf = (struct smb2_hdr *)rdata->iov.iov_base;
1442 /* 4 for rfc1002 length field */
1443 rdata->iov.iov_len = get_rfc1002_length(rdata->iov.iov_base) + 4;
1444
1445 kref_get(&rdata->refcount);
1446 rc = cifs_call_async(io_parms.tcon->ses->server, &rqst,
1447 cifs_readv_receive, smb2_readv_callback,
1448 rdata, 0);
1449 if (rc) {
1450 kref_put(&rdata->refcount, cifs_readdata_release);
1451 cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
1452 }
1453
1454 cifs_small_buf_release(buf);
1455 return rc;
1456}
1457
/*
 * SMB2_read - synchronous SMB2 read.
 *
 * Sends one SMB2 READ described by @io_parms and waits for the response.
 * On success *nbytes is the byte count reported by the server.  If *buf is
 * non-NULL the data is copied into it and the response buffer is freed;
 * otherwise the raw response buffer is handed back through *buf/*buf_type
 * and the caller must free it.
 */
int
SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
	  unsigned int *nbytes, char **buf, int *buf_type)
{
	int resp_buftype, rc = -EACCES;
	struct smb2_read_rsp *rsp = NULL;
	struct kvec iov[1];

	*nbytes = 0;
	rc = smb2_new_read_req(iov, io_parms, 0, 0);
	if (rc)
		return rc;

	rc = SendReceive2(xid, io_parms->tcon->ses, iov, 1,
			  &resp_buftype, CIFS_LOG_ERROR);

	rsp = (struct smb2_read_rsp *)iov[0].iov_base;

	/* a read past end of file is reported as success with zero bytes */
	if (rsp->hdr.Status == STATUS_END_OF_FILE) {
		free_rsp_buf(resp_buftype, iov[0].iov_base);
		return 0;
	}

	if (rc) {
		cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE);
		cERROR(1, "Send error in read = %d", rc);
	} else {
		*nbytes = le32_to_cpu(rsp->DataLength);
		/* never trust a server length beyond what we asked for */
		if ((*nbytes > CIFS_MAX_MSGSIZE) ||
		    (*nbytes > io_parms->length)) {
			cFYI(1, "bad length %d for count %d", *nbytes,
			     io_parms->length);
			rc = -EIO;
			*nbytes = 0;
		}
	}

	if (*buf) {
		/* on every failure path above *nbytes is 0 — copies nothing */
		memcpy(*buf, (char *)rsp->hdr.ProtocolId + rsp->DataOffset,
		       *nbytes);
		free_rsp_buf(resp_buftype, iov[0].iov_base);
	} else if (resp_buftype != CIFS_NO_BUFFER) {
		/* hand ownership of the response buffer to the caller */
		*buf = iov[0].iov_base;
		if (resp_buftype == CIFS_SMALL_BUFFER)
			*buf_type = CIFS_SMALL_BUFFER;
		else if (resp_buftype == CIFS_LARGE_BUFFER)
			*buf_type = CIFS_LARGE_BUFFER;
	}
	return rc;
}
1508
/*
 * Check the mid_state and signature on received buffer (if any), and queue the
 * workqueue completion task.
 */
static void
smb2_writev_callback(struct mid_q_entry *mid)
{
	struct cifs_writedata *wdata = mid->callback_data;
	struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
	unsigned int written;
	struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf;
	/* if no response arrived, return the single credit we consumed */
	unsigned int credits_received = 1;

	switch (mid->mid_state) {
	case MID_RESPONSE_RECEIVED:
		credits_received = le16_to_cpu(rsp->hdr.CreditRequest);
		wdata->result = smb2_check_receive(mid, tcon->ses->server, 0);
		if (wdata->result != 0)
			break;

		written = le32_to_cpu(rsp->DataLength);
		/*
		 * Mask off high 16 bits when bytes written as returned
		 * by the server is greater than bytes requested by the
		 * client. OS/2 servers are known to set incorrect
		 * CountHigh values.
		 */
		if (written > wdata->bytes)
			written &= 0xFFFF;

		/* a short write is treated as out-of-space */
		if (written < wdata->bytes)
			wdata->result = -ENOSPC;
		else
			wdata->bytes = written;
		break;
	case MID_REQUEST_SUBMITTED:
	case MID_RETRY_NEEDED:
		wdata->result = -EAGAIN;
		break;
	default:
		wdata->result = -EIO;
		break;
	}

	if (wdata->result)
		cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);

	/* completion (and the wdata reference drop) happens on cifsiod_wq */
	queue_work(cifsiod_wq, &wdata->work);
	DeleteMidQEntry(mid);
	add_credits(tcon->ses->server, credits_received, 0);
}
1560
/* smb2_async_writev - send an async write, and set up mid to handle result */
int
smb2_async_writev(struct cifs_writedata *wdata)
{
	int rc = -EACCES;
	struct smb2_write_req *req = NULL;
	struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
	struct kvec iov;
	struct smb_rqst rqst;

	rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req);
	if (rc)
		goto async_writev_out;

	req->hdr.ProcessId = cpu_to_le32(wdata->cfile->pid);

	req->PersistentFileId = wdata->cfile->fid.persistent_fid;
	req->VolatileFileId = wdata->cfile->fid.volatile_fid;
	req->WriteChannelInfoOffset = 0;
	req->WriteChannelInfoLength = 0;
	req->Channel = 0;
	req->Offset = cpu_to_le64(wdata->offset);
	/* 4 for rfc1002 length field */
	req->DataOffset = cpu_to_le16(
				offsetof(struct smb2_write_req, Buffer) - 4);
	req->RemainingBytes = 0;

	/* 4 for rfc1002 length field and 1 for Buffer */
	iov.iov_len = get_rfc1002_length(req) + 4 - 1;
	iov.iov_base = req;

	/* payload is not copied: it is sent straight from wdata's pages */
	rqst.rq_iov = &iov;
	rqst.rq_nvec = 1;
	rqst.rq_pages = wdata->pages;
	rqst.rq_npages = wdata->nr_pages;
	rqst.rq_pagesz = wdata->pagesz;
	rqst.rq_tailsz = wdata->tailsz;

	cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes);

	req->Length = cpu_to_le32(wdata->bytes);

	/* grow frame length by the payload, minus the 1-byte Buffer stub */
	inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */);

	/* extra reference for smb2_writev_callback; dropped via its work item */
	kref_get(&wdata->refcount);
	rc = cifs_call_async(tcon->ses->server, &rqst, NULL,
			     smb2_writev_callback, wdata, 0);

	if (rc) {
		/* dispatch failed: drop the callback's reference here */
		kref_put(&wdata->refcount, cifs_writedata_release);
		cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
	}

async_writev_out:
	cifs_small_buf_release(req);
	return rc;
}
1618
1619/*
1620 * SMB2_write function gets iov pointer to kvec array with n_vec as a length.
1621 * The length field from io_parms must be at least 1 and indicates a number of
1622 * elements with data to write that begins with position 1 in iov array. All
1623 * data length is specified by count.
1624 */
1625int
1626SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
1627 unsigned int *nbytes, struct kvec *iov, int n_vec)
1628{
1629 int rc = 0;
1630 struct smb2_write_req *req = NULL;
1631 struct smb2_write_rsp *rsp = NULL;
1632 int resp_buftype;
1633 *nbytes = 0;
1634
1635 if (n_vec < 1)
1636 return rc;
1637
1638 rc = small_smb2_init(SMB2_WRITE, io_parms->tcon, (void **) &req);
1639 if (rc)
1640 return rc;
1641
1642 if (io_parms->tcon->ses->server == NULL)
1643 return -ECONNABORTED;
1644
1645 req->hdr.ProcessId = cpu_to_le32(io_parms->pid);
1646
1647 req->PersistentFileId = io_parms->persistent_fid;
1648 req->VolatileFileId = io_parms->volatile_fid;
1649 req->WriteChannelInfoOffset = 0;
1650 req->WriteChannelInfoLength = 0;
1651 req->Channel = 0;
1652 req->Length = cpu_to_le32(io_parms->length);
1653 req->Offset = cpu_to_le64(io_parms->offset);
1654 /* 4 for rfc1002 length field */
1655 req->DataOffset = cpu_to_le16(
1656 offsetof(struct smb2_write_req, Buffer) - 4);
1657 req->RemainingBytes = 0;
1658
1659 iov[0].iov_base = (char *)req;
1660 /* 4 for rfc1002 length field and 1 for Buffer */
1661 iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
1662
1663 /* length of entire message including data to be written */
1664 inc_rfc1001_len(req, io_parms->length - 1 /* Buffer */);
1665
1666 rc = SendReceive2(xid, io_parms->tcon->ses, iov, n_vec + 1,
1667 &resp_buftype, 0);
1668 rsp = (struct smb2_write_rsp *)iov[0].iov_base;
1669
1670 if (rc) {
1671 cifs_stats_fail_inc(io_parms->tcon, SMB2_WRITE_HE);
1672 cERROR(1, "Send error in write = %d", rc);
1673 } else
1674 *nbytes = le32_to_cpu(rsp->DataLength);
1675
1676 free_rsp_buf(resp_buftype, rsp);
1677 return rc;
1678}
1679
1680static unsigned int
1681num_entries(char *bufstart, char *end_of_buf, char **lastentry, size_t size)
1682{
1683 int len;
1684 unsigned int entrycount = 0;
1685 unsigned int next_offset = 0;
1686 FILE_DIRECTORY_INFO *entryptr;
1687
1688 if (bufstart == NULL)
1689 return 0;
1690
1691 entryptr = (FILE_DIRECTORY_INFO *)bufstart;
1692
1693 while (1) {
1694 entryptr = (FILE_DIRECTORY_INFO *)
1695 ((char *)entryptr + next_offset);
1696
1697 if ((char *)entryptr + size > end_of_buf) {
1698 cERROR(1, "malformed search entry would overflow");
1699 break;
1700 }
1701
1702 len = le32_to_cpu(entryptr->FileNameLength);
1703 if ((char *)entryptr + len + size > end_of_buf) {
1704 cERROR(1, "directory entry name would overflow frame "
1705 "end of buf %p", end_of_buf);
1706 break;
1707 }
1708
1709 *lastentry = (char *)entryptr;
1710 entrycount++;
1711
1712 next_offset = le32_to_cpu(entryptr->NextEntryOffset);
1713 if (!next_offset)
1714 break;
1715 }
1716
1717 return entrycount;
1718}
1719
/*
 * Readdir/FindFirst
 */
int
SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
		     u64 persistent_fid, u64 volatile_fid, int index,
		     struct cifs_search_info *srch_inf)
{
	struct smb2_query_directory_req *req;
	struct smb2_query_directory_rsp *rsp = NULL;
	struct kvec iov[2];
	int rc = 0;
	int len;
	int resp_buftype;
	unsigned char *bufptr;
	struct TCP_Server_Info *server;
	struct cifs_ses *ses = tcon->ses;
	__le16 asteriks = cpu_to_le16('*');
	char *end_of_smb;
	unsigned int output_size = CIFSMaxBufSize;
	size_t info_buf_size;

	if (ses && (ses->server))
		server = ses->server;
	else
		return -EIO;

	rc = small_smb2_init(SMB2_QUERY_DIRECTORY, tcon, (void **) &req);
	if (rc)
		return rc;

	/* map the cifs info level onto the SMB2 information class */
	switch (srch_inf->info_level) {
	case SMB_FIND_FILE_DIRECTORY_INFO:
		req->FileInformationClass = FILE_DIRECTORY_INFORMATION;
		info_buf_size = sizeof(FILE_DIRECTORY_INFO) - 1;
		break;
	case SMB_FIND_FILE_ID_FULL_DIR_INFO:
		req->FileInformationClass = FILEID_FULL_DIRECTORY_INFORMATION;
		info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1;
		break;
	default:
		cERROR(1, "info level %u isn't supported",
		       srch_inf->info_level);
		rc = -EINVAL;
		/*
		 * NOTE(review): resp_buftype is still uninitialized on this
		 * path (rsp is NULL) — confirm free_rsp_buf() tolerates that.
		 */
		goto qdir_exit;
	}

	req->FileIndex = cpu_to_le32(index);
	req->PersistentFileId = persistent_fid;
	req->VolatileFileId = volatile_fid;

	/* search pattern is the single UTF-16 character '*' (2 bytes) */
	len = 0x2;
	bufptr = req->Buffer;
	memcpy(bufptr, &asteriks, len);

	req->FileNameOffset =
		cpu_to_le16(sizeof(struct smb2_query_directory_req) - 1 - 4);
	req->FileNameLength = cpu_to_le16(len);
	/*
	 * BB could be 30 bytes or so longer if we used SMB2 specific
	 * buffer lengths, but this is safe and close enough.
	 */
	output_size = min_t(unsigned int, output_size, server->maxBuf);
	output_size = min_t(unsigned int, output_size, 2 << 15);
	req->OutputBufferLength = cpu_to_le32(output_size);

	iov[0].iov_base = (char *)req;
	/* 4 for RFC1001 length and 1 for Buffer */
	iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;

	/* the pattern is sent as a second iovec, replacing the Buffer stub */
	iov[1].iov_base = (char *)(req->Buffer);
	iov[1].iov_len = len;

	inc_rfc1001_len(req, len - 1 /* Buffer */);

	rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, 0);
	rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base;

	if (rc) {
		cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
		goto qdir_exit;
	}

	/* reject responses whose offset/length point outside the frame */
	rc = validate_buf(le16_to_cpu(rsp->OutputBufferOffset),
			  le32_to_cpu(rsp->OutputBufferLength), &rsp->hdr,
			  info_buf_size);
	if (rc)
		goto qdir_exit;

	srch_inf->unicode = true;

	/* release any response buffer left over from a previous call */
	if (srch_inf->ntwrk_buf_start) {
		if (srch_inf->smallBuf)
			cifs_small_buf_release(srch_inf->ntwrk_buf_start);
		else
			cifs_buf_release(srch_inf->ntwrk_buf_start);
	}
	/* ownership of rsp transfers to srch_inf — not freed on this path */
	srch_inf->ntwrk_buf_start = (char *)rsp;
	srch_inf->srch_entries_start = srch_inf->last_entry = 4 /* rfclen */ +
		(char *)&rsp->hdr + le16_to_cpu(rsp->OutputBufferOffset);
	/* 4 for rfc1002 length field */
	end_of_smb = get_rfc1002_length(rsp) + 4 + (char *)&rsp->hdr;
	srch_inf->entries_in_buffer =
			num_entries(srch_inf->srch_entries_start, end_of_smb,
				    &srch_inf->last_entry, info_buf_size);
	srch_inf->index_of_last_entry += srch_inf->entries_in_buffer;
	cFYI(1, "num entries %d last_index %lld srch start %p srch end %p",
	     srch_inf->entries_in_buffer, srch_inf->index_of_last_entry,
	     srch_inf->srch_entries_start, srch_inf->last_entry);
	/* remember which pool the kept buffer came from, for later release */
	if (resp_buftype == CIFS_LARGE_BUFFER)
		srch_inf->smallBuf = false;
	else if (resp_buftype == CIFS_SMALL_BUFFER)
		srch_inf->smallBuf = true;
	else
		cERROR(1, "illegal search buffer type");

	if (rsp->hdr.Status == STATUS_NO_MORE_FILES)
		srch_inf->endOfSearch = 1;
	else
		srch_inf->endOfSearch = 0;

	return rc;

qdir_exit:
	free_rsp_buf(resp_buftype, rsp);
	return rc;
}
1847
1848static int
1849send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
1850 u64 persistent_fid, u64 volatile_fid, u32 pid, int info_class,
1851 unsigned int num, void **data, unsigned int *size)
1852{
1853 struct smb2_set_info_req *req;
1854 struct smb2_set_info_rsp *rsp = NULL;
1855 struct kvec *iov;
1856 int rc = 0;
1857 int resp_buftype;
1858 unsigned int i;
1859 struct TCP_Server_Info *server;
1860 struct cifs_ses *ses = tcon->ses;
1861
1862 if (ses && (ses->server))
1863 server = ses->server;
1864 else
1865 return -EIO;
1866
1867 if (!num)
1868 return -EINVAL;
1869
1870 iov = kmalloc(sizeof(struct kvec) * num, GFP_KERNEL);
1871 if (!iov)
1872 return -ENOMEM;
1873
1874 rc = small_smb2_init(SMB2_SET_INFO, tcon, (void **) &req);
1875 if (rc) {
1876 kfree(iov);
1877 return rc;
1878 }
1879
1880 req->hdr.ProcessId = cpu_to_le32(pid);
1881
1882 req->InfoType = SMB2_O_INFO_FILE;
1883 req->FileInfoClass = info_class;
1884 req->PersistentFileId = persistent_fid;
1885 req->VolatileFileId = volatile_fid;
1886
1887 /* 4 for RFC1001 length and 1 for Buffer */
1888 req->BufferOffset =
1889 cpu_to_le16(sizeof(struct smb2_set_info_req) - 1 - 4);
1890 req->BufferLength = cpu_to_le32(*size);
1891
1892 inc_rfc1001_len(req, *size - 1 /* Buffer */);
1893
1894 memcpy(req->Buffer, *data, *size);
1895
1896 iov[0].iov_base = (char *)req;
1897 /* 4 for RFC1001 length */
1898 iov[0].iov_len = get_rfc1002_length(req) + 4;
1899
1900 for (i = 1; i < num; i++) {
1901 inc_rfc1001_len(req, size[i]);
1902 le32_add_cpu(&req->BufferLength, size[i]);
1903 iov[i].iov_base = (char *)data[i];
1904 iov[i].iov_len = size[i];
1905 }
1906
1907 rc = SendReceive2(xid, ses, iov, num, &resp_buftype, 0);
1908 rsp = (struct smb2_set_info_rsp *)iov[0].iov_base;
1909
1910 if (rc != 0) {
1911 cifs_stats_fail_inc(tcon, SMB2_SET_INFO_HE);
1912 goto out;
1913 }
1914out:
1915 free_rsp_buf(resp_buftype, rsp);
1916 kfree(iov);
1917 return rc;
1918}
1919
1920int
1921SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon,
1922 u64 persistent_fid, u64 volatile_fid, __le16 *target_file)
1923{
1924 struct smb2_file_rename_info info;
1925 void **data;
1926 unsigned int size[2];
1927 int rc;
1928 int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX));
1929
1930 data = kmalloc(sizeof(void *) * 2, GFP_KERNEL);
1931 if (!data)
1932 return -ENOMEM;
1933
1934 info.ReplaceIfExists = 1; /* 1 = replace existing target with new */
1935 /* 0 = fail if target already exists */
1936 info.RootDirectory = 0; /* MBZ for network ops (why does spec say?) */
1937 info.FileNameLength = cpu_to_le32(len);
1938
1939 data[0] = &info;
1940 size[0] = sizeof(struct smb2_file_rename_info);
1941
1942 data[1] = target_file;
1943 size[1] = len + 2 /* null */;
1944
1945 rc = send_set_info(xid, tcon, persistent_fid, volatile_fid,
1946 current->tgid, FILE_RENAME_INFORMATION, 2, data,
1947 size);
1948 kfree(data);
1949 return rc;
1950}
1951
1952int
1953SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
1954 u64 persistent_fid, u64 volatile_fid, __le16 *target_file)
1955{
1956 struct smb2_file_link_info info;
1957 void **data;
1958 unsigned int size[2];
1959 int rc;
1960 int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX));
1961
1962 data = kmalloc(sizeof(void *) * 2, GFP_KERNEL);
1963 if (!data)
1964 return -ENOMEM;
1965
1966 info.ReplaceIfExists = 0; /* 1 = replace existing link with new */
1967 /* 0 = fail if link already exists */
1968 info.RootDirectory = 0; /* MBZ for network ops (why does spec say?) */
1969 info.FileNameLength = cpu_to_le32(len);
1970
1971 data[0] = &info;
1972 size[0] = sizeof(struct smb2_file_link_info);
1973
1974 data[1] = target_file;
1975 size[1] = len + 2 /* null */;
1976
1977 rc = send_set_info(xid, tcon, persistent_fid, volatile_fid,
1978 current->tgid, FILE_LINK_INFORMATION, 2, data, size);
1979 kfree(data);
1980 return rc;
1981}
1982
1983int
1984SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
1985 u64 volatile_fid, u32 pid, __le64 *eof)
1986{
1987 struct smb2_file_eof_info info;
1988 void *data;
1989 unsigned int size;
1990
1991 info.EndOfFile = *eof;
1992
1993 data = &info;
1994 size = sizeof(struct smb2_file_eof_info);
1995
1996 return send_set_info(xid, tcon, persistent_fid, volatile_fid, pid,
1997 FILE_END_OF_FILE_INFORMATION, 1, &data, &size);
1998}
1999
2000int
2001SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon,
2002 u64 persistent_fid, u64 volatile_fid, FILE_BASIC_INFO *buf)
2003{
2004 unsigned int size;
2005 size = sizeof(FILE_BASIC_INFO);
2006 return send_set_info(xid, tcon, persistent_fid, volatile_fid,
2007 current->tgid, FILE_BASIC_INFORMATION, 1,
2008 (void **)&buf, &size);
2009}
2010
2011int
2012SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
2013 const u64 persistent_fid, const u64 volatile_fid,
2014 __u8 oplock_level)
2015{
2016 int rc;
2017 struct smb2_oplock_break *req = NULL;
2018
2019 cFYI(1, "SMB2_oplock_break");
2020 rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req);
2021
2022 if (rc)
2023 return rc;
2024
2025 req->VolatileFid = volatile_fid;
2026 req->PersistentFid = persistent_fid;
2027 req->OplockLevel = oplock_level;
2028 req->hdr.CreditRequest = cpu_to_le16(1);
2029
2030 rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, CIFS_OBREAK_OP);
2031 /* SMB2 buffer freed by function above */
2032
2033 if (rc) {
2034 cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
2035 cFYI(1, "Send error in Oplock Break = %d", rc);
2036 }
2037
2038 return rc;
2039}
2040
2041static void
2042copy_fs_info_to_kstatfs(struct smb2_fs_full_size_info *pfs_inf,
2043 struct kstatfs *kst)
2044{
2045 kst->f_bsize = le32_to_cpu(pfs_inf->BytesPerSector) *
2046 le32_to_cpu(pfs_inf->SectorsPerAllocationUnit);
2047 kst->f_blocks = le64_to_cpu(pfs_inf->TotalAllocationUnits);
2048 kst->f_bfree = le64_to_cpu(pfs_inf->ActualAvailableAllocationUnits);
2049 kst->f_bavail = le64_to_cpu(pfs_inf->CallerAvailableAllocationUnits);
2050 return;
2051}
2052
2053static int
2054build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level,
2055 int outbuf_len, u64 persistent_fid, u64 volatile_fid)
2056{
2057 int rc;
2058 struct smb2_query_info_req *req;
2059
2060 cFYI(1, "Query FSInfo level %d", level);
2061
2062 if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
2063 return -EIO;
2064
2065 rc = small_smb2_init(SMB2_QUERY_INFO, tcon, (void **) &req);
2066 if (rc)
2067 return rc;
2068
2069 req->InfoType = SMB2_O_INFO_FILESYSTEM;
2070 req->FileInfoClass = level;
2071 req->PersistentFileId = persistent_fid;
2072 req->VolatileFileId = volatile_fid;
2073 /* 4 for rfc1002 length field and 1 for pad */
2074 req->InputBufferOffset =
2075 cpu_to_le16(sizeof(struct smb2_query_info_req) - 1 - 4);
2076 req->OutputBufferLength = cpu_to_le32(
2077 outbuf_len + sizeof(struct smb2_query_info_rsp) - 1 - 4);
2078
2079 iov->iov_base = (char *)req;
2080 /* 4 for rfc1002 length field */
2081 iov->iov_len = get_rfc1002_length(req) + 4;
2082 return 0;
2083}
2084
/*
 * SMB2_QFS_info - query filesystem full-size information for the share and
 * fill @fsdata (block size, totals, free/available counts).
 */
int
SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
	      u64 persistent_fid, u64 volatile_fid, struct kstatfs *fsdata)
{
	struct smb2_query_info_rsp *rsp = NULL;
	struct kvec iov;
	int rc = 0;
	int resp_buftype;
	struct cifs_ses *ses = tcon->ses;
	struct smb2_fs_full_size_info *info = NULL;

	rc = build_qfs_info_req(&iov, tcon, FS_FULL_SIZE_INFORMATION,
				sizeof(struct smb2_fs_full_size_info),
				persistent_fid, volatile_fid);
	if (rc)
		return rc;

	rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0);
	if (rc) {
		cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
		goto qinf_exit;
	}
	rsp = (struct smb2_query_info_rsp *)iov.iov_base;

	/* locate the info block: 4-byte RFC1001 header + server's offset */
	info = (struct smb2_fs_full_size_info *)(4 /* RFC1001 len */ +
		le16_to_cpu(rsp->OutputBufferOffset) + (char *)&rsp->hdr);
	/* only trust the offset/length if they stay inside the response */
	rc = validate_buf(le16_to_cpu(rsp->OutputBufferOffset),
			  le32_to_cpu(rsp->OutputBufferLength), &rsp->hdr,
			  sizeof(struct smb2_fs_full_size_info));
	if (!rc)
		copy_fs_info_to_kstatfs(info, fsdata);

qinf_exit:
	free_rsp_buf(resp_buftype, iov.iov_base);
	return rc;
}
2121
/*
 * smb2_lockv - send one SMB2 LOCK request carrying @num_lock lock elements
 * from @buf for the given open file.  Assumes num_lock >= 1.
 */
int
smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
	   const __u64 persist_fid, const __u64 volatile_fid, const __u32 pid,
	   const __u32 num_lock, struct smb2_lock_element *buf)
{
	int rc = 0;
	struct smb2_lock_req *req = NULL;
	struct kvec iov[2];
	int resp_buf_type;
	unsigned int count;

	cFYI(1, "smb2_lockv num lock %d", num_lock);

	rc = small_smb2_init(SMB2_LOCK, tcon, (void **) &req);
	if (rc)
		return rc;

	req->hdr.ProcessId = cpu_to_le32(pid);
	req->LockCount = cpu_to_le16(num_lock);

	req->PersistentFileId = persist_fid;
	req->VolatileFileId = volatile_fid;

	/*
	 * The request struct already contains space for one lock element, so
	 * the frame length grows by the lock array minus that one element.
	 */
	count = num_lock * sizeof(struct smb2_lock_element);
	inc_rfc1001_len(req, count - sizeof(struct smb2_lock_element));

	/* iov[0] carries the header only; the lock array goes in iov[1] */
	iov[0].iov_base = (char *)req;
	/* 4 for rfc1002 length field and count for all locks */
	iov[0].iov_len = get_rfc1002_length(req) + 4 - count;
	iov[1].iov_base = (char *)buf;
	iov[1].iov_len = count;

	cifs_stats_inc(&tcon->stats.cifs_stats.num_locks);
	rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP);
	if (rc) {
		cFYI(1, "Send error in smb2_lockv = %d", rc);
		cifs_stats_fail_inc(tcon, SMB2_LOCK_HE);
	}

	return rc;
}
2163
2164int
2165SMB2_lock(const unsigned int xid, struct cifs_tcon *tcon,
2166 const __u64 persist_fid, const __u64 volatile_fid, const __u32 pid,
2167 const __u64 length, const __u64 offset, const __u32 lock_flags,
2168 const bool wait)
2169{
2170 struct smb2_lock_element lock;
2171
2172 lock.Offset = cpu_to_le64(offset);
2173 lock.Length = cpu_to_le64(length);
2174 lock.Flags = cpu_to_le32(lock_flags);
2175 if (!wait && lock_flags != SMB2_LOCKFLAG_UNLOCK)
2176 lock.Flags |= cpu_to_le32(SMB2_LOCKFLAG_FAIL_IMMEDIATELY);
2177
2178 return smb2_lockv(xid, tcon, persist_fid, volatile_fid, pid, 1, &lock);
2179}
2180
2181int
2182SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
2183 __u8 *lease_key, const __le32 lease_state)
2184{
2185 int rc;
2186 struct smb2_lease_ack *req = NULL;
2187
2188 cFYI(1, "SMB2_lease_break");
2189 rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req);
2190
2191 if (rc)
2192 return rc;
2193
2194 req->hdr.CreditRequest = cpu_to_le16(1);
2195 req->StructureSize = cpu_to_le16(36);
2196 inc_rfc1001_len(req, 12);
2197
2198 memcpy(req->LeaseKey, lease_key, 16);
2199 req->LeaseState = lease_state;
2200
2201 rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, CIFS_OBREAK_OP);
2202 /* SMB2 buffer freed by function above */
2203
2204 if (rc) {
2205 cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
2206 cFYI(1, "Send error in Lease Break = %d", rc);
2207 }
2208
2209 return rc;
2210}
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index c5fbfac5d576..4cb4ced258cb 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -96,7 +96,7 @@
96 * 96 *
97 */ 97 */
98 98
99#define SMB2_HEADER_STRUCTURE_SIZE __constant_le16_to_cpu(64) 99#define SMB2_HEADER_STRUCTURE_SIZE __constant_cpu_to_le16(64)
100 100
101struct smb2_hdr { 101struct smb2_hdr {
102 __be32 smb2_buf_length; /* big endian on wire */ 102 __be32 smb2_buf_length; /* big endian on wire */
@@ -140,7 +140,7 @@ struct smb2_pdu {
140 * 140 *
141 */ 141 */
142 142
143#define SMB2_ERROR_STRUCTURE_SIZE2 __constant_le16_to_cpu(9) 143#define SMB2_ERROR_STRUCTURE_SIZE2 __constant_cpu_to_le16(9)
144 144
145struct smb2_err_rsp { 145struct smb2_err_rsp {
146 struct smb2_hdr hdr; 146 struct smb2_hdr hdr;
@@ -150,6 +150,10 @@ struct smb2_err_rsp {
150 __u8 ErrorData[1]; /* variable length */ 150 __u8 ErrorData[1]; /* variable length */
151} __packed; 151} __packed;
152 152
153#define SMB2_CLIENT_GUID_SIZE 16
154
155extern __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE];
156
153struct smb2_negotiate_req { 157struct smb2_negotiate_req {
154 struct smb2_hdr hdr; 158 struct smb2_hdr hdr;
155 __le16 StructureSize; /* Must be 36 */ 159 __le16 StructureSize; /* Must be 36 */
@@ -157,11 +161,17 @@ struct smb2_negotiate_req {
157 __le16 SecurityMode; 161 __le16 SecurityMode;
158 __le16 Reserved; /* MBZ */ 162 __le16 Reserved; /* MBZ */
159 __le32 Capabilities; 163 __le32 Capabilities;
160 __u8 ClientGUID[16]; /* MBZ */ 164 __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE];
161 __le64 ClientStartTime; /* MBZ */ 165 __le64 ClientStartTime; /* MBZ */
162 __le16 Dialects[2]; /* variable length */ 166 __le16 Dialects[1]; /* One dialect (vers=) at a time for now */
163} __packed; 167} __packed;
164 168
169/* Dialects */
170#define SMB20_PROT_ID 0x0202
171#define SMB21_PROT_ID 0x0210
172#define SMB30_PROT_ID 0x0300
173#define BAD_PROT_ID 0xFFFF
174
165/* SecurityMode flags */ 175/* SecurityMode flags */
166#define SMB2_NEGOTIATE_SIGNING_ENABLED 0x0001 176#define SMB2_NEGOTIATE_SIGNING_ENABLED 0x0001
167#define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002 177#define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002
@@ -169,6 +179,10 @@ struct smb2_negotiate_req {
169#define SMB2_GLOBAL_CAP_DFS 0x00000001 179#define SMB2_GLOBAL_CAP_DFS 0x00000001
170#define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */ 180#define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */
171#define SMB2_GLOBAL_CAP_LARGE_MTU 0X00000004 /* Resp only New to SMB2.1 */ 181#define SMB2_GLOBAL_CAP_LARGE_MTU 0X00000004 /* Resp only New to SMB2.1 */
182#define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */
183#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */
184#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */
185#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */
172/* Internal types */ 186/* Internal types */
173#define SMB2_NT_FIND 0x00100000 187#define SMB2_NT_FIND 0x00100000
174#define SMB2_LARGE_FILES 0x00200000 188#define SMB2_LARGE_FILES 0x00200000
@@ -307,6 +321,8 @@ struct smb2_tree_disconnect_rsp {
307#define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08 321#define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08
308#define SMB2_OPLOCK_LEVEL_BATCH 0x09 322#define SMB2_OPLOCK_LEVEL_BATCH 0x09
309#define SMB2_OPLOCK_LEVEL_LEASE 0xFF 323#define SMB2_OPLOCK_LEVEL_LEASE 0xFF
324/* Non-spec internal type */
325#define SMB2_OPLOCK_LEVEL_NOCHANGE 0x99
310 326
311/* Desired Access Flags */ 327/* Desired Access Flags */
312#define FILE_READ_DATA_LE cpu_to_le32(0x00000001) 328#define FILE_READ_DATA_LE cpu_to_le32(0x00000001)
@@ -404,7 +420,7 @@ struct smb2_create_req {
404 __le16 NameLength; 420 __le16 NameLength;
405 __le32 CreateContextsOffset; 421 __le32 CreateContextsOffset;
406 __le32 CreateContextsLength; 422 __le32 CreateContextsLength;
407 __u8 Buffer[1]; 423 __u8 Buffer[8];
408} __packed; 424} __packed;
409 425
410struct smb2_create_rsp { 426struct smb2_create_rsp {
@@ -428,6 +444,39 @@ struct smb2_create_rsp {
428 __u8 Buffer[1]; 444 __u8 Buffer[1];
429} __packed; 445} __packed;
430 446
447struct create_context {
448 __le32 Next;
449 __le16 NameOffset;
450 __le16 NameLength;
451 __le16 Reserved;
452 __le16 DataOffset;
453 __le32 DataLength;
454 __u8 Buffer[0];
455} __packed;
456
457#define SMB2_LEASE_NONE __constant_cpu_to_le32(0x00)
458#define SMB2_LEASE_READ_CACHING __constant_cpu_to_le32(0x01)
459#define SMB2_LEASE_HANDLE_CACHING __constant_cpu_to_le32(0x02)
460#define SMB2_LEASE_WRITE_CACHING __constant_cpu_to_le32(0x04)
461
462#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS __constant_cpu_to_le32(0x02)
463
464#define SMB2_LEASE_KEY_SIZE 16
465
466struct lease_context {
467 __le64 LeaseKeyLow;
468 __le64 LeaseKeyHigh;
469 __le32 LeaseState;
470 __le32 LeaseFlags;
471 __le64 LeaseDuration;
472} __packed;
473
474struct create_lease {
475 struct create_context ccontext;
476 __u8 Name[8];
477 struct lease_context lcontext;
478} __packed;
479
431/* Currently defined values for close flags */ 480/* Currently defined values for close flags */
432#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001) 481#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001)
433struct smb2_close_req { 482struct smb2_close_req {
@@ -453,6 +502,108 @@ struct smb2_close_rsp {
453 __le32 Attributes; 502 __le32 Attributes;
454} __packed; 503} __packed;
455 504
505struct smb2_flush_req {
506 struct smb2_hdr hdr;
507 __le16 StructureSize; /* Must be 24 */
508 __le16 Reserved1;
509 __le32 Reserved2;
510 __u64 PersistentFileId; /* opaque endianness */
511 __u64 VolatileFileId; /* opaque endianness */
512} __packed;
513
514struct smb2_flush_rsp {
515 struct smb2_hdr hdr;
516 __le16 StructureSize;
517 __le16 Reserved;
518} __packed;
519
520struct smb2_read_req {
521 struct smb2_hdr hdr;
522 __le16 StructureSize; /* Must be 49 */
523 __u8 Padding; /* offset from start of SMB2 header to place read */
524 __u8 Reserved;
525 __le32 Length;
526 __le64 Offset;
527 __u64 PersistentFileId; /* opaque endianness */
528 __u64 VolatileFileId; /* opaque endianness */
529 __le32 MinimumCount;
530 __le32 Channel; /* Reserved MBZ */
531 __le32 RemainingBytes;
532 __le16 ReadChannelInfoOffset; /* Reserved MBZ */
533 __le16 ReadChannelInfoLength; /* Reserved MBZ */
534 __u8 Buffer[1];
535} __packed;
536
537struct smb2_read_rsp {
538 struct smb2_hdr hdr;
539 __le16 StructureSize; /* Must be 17 */
540 __u8 DataOffset;
541 __u8 Reserved;
542 __le32 DataLength;
543 __le32 DataRemaining;
544 __u32 Reserved2;
545 __u8 Buffer[1];
546} __packed;
547
548/* For write request Flags field below the following flag is defined: */
549#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001
550
551struct smb2_write_req {
552 struct smb2_hdr hdr;
553 __le16 StructureSize; /* Must be 49 */
554 __le16 DataOffset; /* offset from start of SMB2 header to write data */
555 __le32 Length;
556 __le64 Offset;
557 __u64 PersistentFileId; /* opaque endianness */
558 __u64 VolatileFileId; /* opaque endianness */
559 __le32 Channel; /* Reserved MBZ */
560 __le32 RemainingBytes;
561 __le16 WriteChannelInfoOffset; /* Reserved MBZ */
562 __le16 WriteChannelInfoLength; /* Reserved MBZ */
563 __le32 Flags;
564 __u8 Buffer[1];
565} __packed;
566
567struct smb2_write_rsp {
568 struct smb2_hdr hdr;
569 __le16 StructureSize; /* Must be 17 */
570 __u8 DataOffset;
571 __u8 Reserved;
572 __le32 DataLength;
573 __le32 DataRemaining;
574 __u32 Reserved2;
575 __u8 Buffer[1];
576} __packed;
577
578#define SMB2_LOCKFLAG_SHARED_LOCK 0x0001
579#define SMB2_LOCKFLAG_EXCLUSIVE_LOCK 0x0002
580#define SMB2_LOCKFLAG_UNLOCK 0x0004
581#define SMB2_LOCKFLAG_FAIL_IMMEDIATELY 0x0010
582
583struct smb2_lock_element {
584 __le64 Offset;
585 __le64 Length;
586 __le32 Flags;
587 __le32 Reserved;
588} __packed;
589
590struct smb2_lock_req {
591 struct smb2_hdr hdr;
592 __le16 StructureSize; /* Must be 48 */
593 __le16 LockCount;
594 __le32 Reserved;
595 __u64 PersistentFileId; /* opaque endianness */
596 __u64 VolatileFileId; /* opaque endianness */
597 /* Followed by at least one */
598 struct smb2_lock_element locks[1];
599} __packed;
600
601struct smb2_lock_rsp {
602 struct smb2_hdr hdr;
603 __le16 StructureSize; /* Must be 4 */
604 __le16 Reserved;
605} __packed;
606
456struct smb2_echo_req { 607struct smb2_echo_req {
457 struct smb2_hdr hdr; 608 struct smb2_hdr hdr;
458 __le16 StructureSize; /* Must be 4 */ 609 __le16 StructureSize; /* Must be 4 */
@@ -465,6 +616,34 @@ struct smb2_echo_rsp {
465 __u16 Reserved; 616 __u16 Reserved;
466} __packed; 617} __packed;
467 618
619/* search (query_directory) Flags field */
620#define SMB2_RESTART_SCANS 0x01
621#define SMB2_RETURN_SINGLE_ENTRY 0x02
622#define SMB2_INDEX_SPECIFIED 0x04
623#define SMB2_REOPEN 0x10
624
625struct smb2_query_directory_req {
626 struct smb2_hdr hdr;
627 __le16 StructureSize; /* Must be 33 */
628 __u8 FileInformationClass;
629 __u8 Flags;
630 __le32 FileIndex;
631 __u64 PersistentFileId; /* opaque endianness */
632 __u64 VolatileFileId; /* opaque endianness */
633 __le16 FileNameOffset;
634 __le16 FileNameLength;
635 __le32 OutputBufferLength;
636 __u8 Buffer[1];
637} __packed;
638
639struct smb2_query_directory_rsp {
640 struct smb2_hdr hdr;
641 __le16 StructureSize; /* Must be 9 */
642 __le16 OutputBufferOffset;
643 __le32 OutputBufferLength;
644 __u8 Buffer[1];
645} __packed;
646
468/* Possible InfoType values */ 647/* Possible InfoType values */
469#define SMB2_O_INFO_FILE 0x01 648#define SMB2_O_INFO_FILE 0x01
470#define SMB2_O_INFO_FILESYSTEM 0x02 649#define SMB2_O_INFO_FILESYSTEM 0x02
@@ -495,11 +674,84 @@ struct smb2_query_info_rsp {
495 __u8 Buffer[1]; 674 __u8 Buffer[1];
496} __packed; 675} __packed;
497 676
677struct smb2_set_info_req {
678 struct smb2_hdr hdr;
679 __le16 StructureSize; /* Must be 33 */
680 __u8 InfoType;
681 __u8 FileInfoClass;
682 __le32 BufferLength;
683 __le16 BufferOffset;
684 __u16 Reserved;
685 __le32 AdditionalInformation;
686 __u64 PersistentFileId; /* opaque endianness */
687 __u64 VolatileFileId; /* opaque endianness */
688 __u8 Buffer[1];
689} __packed;
690
691struct smb2_set_info_rsp {
692 struct smb2_hdr hdr;
693 __le16 StructureSize; /* Must be 2 */
694} __packed;
695
696struct smb2_oplock_break {
697 struct smb2_hdr hdr;
698 __le16 StructureSize; /* Must be 24 */
699 __u8 OplockLevel;
700 __u8 Reserved;
701 __le32 Reserved2;
702 __u64 PersistentFid;
703 __u64 VolatileFid;
704} __packed;
705
706#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01)
707
708struct smb2_lease_break {
709 struct smb2_hdr hdr;
710 __le16 StructureSize; /* Must be 44 */
711 __le16 Reserved;
712 __le32 Flags;
713 __u8 LeaseKey[16];
714 __le32 CurrentLeaseState;
715 __le32 NewLeaseState;
716 __le32 BreakReason;
717 __le32 AccessMaskHint;
718 __le32 ShareMaskHint;
719} __packed;
720
721struct smb2_lease_ack {
722 struct smb2_hdr hdr;
723 __le16 StructureSize; /* Must be 36 */
724 __le16 Reserved;
725 __le32 Flags;
726 __u8 LeaseKey[16];
727 __le32 LeaseState;
728 __le64 LeaseDuration;
729} __packed;
730
498/* 731/*
499 * PDU infolevel structure definitions 732 * PDU infolevel structure definitions
500 * BB consider moving to a different header 733 * BB consider moving to a different header
501 */ 734 */
502 735
736/* File System Information Classes */
737#define FS_VOLUME_INFORMATION 1 /* Query */
738#define FS_LABEL_INFORMATION 2 /* Set */
739#define FS_SIZE_INFORMATION 3 /* Query */
740#define FS_DEVICE_INFORMATION 4 /* Query */
741#define FS_ATTRIBUTE_INFORMATION 5 /* Query */
742#define FS_CONTROL_INFORMATION 6 /* Query, Set */
743#define FS_FULL_SIZE_INFORMATION 7 /* Query */
744#define FS_OBJECT_ID_INFORMATION 8 /* Query, Set */
745#define FS_DRIVER_PATH_INFORMATION 9 /* Query */
746
747struct smb2_fs_full_size_info {
748 __le64 TotalAllocationUnits;
749 __le64 CallerAvailableAllocationUnits;
750 __le64 ActualAvailableAllocationUnits;
751 __le32 SectorsPerAllocationUnit;
752 __le32 BytesPerSector;
753} __packed;
754
503/* partial list of QUERY INFO levels */ 755/* partial list of QUERY INFO levels */
504#define FILE_DIRECTORY_INFORMATION 1 756#define FILE_DIRECTORY_INFORMATION 1
505#define FILE_FULL_DIRECTORY_INFORMATION 2 757#define FILE_FULL_DIRECTORY_INFORMATION 2
@@ -548,6 +800,28 @@ struct smb2_query_info_rsp {
548#define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50 800#define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50
549#define FILE_STANDARD_LINK_INFORMATION 54 801#define FILE_STANDARD_LINK_INFORMATION 54
550 802
803struct smb2_file_internal_info {
804 __le64 IndexNumber;
805} __packed; /* level 6 Query */
806
807struct smb2_file_rename_info { /* encoding of request for level 10 */
808 __u8 ReplaceIfExists; /* 1 = replace existing target with new */
809 /* 0 = fail if target already exists */
810 __u8 Reserved[7];
811 __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
812 __le32 FileNameLength;
813 char FileName[0]; /* New name to be assigned */
814} __packed; /* level 10 Set */
815
816struct smb2_file_link_info { /* encoding of request for level 11 */
817 __u8 ReplaceIfExists; /* 1 = replace existing link with new */
818 /* 0 = fail if link already exists */
819 __u8 Reserved[7];
820 __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
821 __le32 FileNameLength;
822 char FileName[0]; /* Name to be assigned to new link */
823} __packed; /* level 11 Set */
824
551/* 825/*
552 * This level 18, although with struct with same name is different from cifs 826 * This level 18, although with struct with same name is different from cifs
553 * level 0x107. Level 0x107 has an extra u64 between AccessFlags and 827 * level 0x107. Level 0x107 has an extra u64 between AccessFlags and
@@ -576,4 +850,8 @@ struct smb2_file_all_info { /* data block encoding of response to level 18 */
576 char FileName[1]; 850 char FileName[1];
577} __packed; /* level 18 Query */ 851} __packed; /* level 18 Query */
578 852
853struct smb2_file_eof_info { /* encoding of request for level 10 */
854 __le64 EndOfFile; /* new end of file value */
855} __packed; /* level 20 Set */
856
579#endif /* _SMB2PDU_H */ 857#endif /* _SMB2PDU_H */
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index bfaa7b148afd..7d25f8b14f93 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -26,6 +26,7 @@
26#include <linux/key-type.h> 26#include <linux/key-type.h>
27 27
28struct statfs; 28struct statfs;
29struct smb_rqst;
29 30
30/* 31/*
31 ***************************************************************** 32 *****************************************************************
@@ -34,24 +35,35 @@ struct statfs;
34 */ 35 */
35extern int map_smb2_to_linux_error(char *buf, bool log_err); 36extern int map_smb2_to_linux_error(char *buf, bool log_err);
36extern int smb2_check_message(char *buf, unsigned int length); 37extern int smb2_check_message(char *buf, unsigned int length);
37extern unsigned int smb2_calc_size(struct smb2_hdr *hdr); 38extern unsigned int smb2_calc_size(void *buf);
38extern char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr); 39extern char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr);
39extern __le16 *cifs_convert_path_to_utf16(const char *from, 40extern __le16 *cifs_convert_path_to_utf16(const char *from,
40 struct cifs_sb_info *cifs_sb); 41 struct cifs_sb_info *cifs_sb);
41 42
43extern int smb2_verify_signature(struct smb_rqst *, struct TCP_Server_Info *);
42extern int smb2_check_receive(struct mid_q_entry *mid, 44extern int smb2_check_receive(struct mid_q_entry *mid,
43 struct TCP_Server_Info *server, bool log_error); 45 struct TCP_Server_Info *server, bool log_error);
44extern int smb2_setup_request(struct cifs_ses *ses, struct kvec *iov, 46extern struct mid_q_entry *smb2_setup_request(struct cifs_ses *ses,
45 unsigned int nvec, struct mid_q_entry **ret_mid); 47 struct smb_rqst *rqst);
46extern int smb2_setup_async_request(struct TCP_Server_Info *server, 48extern struct mid_q_entry *smb2_setup_async_request(
47 struct kvec *iov, unsigned int nvec, 49 struct TCP_Server_Info *server, struct smb_rqst *rqst);
48 struct mid_q_entry **ret_mid);
49extern void smb2_echo_request(struct work_struct *work); 50extern void smb2_echo_request(struct work_struct *work);
51extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode);
52extern __u8 smb2_map_lease_to_oplock(__le32 lease_state);
53extern bool smb2_is_valid_oplock_break(char *buffer,
54 struct TCP_Server_Info *srv);
50 55
56extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst,
57 struct smb2_file_all_info *src);
51extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, 58extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
52 struct cifs_sb_info *cifs_sb, 59 struct cifs_sb_info *cifs_sb,
53 const char *full_path, FILE_ALL_INFO *data, 60 const char *full_path, FILE_ALL_INFO *data,
54 bool *adjust_tz); 61 bool *adjust_tz);
62extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
63 const char *full_path, __u64 size,
64 struct cifs_sb_info *cifs_sb, bool set_alloc);
65extern int smb2_set_file_info(struct inode *inode, const char *full_path,
66 FILE_BASIC_INFO *buf, const unsigned int xid);
55extern int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, 67extern int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon,
56 const char *name, struct cifs_sb_info *cifs_sb); 68 const char *name, struct cifs_sb_info *cifs_sb);
57extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path, 69extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path,
@@ -59,6 +71,24 @@ extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path,
59 struct cifs_tcon *tcon, const unsigned int xid); 71 struct cifs_tcon *tcon, const unsigned int xid);
60extern int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, 72extern int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon,
61 const char *name, struct cifs_sb_info *cifs_sb); 73 const char *name, struct cifs_sb_info *cifs_sb);
74extern int smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon,
75 const char *name, struct cifs_sb_info *cifs_sb);
76extern int smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon,
77 const char *from_name, const char *to_name,
78 struct cifs_sb_info *cifs_sb);
79extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
80 const char *from_name, const char *to_name,
81 struct cifs_sb_info *cifs_sb);
82
83extern int smb2_open_file(const unsigned int xid, struct cifs_tcon *tcon,
84 const char *full_path, int disposition,
85 int desired_access, int create_options,
86 struct cifs_fid *fid, __u32 *oplock,
87 FILE_ALL_INFO *buf, struct cifs_sb_info *cifs_sb);
88extern void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
89extern int smb2_unlock_range(struct cifsFileInfo *cfile,
90 struct file_lock *flock, const unsigned int xid);
91extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile);
62 92
63/* 93/*
64 * SMB2 Worker functions - most of protocol specific implementation details 94 * SMB2 Worker functions - most of protocol specific implementation details
@@ -75,12 +105,55 @@ extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon);
75extern int SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, 105extern int SMB2_open(const unsigned int xid, struct cifs_tcon *tcon,
76 __le16 *path, u64 *persistent_fid, u64 *volatile_fid, 106 __le16 *path, u64 *persistent_fid, u64 *volatile_fid,
77 __u32 desired_access, __u32 create_disposition, 107 __u32 desired_access, __u32 create_disposition,
78 __u32 file_attributes, __u32 create_options); 108 __u32 file_attributes, __u32 create_options,
109 __u8 *oplock, struct smb2_file_all_info *buf);
79extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, 110extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
80 u64 persistent_file_id, u64 volatile_file_id); 111 u64 persistent_file_id, u64 volatile_file_id);
112extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon,
113 u64 persistent_file_id, u64 volatile_file_id);
81extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, 114extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
82 u64 persistent_file_id, u64 volatile_file_id, 115 u64 persistent_file_id, u64 volatile_file_id,
83 struct smb2_file_all_info *data); 116 struct smb2_file_all_info *data);
117extern int SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon,
118 u64 persistent_fid, u64 volatile_fid,
119 __le64 *uniqueid);
120extern int smb2_async_readv(struct cifs_readdata *rdata);
121extern int SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
122 unsigned int *nbytes, char **buf, int *buf_type);
123extern int smb2_async_writev(struct cifs_writedata *wdata);
124extern int SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
125 unsigned int *nbytes, struct kvec *iov, int n_vec);
84extern int SMB2_echo(struct TCP_Server_Info *server); 126extern int SMB2_echo(struct TCP_Server_Info *server);
127extern int SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
128 u64 persistent_fid, u64 volatile_fid, int index,
129 struct cifs_search_info *srch_inf);
130extern int SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon,
131 u64 persistent_fid, u64 volatile_fid,
132 __le16 *target_file);
133extern int SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
134 u64 persistent_fid, u64 volatile_fid,
135 __le16 *target_file);
136extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon,
137 u64 persistent_fid, u64 volatile_fid, u32 pid,
138 __le64 *eof);
139extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon,
140 u64 persistent_fid, u64 volatile_fid,
141 FILE_BASIC_INFO *buf);
142extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
143 const u64 persistent_fid, const u64 volatile_fid,
144 const __u8 oplock_level);
145extern int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
146 u64 persistent_file_id, u64 volatile_file_id,
147 struct kstatfs *FSData);
148extern int SMB2_lock(const unsigned int xid, struct cifs_tcon *tcon,
149 const __u64 persist_fid, const __u64 volatile_fid,
150 const __u32 pid, const __u64 length, const __u64 offset,
151 const __u32 lockFlags, const bool wait);
152extern int smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
153 const __u64 persist_fid, const __u64 volatile_fid,
154 const __u32 pid, const __u32 num_lock,
155 struct smb2_lock_element *buf);
156extern int SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
157 __u8 *lease_key, const __le32 lease_state);
85 158
86#endif /* _SMB2PROTO_H */ 159#endif /* _SMB2PROTO_H */
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 31f5d420b3ea..2a5fdf26f79f 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -30,12 +30,156 @@
30#include <linux/uaccess.h> 30#include <linux/uaccess.h>
31#include <asm/processor.h> 31#include <asm/processor.h>
32#include <linux/mempool.h> 32#include <linux/mempool.h>
33#include <linux/highmem.h>
33#include "smb2pdu.h" 34#include "smb2pdu.h"
34#include "cifsglob.h" 35#include "cifsglob.h"
35#include "cifsproto.h" 36#include "cifsproto.h"
36#include "smb2proto.h" 37#include "smb2proto.h"
37#include "cifs_debug.h" 38#include "cifs_debug.h"
38#include "smb2status.h" 39#include "smb2status.h"
40#include "smb2glob.h"
41
42static int
43smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
44{
45 int i, rc;
46 unsigned char smb2_signature[SMB2_HMACSHA256_SIZE];
47 unsigned char *sigptr = smb2_signature;
48 struct kvec *iov = rqst->rq_iov;
49 int n_vec = rqst->rq_nvec;
50 struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base;
51
52 memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE);
53 memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
54
55 rc = crypto_shash_setkey(server->secmech.hmacsha256,
56 server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
57 if (rc) {
58 cERROR(1, "%s: Could not update with response\n", __func__);
59 return rc;
60 }
61
62 rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
63 if (rc) {
64 cERROR(1, "%s: Could not init md5\n", __func__);
65 return rc;
66 }
67
68 for (i = 0; i < n_vec; i++) {
69 if (iov[i].iov_len == 0)
70 continue;
71 if (iov[i].iov_base == NULL) {
72 cERROR(1, "null iovec entry");
73 return -EIO;
74 }
75 /*
76 * The first entry includes a length field (which does not get
77 * signed that occupies the first 4 bytes before the header).
78 */
79 if (i == 0) {
80 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
81 break; /* nothing to sign or corrupt header */
82 rc =
83 crypto_shash_update(
84 &server->secmech.sdeschmacsha256->shash,
85 iov[i].iov_base + 4, iov[i].iov_len - 4);
86 } else {
87 rc =
88 crypto_shash_update(
89 &server->secmech.sdeschmacsha256->shash,
90 iov[i].iov_base, iov[i].iov_len);
91 }
92 if (rc) {
93 cERROR(1, "%s: Could not update with payload\n",
94 __func__);
95 return rc;
96 }
97 }
98
99 /* now hash over the rq_pages array */
100 for (i = 0; i < rqst->rq_npages; i++) {
101 struct kvec p_iov;
102
103 cifs_rqst_page_to_kvec(rqst, i, &p_iov);
104 crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
105 p_iov.iov_base, p_iov.iov_len);
106 kunmap(rqst->rq_pages[i]);
107 }
108
109 rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash,
110 sigptr);
111 if (rc)
112 cERROR(1, "%s: Could not generate sha256 hash\n", __func__);
113
114 memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE);
115
116 return rc;
117}
118
119/* must be called with server->srv_mutex held */
120static int
121smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
122{
123 int rc = 0;
124 struct smb2_hdr *smb2_pdu = rqst->rq_iov[0].iov_base;
125
126 if (!(smb2_pdu->Flags & SMB2_FLAGS_SIGNED) ||
127 server->tcpStatus == CifsNeedNegotiate)
128 return rc;
129
130 if (!server->session_estab) {
131 strncpy(smb2_pdu->Signature, "BSRSPYL", 8);
132 return rc;
133 }
134
135 rc = smb2_calc_signature(rqst, server);
136
137 return rc;
138}
139
140int
141smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
142{
143 unsigned int rc;
144 char server_response_sig[16];
145 struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
146
147 if ((smb2_pdu->Command == SMB2_NEGOTIATE) ||
148 (smb2_pdu->Command == SMB2_OPLOCK_BREAK) ||
149 (!server->session_estab))
150 return 0;
151
152 /*
153 * BB what if signatures are supposed to be on for session but
154 * server does not send one? BB
155 */
156
157 /* Do not need to verify session setups with signature "BSRSPYL " */
158 if (memcmp(smb2_pdu->Signature, "BSRSPYL ", 8) == 0)
159 cFYI(1, "dummy signature received for smb command 0x%x",
160 smb2_pdu->Command);
161
162 /*
163 * Save off the origiginal signature so we can modify the smb and check
164 * our calculated signature against what the server sent.
165 */
166 memcpy(server_response_sig, smb2_pdu->Signature, SMB2_SIGNATURE_SIZE);
167
168 memset(smb2_pdu->Signature, 0, SMB2_SIGNATURE_SIZE);
169
170 mutex_lock(&server->srv_mutex);
171 rc = smb2_calc_signature(rqst, server);
172 mutex_unlock(&server->srv_mutex);
173
174 if (rc)
175 return rc;
176
177 if (memcmp(server_response_sig, smb2_pdu->Signature,
178 SMB2_SIGNATURE_SIZE))
179 return -EACCES;
180 else
181 return 0;
182}
39 183
40/* 184/*
41 * Set message id for the request. Should be called after wait_for_free_request 185 * Set message id for the request. Should be called after wait_for_free_request
@@ -115,58 +259,66 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
115 bool log_error) 259 bool log_error)
116{ 260{
117 unsigned int len = get_rfc1002_length(mid->resp_buf); 261 unsigned int len = get_rfc1002_length(mid->resp_buf);
262 struct kvec iov;
263 struct smb_rqst rqst = { .rq_iov = &iov,
264 .rq_nvec = 1 };
265
266 iov.iov_base = (char *)mid->resp_buf;
267 iov.iov_len = get_rfc1002_length(mid->resp_buf) + 4;
118 268
119 dump_smb(mid->resp_buf, min_t(u32, 80, len)); 269 dump_smb(mid->resp_buf, min_t(u32, 80, len));
120 /* convert the length into a more usable form */ 270 /* convert the length into a more usable form */
121 /* BB - uncomment with SMB2 signing implementation */ 271 if ((len > 24) &&
122 /* if ((len > 24) &&
123 (server->sec_mode & (SECMODE_SIGN_REQUIRED|SECMODE_SIGN_ENABLED))) { 272 (server->sec_mode & (SECMODE_SIGN_REQUIRED|SECMODE_SIGN_ENABLED))) {
124 if (smb2_verify_signature(mid->resp_buf, server)) 273 int rc;
125 cERROR(1, "Unexpected SMB signature"); 274
126 } */ 275 rc = smb2_verify_signature(&rqst, server);
276 if (rc)
277 cERROR(1, "SMB signature verification returned error = "
278 "%d", rc);
279 }
127 280
128 return map_smb2_to_linux_error(mid->resp_buf, log_error); 281 return map_smb2_to_linux_error(mid->resp_buf, log_error);
129} 282}
130 283
131int 284struct mid_q_entry *
132smb2_setup_request(struct cifs_ses *ses, struct kvec *iov, 285smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst)
133 unsigned int nvec, struct mid_q_entry **ret_mid)
134{ 286{
135 int rc; 287 int rc;
136 struct smb2_hdr *hdr = (struct smb2_hdr *)iov[0].iov_base; 288 struct smb2_hdr *hdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
137 struct mid_q_entry *mid; 289 struct mid_q_entry *mid;
138 290
139 smb2_seq_num_into_buf(ses->server, hdr); 291 smb2_seq_num_into_buf(ses->server, hdr);
140 292
141 rc = smb2_get_mid_entry(ses, hdr, &mid); 293 rc = smb2_get_mid_entry(ses, hdr, &mid);
142 if (rc) 294 if (rc)
143 return rc; 295 return ERR_PTR(rc);
144 /* rc = smb2_sign_smb2(iov, nvec, ses->server); 296 rc = smb2_sign_rqst(rqst, ses->server);
145 if (rc) 297 if (rc) {
146 delete_mid(mid); */ 298 cifs_delete_mid(mid);
147 *ret_mid = mid; 299 return ERR_PTR(rc);
148 return rc; 300 }
301 return mid;
149} 302}
150 303
151int 304struct mid_q_entry *
152smb2_setup_async_request(struct TCP_Server_Info *server, struct kvec *iov, 305smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
153 unsigned int nvec, struct mid_q_entry **ret_mid)
154{ 306{
155 int rc = 0; 307 int rc;
156 struct smb2_hdr *hdr = (struct smb2_hdr *)iov[0].iov_base; 308 struct smb2_hdr *hdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
157 struct mid_q_entry *mid; 309 struct mid_q_entry *mid;
158 310
159 smb2_seq_num_into_buf(server, hdr); 311 smb2_seq_num_into_buf(server, hdr);
160 312
161 mid = smb2_mid_entry_alloc(hdr, server); 313 mid = smb2_mid_entry_alloc(hdr, server);
162 if (mid == NULL) 314 if (mid == NULL)
163 return -ENOMEM; 315 return ERR_PTR(-ENOMEM);
164 316
165 /* rc = smb2_sign_smb2(iov, nvec, server); 317 rc = smb2_sign_rqst(rqst, server);
166 if (rc) { 318 if (rc) {
167 DeleteMidQEntry(mid); 319 DeleteMidQEntry(mid);
168 return rc; 320 return ERR_PTR(rc);
169 }*/ 321 }
170 *ret_mid = mid; 322
171 return rc; 323 return mid;
172} 324}
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index d9b639b95fa8..76d974c952fe 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -27,6 +27,8 @@
27#include <linux/net.h> 27#include <linux/net.h>
28#include <linux/delay.h> 28#include <linux/delay.h>
29#include <linux/freezer.h> 29#include <linux/freezer.h>
30#include <linux/tcp.h>
31#include <linux/highmem.h>
30#include <asm/uaccess.h> 32#include <asm/uaccess.h>
31#include <asm/processor.h> 33#include <asm/processor.h>
32#include <linux/mempool.h> 34#include <linux/mempool.h>
@@ -109,8 +111,8 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
109 mempool_free(midEntry, cifs_mid_poolp); 111 mempool_free(midEntry, cifs_mid_poolp);
110} 112}
111 113
112static void 114void
113delete_mid(struct mid_q_entry *mid) 115cifs_delete_mid(struct mid_q_entry *mid)
114{ 116{
115 spin_lock(&GlobalMid_Lock); 117 spin_lock(&GlobalMid_Lock);
116 list_del(&mid->qhead); 118 list_del(&mid->qhead);
@@ -119,18 +121,29 @@ delete_mid(struct mid_q_entry *mid)
119 DeleteMidQEntry(mid); 121 DeleteMidQEntry(mid);
120} 122}
121 123
124/*
125 * smb_send_kvec - send an array of kvecs to the server
126 * @server: Server to send the data to
127 * @iov: Pointer to array of kvecs
128 * @n_vec: length of kvec array
129 * @sent: amount of data sent on socket is stored here
130 *
131 * Our basic "send data to server" function. Should be called with srv_mutex
132 * held. The caller is responsible for handling the results.
133 */
122static int 134static int
123smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) 135smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec,
136 size_t *sent)
124{ 137{
125 int rc = 0; 138 int rc = 0;
126 int i = 0; 139 int i = 0;
127 struct msghdr smb_msg; 140 struct msghdr smb_msg;
128 unsigned int len = iov[0].iov_len; 141 unsigned int remaining;
129 unsigned int total_len; 142 size_t first_vec = 0;
130 int first_vec = 0;
131 unsigned int smb_buf_length = get_rfc1002_length(iov[0].iov_base);
132 struct socket *ssocket = server->ssocket; 143 struct socket *ssocket = server->ssocket;
133 144
145 *sent = 0;
146
134 if (ssocket == NULL) 147 if (ssocket == NULL)
135 return -ENOTSOCK; /* BB eventually add reconnect code here */ 148 return -ENOTSOCK; /* BB eventually add reconnect code here */
136 149
@@ -143,56 +156,66 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
143 else 156 else
144 smb_msg.msg_flags = MSG_NOSIGNAL; 157 smb_msg.msg_flags = MSG_NOSIGNAL;
145 158
146 total_len = 0; 159 remaining = 0;
147 for (i = 0; i < n_vec; i++) 160 for (i = 0; i < n_vec; i++)
148 total_len += iov[i].iov_len; 161 remaining += iov[i].iov_len;
149
150 cFYI(1, "Sending smb: total_len %d", total_len);
151 dump_smb(iov[0].iov_base, len);
152 162
153 i = 0; 163 i = 0;
154 while (total_len) { 164 while (remaining) {
165 /*
166 * If blocking send, we try 3 times, since each can block
167 * for 5 seconds. For nonblocking we have to try more
168 * but wait increasing amounts of time allowing time for
169 * socket to clear. The overall time we wait in either
170 * case to send on the socket is about 15 seconds.
171 * Similarly we wait for 15 seconds for a response from
172 * the server in SendReceive[2] for the server to send
173 * a response back for most types of requests (except
174 * SMB Write past end of file which can be slow, and
175 * blocking lock operations). NFS waits slightly longer
176 * than CIFS, but this can make it take longer for
177 * nonresponsive servers to be detected and 15 seconds
178 * is more than enough time for modern networks to
179 * send a packet. In most cases if we fail to send
180 * after the retries we will kill the socket and
181 * reconnect which may clear the network problem.
182 */
155 rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec], 183 rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec],
156 n_vec - first_vec, total_len); 184 n_vec - first_vec, remaining);
157 if ((rc == -ENOSPC) || (rc == -EAGAIN)) { 185 if (rc == -ENOSPC || rc == -EAGAIN) {
158 i++;
159 /* 186 /*
160 * If blocking send we try 3 times, since each can block 187 * Catch if a low level driver returns -ENOSPC. This
161 * for 5 seconds. For nonblocking we have to try more 188 * WARN_ON will be removed by 3.10 if no one reports
162 * but wait increasing amounts of time allowing time for 189 * seeing this.
163 * socket to clear. The overall time we wait in either
164 * case to send on the socket is about 15 seconds.
165 * Similarly we wait for 15 seconds for a response from
166 * the server in SendReceive[2] for the server to send
167 * a response back for most types of requests (except
168 * SMB Write past end of file which can be slow, and
169 * blocking lock operations). NFS waits slightly longer
170 * than CIFS, but this can make it take longer for
171 * nonresponsive servers to be detected and 15 seconds
172 * is more than enough time for modern networks to
173 * send a packet. In most cases if we fail to send
174 * after the retries we will kill the socket and
175 * reconnect which may clear the network problem.
176 */ 190 */
177 if ((i >= 14) || (!server->noblocksnd && (i > 2))) { 191 WARN_ON_ONCE(rc == -ENOSPC);
178 cERROR(1, "sends on sock %p stuck for 15 seconds", 192 i++;
179 ssocket); 193 if (i >= 14 || (!server->noblocksnd && (i > 2))) {
194 cERROR(1, "sends on sock %p stuck for 15 "
195 "seconds", ssocket);
180 rc = -EAGAIN; 196 rc = -EAGAIN;
181 break; 197 break;
182 } 198 }
183 msleep(1 << i); 199 msleep(1 << i);
184 continue; 200 continue;
185 } 201 }
202
186 if (rc < 0) 203 if (rc < 0)
187 break; 204 break;
188 205
189 if (rc == total_len) { 206 /* send was at least partially successful */
190 total_len = 0; 207 *sent += rc;
208
209 if (rc == remaining) {
210 remaining = 0;
191 break; 211 break;
192 } else if (rc > total_len) { 212 }
193 cERROR(1, "sent %d requested %d", rc, total_len); 213
214 if (rc > remaining) {
215 cERROR(1, "sent %d requested %d", rc, remaining);
194 break; 216 break;
195 } 217 }
218
196 if (rc == 0) { 219 if (rc == 0) {
197 /* should never happen, letting socket clear before 220 /* should never happen, letting socket clear before
198 retrying is our only obvious option here */ 221 retrying is our only obvious option here */
@@ -200,7 +223,9 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
200 msleep(500); 223 msleep(500);
201 continue; 224 continue;
202 } 225 }
203 total_len -= rc; 226
227 remaining -= rc;
228
204 /* the line below resets i */ 229 /* the line below resets i */
205 for (i = first_vec; i < n_vec; i++) { 230 for (i = first_vec; i < n_vec; i++) {
206 if (iov[i].iov_len) { 231 if (iov[i].iov_len) {
@@ -215,16 +240,97 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
215 } 240 }
216 } 241 }
217 } 242 }
243
218 i = 0; /* in case we get ENOSPC on the next send */ 244 i = 0; /* in case we get ENOSPC on the next send */
245 rc = 0;
219 } 246 }
247 return rc;
248}
249
250/**
251 * rqst_page_to_kvec - Turn a slot in the smb_rqst page array into a kvec
252 * @rqst: pointer to smb_rqst
253 * @idx: index into the array of the page
254 * @iov: pointer to struct kvec that will hold the result
255 *
256 * Helper function to convert a slot in the rqst->rq_pages array into a kvec.
257 * The page will be kmapped and the address placed into iov_base. The length
258 * will then be adjusted according to the ptailoff.
259 */
260void
261cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx,
262 struct kvec *iov)
263{
264 /*
265 * FIXME: We could avoid this kmap altogether if we used
266 * kernel_sendpage instead of kernel_sendmsg. That will only
267 * work if signing is disabled though as sendpage inlines the
268 * page directly into the fraglist. If userspace modifies the
269 * page after we calculate the signature, then the server will
270 * reject it and may break the connection. kernel_sendmsg does
271 * an extra copy of the data and avoids that issue.
272 */
273 iov->iov_base = kmap(rqst->rq_pages[idx]);
274
275 /* if last page, don't send beyond this offset into page */
276 if (idx == (rqst->rq_npages - 1))
277 iov->iov_len = rqst->rq_tailsz;
278 else
279 iov->iov_len = rqst->rq_pagesz;
280}
281
282static int
283smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
284{
285 int rc;
286 struct kvec *iov = rqst->rq_iov;
287 int n_vec = rqst->rq_nvec;
288 unsigned int smb_buf_length = get_rfc1002_length(iov[0].iov_base);
289 unsigned int i;
290 size_t total_len = 0, sent;
291 struct socket *ssocket = server->ssocket;
292 int val = 1;
293
294 cFYI(1, "Sending smb: smb_len=%u", smb_buf_length);
295 dump_smb(iov[0].iov_base, iov[0].iov_len);
296
297 /* cork the socket */
298 kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK,
299 (char *)&val, sizeof(val));
300
301 rc = smb_send_kvec(server, iov, n_vec, &sent);
302 if (rc < 0)
303 goto uncork;
304
305 total_len += sent;
306
307 /* now walk the page array and send each page in it */
308 for (i = 0; i < rqst->rq_npages; i++) {
309 struct kvec p_iov;
310
311 cifs_rqst_page_to_kvec(rqst, i, &p_iov);
312 rc = smb_send_kvec(server, &p_iov, 1, &sent);
313 kunmap(rqst->rq_pages[i]);
314 if (rc < 0)
315 break;
316
317 total_len += sent;
318 }
319
320uncork:
321 /* uncork it */
322 val = 0;
323 kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK,
324 (char *)&val, sizeof(val));
220 325
221 if ((total_len > 0) && (total_len != smb_buf_length + 4)) { 326 if ((total_len > 0) && (total_len != smb_buf_length + 4)) {
222 cFYI(1, "partial send (%d remaining), terminating session", 327 cFYI(1, "partial send (wanted=%u sent=%zu): terminating "
223 total_len); 328 "session", smb_buf_length + 4, total_len);
224 /* If we have only sent part of an SMB then the next SMB 329 /*
225 could be taken as the remainder of this one. We need 330 * If we have only sent part of an SMB then the next SMB could
226 to kill the socket so the server throws away the partial 331 * be taken as the remainder of this one. We need to kill the
227 SMB */ 332 * socket so the server throws away the partial SMB
333 */
228 server->tcpStatus = CifsNeedReconnect; 334 server->tcpStatus = CifsNeedReconnect;
229 } 335 }
230 336
@@ -236,6 +342,15 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
236 return rc; 342 return rc;
237} 343}
238 344
345static int
346smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
347{
348 struct smb_rqst rqst = { .rq_iov = iov,
349 .rq_nvec = n_vec };
350
351 return smb_send_rqst(server, &rqst);
352}
353
239int 354int
240smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer, 355smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
241 unsigned int smb_buf_length) 356 unsigned int smb_buf_length)
@@ -345,12 +460,11 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
345 return 0; 460 return 0;
346} 461}
347 462
348int 463struct mid_q_entry *
349cifs_setup_async_request(struct TCP_Server_Info *server, struct kvec *iov, 464cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
350 unsigned int nvec, struct mid_q_entry **ret_mid)
351{ 465{
352 int rc; 466 int rc;
353 struct smb_hdr *hdr = (struct smb_hdr *)iov[0].iov_base; 467 struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
354 struct mid_q_entry *mid; 468 struct mid_q_entry *mid;
355 469
356 /* enable signing if server requires it */ 470 /* enable signing if server requires it */
@@ -359,16 +473,15 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct kvec *iov,
359 473
360 mid = AllocMidQEntry(hdr, server); 474 mid = AllocMidQEntry(hdr, server);
361 if (mid == NULL) 475 if (mid == NULL)
362 return -ENOMEM; 476 return ERR_PTR(-ENOMEM);
363 477
364 rc = cifs_sign_smbv(iov, nvec, server, &mid->sequence_number); 478 rc = cifs_sign_rqst(rqst, server, &mid->sequence_number);
365 if (rc) { 479 if (rc) {
366 DeleteMidQEntry(mid); 480 DeleteMidQEntry(mid);
367 return rc; 481 return ERR_PTR(rc);
368 } 482 }
369 483
370 *ret_mid = mid; 484 return mid;
371 return 0;
372} 485}
373 486
374/* 487/*
@@ -376,9 +489,9 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct kvec *iov,
376 * the result. Caller is responsible for dealing with timeouts. 489 * the result. Caller is responsible for dealing with timeouts.
377 */ 490 */
378int 491int
379cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, 492cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
380 unsigned int nvec, mid_receive_t *receive, 493 mid_receive_t *receive, mid_callback_t *callback,
381 mid_callback_t *callback, void *cbdata, const int flags) 494 void *cbdata, const int flags)
382{ 495{
383 int rc, timeout, optype; 496 int rc, timeout, optype;
384 struct mid_q_entry *mid; 497 struct mid_q_entry *mid;
@@ -391,12 +504,12 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
391 return rc; 504 return rc;
392 505
393 mutex_lock(&server->srv_mutex); 506 mutex_lock(&server->srv_mutex);
394 rc = server->ops->setup_async_request(server, iov, nvec, &mid); 507 mid = server->ops->setup_async_request(server, rqst);
395 if (rc) { 508 if (IS_ERR(mid)) {
396 mutex_unlock(&server->srv_mutex); 509 mutex_unlock(&server->srv_mutex);
397 add_credits(server, 1, optype); 510 add_credits(server, 1, optype);
398 wake_up(&server->request_q); 511 wake_up(&server->request_q);
399 return rc; 512 return PTR_ERR(mid);
400 } 513 }
401 514
402 mid->receive = receive; 515 mid->receive = receive;
@@ -411,7 +524,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
411 524
412 525
413 cifs_in_send_inc(server); 526 cifs_in_send_inc(server);
414 rc = smb_sendv(server, iov, nvec); 527 rc = smb_send_rqst(server, rqst);
415 cifs_in_send_dec(server); 528 cifs_in_send_dec(server);
416 cifs_save_when_sent(mid); 529 cifs_save_when_sent(mid);
417 mutex_unlock(&server->srv_mutex); 530 mutex_unlock(&server->srv_mutex);
@@ -419,7 +532,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
419 if (rc == 0) 532 if (rc == 0)
420 return 0; 533 return 0;
421 534
422 delete_mid(mid); 535 cifs_delete_mid(mid);
423 add_credits(server, 1, optype); 536 add_credits(server, 1, optype);
424 wake_up(&server->request_q); 537 wake_up(&server->request_q);
425 return rc; 538 return rc;
@@ -504,11 +617,13 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
504 if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { 617 if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
505 struct kvec iov; 618 struct kvec iov;
506 int rc = 0; 619 int rc = 0;
620 struct smb_rqst rqst = { .rq_iov = &iov,
621 .rq_nvec = 1 };
507 622
508 iov.iov_base = mid->resp_buf; 623 iov.iov_base = mid->resp_buf;
509 iov.iov_len = len; 624 iov.iov_len = len;
510 /* FIXME: add code to kill session */ 625 /* FIXME: add code to kill session */
511 rc = cifs_verify_signature(&iov, 1, server, 626 rc = cifs_verify_signature(&rqst, server,
512 mid->sequence_number + 1); 627 mid->sequence_number + 1);
513 if (rc) 628 if (rc)
514 cERROR(1, "SMB signature verification returned error = " 629 cERROR(1, "SMB signature verification returned error = "
@@ -519,22 +634,22 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
519 return map_smb_to_linux_error(mid->resp_buf, log_error); 634 return map_smb_to_linux_error(mid->resp_buf, log_error);
520} 635}
521 636
522int 637struct mid_q_entry *
523cifs_setup_request(struct cifs_ses *ses, struct kvec *iov, 638cifs_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst)
524 unsigned int nvec, struct mid_q_entry **ret_mid)
525{ 639{
526 int rc; 640 int rc;
527 struct smb_hdr *hdr = (struct smb_hdr *)iov[0].iov_base; 641 struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
528 struct mid_q_entry *mid; 642 struct mid_q_entry *mid;
529 643
530 rc = allocate_mid(ses, hdr, &mid); 644 rc = allocate_mid(ses, hdr, &mid);
531 if (rc) 645 if (rc)
532 return rc; 646 return ERR_PTR(rc);
533 rc = cifs_sign_smbv(iov, nvec, ses->server, &mid->sequence_number); 647 rc = cifs_sign_rqst(rqst, ses->server, &mid->sequence_number);
534 if (rc) 648 if (rc) {
535 delete_mid(mid); 649 cifs_delete_mid(mid);
536 *ret_mid = mid; 650 return ERR_PTR(rc);
537 return rc; 651 }
652 return mid;
538} 653}
539 654
540int 655int
@@ -547,6 +662,8 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
547 struct mid_q_entry *midQ; 662 struct mid_q_entry *midQ;
548 char *buf = iov[0].iov_base; 663 char *buf = iov[0].iov_base;
549 unsigned int credits = 1; 664 unsigned int credits = 1;
665 struct smb_rqst rqst = { .rq_iov = iov,
666 .rq_nvec = n_vec };
550 667
551 timeout = flags & CIFS_TIMEOUT_MASK; 668 timeout = flags & CIFS_TIMEOUT_MASK;
552 optype = flags & CIFS_OP_MASK; 669 optype = flags & CIFS_OP_MASK;
@@ -584,13 +701,13 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
584 701
585 mutex_lock(&ses->server->srv_mutex); 702 mutex_lock(&ses->server->srv_mutex);
586 703
587 rc = ses->server->ops->setup_request(ses, iov, n_vec, &midQ); 704 midQ = ses->server->ops->setup_request(ses, &rqst);
588 if (rc) { 705 if (IS_ERR(midQ)) {
589 mutex_unlock(&ses->server->srv_mutex); 706 mutex_unlock(&ses->server->srv_mutex);
590 cifs_small_buf_release(buf); 707 cifs_small_buf_release(buf);
591 /* Update # of requests on wire to server */ 708 /* Update # of requests on wire to server */
592 add_credits(ses->server, 1, optype); 709 add_credits(ses->server, 1, optype);
593 return rc; 710 return PTR_ERR(midQ);
594 } 711 }
595 712
596 midQ->mid_state = MID_REQUEST_SUBMITTED; 713 midQ->mid_state = MID_REQUEST_SUBMITTED;
@@ -652,11 +769,11 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
652 rc = ses->server->ops->check_receive(midQ, ses->server, 769 rc = ses->server->ops->check_receive(midQ, ses->server,
653 flags & CIFS_LOG_ERROR); 770 flags & CIFS_LOG_ERROR);
654 771
655 /* mark it so buf will not be freed by delete_mid */ 772 /* mark it so buf will not be freed by cifs_delete_mid */
656 if ((flags & CIFS_NO_RESP) == 0) 773 if ((flags & CIFS_NO_RESP) == 0)
657 midQ->resp_buf = NULL; 774 midQ->resp_buf = NULL;
658out: 775out:
659 delete_mid(midQ); 776 cifs_delete_mid(midQ);
660 add_credits(ses->server, credits, optype); 777 add_credits(ses->server, credits, optype);
661 778
662 return rc; 779 return rc;
@@ -762,7 +879,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
762 memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4); 879 memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
763 rc = cifs_check_receive(midQ, ses->server, 0); 880 rc = cifs_check_receive(midQ, ses->server, 0);
764out: 881out:
765 delete_mid(midQ); 882 cifs_delete_mid(midQ);
766 add_credits(ses->server, 1, 0); 883 add_credits(ses->server, 1, 0);
767 884
768 return rc; 885 return rc;
@@ -846,7 +963,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
846 963
847 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); 964 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
848 if (rc) { 965 if (rc) {
849 delete_mid(midQ); 966 cifs_delete_mid(midQ);
850 mutex_unlock(&ses->server->srv_mutex); 967 mutex_unlock(&ses->server->srv_mutex);
851 return rc; 968 return rc;
852 } 969 }
@@ -859,7 +976,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
859 mutex_unlock(&ses->server->srv_mutex); 976 mutex_unlock(&ses->server->srv_mutex);
860 977
861 if (rc < 0) { 978 if (rc < 0) {
862 delete_mid(midQ); 979 cifs_delete_mid(midQ);
863 return rc; 980 return rc;
864 } 981 }
865 982
@@ -880,7 +997,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
880 blocking lock to return. */ 997 blocking lock to return. */
881 rc = send_cancel(ses->server, in_buf, midQ); 998 rc = send_cancel(ses->server, in_buf, midQ);
882 if (rc) { 999 if (rc) {
883 delete_mid(midQ); 1000 cifs_delete_mid(midQ);
884 return rc; 1001 return rc;
885 } 1002 }
886 } else { 1003 } else {
@@ -892,7 +1009,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
892 /* If we get -ENOLCK back the lock may have 1009 /* If we get -ENOLCK back the lock may have
893 already been removed. Don't exit in this case. */ 1010 already been removed. Don't exit in this case. */
894 if (rc && rc != -ENOLCK) { 1011 if (rc && rc != -ENOLCK) {
895 delete_mid(midQ); 1012 cifs_delete_mid(midQ);
896 return rc; 1013 return rc;
897 } 1014 }
898 } 1015 }
@@ -929,7 +1046,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
929 memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4); 1046 memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
930 rc = cifs_check_receive(midQ, ses->server, 0); 1047 rc = cifs_check_receive(midQ, ses->server, 0);
931out: 1048out:
932 delete_mid(midQ); 1049 cifs_delete_mid(midQ);
933 if (rstart && rc == -EACCES) 1050 if (rstart && rc == -EACCES)
934 return -ERESTARTSYS; 1051 return -ERESTARTSYS;
935 return rc; 1052 return rc;
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index f1813120d753..be2aa4909487 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -85,6 +85,11 @@ int coda_init_inodecache(void)
85 85
86void coda_destroy_inodecache(void) 86void coda_destroy_inodecache(void)
87{ 87{
88 /*
89 * Make sure all delayed rcu free inodes are flushed before we
90 * destroy cache.
91 */
92 rcu_barrier();
88 kmem_cache_destroy(coda_inode_cachep); 93 kmem_cache_destroy(coda_inode_cachep);
89} 94}
90 95
@@ -107,43 +112,41 @@ static const struct super_operations coda_super_operations =
107 112
108static int get_device_index(struct coda_mount_data *data) 113static int get_device_index(struct coda_mount_data *data)
109{ 114{
110 struct file *file; 115 struct fd f;
111 struct inode *inode; 116 struct inode *inode;
112 int idx; 117 int idx;
113 118
114 if(data == NULL) { 119 if (data == NULL) {
115 printk("coda_read_super: Bad mount data\n"); 120 printk("coda_read_super: Bad mount data\n");
116 return -1; 121 return -1;
117 } 122 }
118 123
119 if(data->version != CODA_MOUNT_VERSION) { 124 if (data->version != CODA_MOUNT_VERSION) {
120 printk("coda_read_super: Bad mount version\n"); 125 printk("coda_read_super: Bad mount version\n");
121 return -1; 126 return -1;
122 } 127 }
123 128
124 file = fget(data->fd); 129 f = fdget(data->fd);
125 inode = NULL; 130 if (!f.file)
126 if(file) 131 goto Ebadf;
127 inode = file->f_path.dentry->d_inode; 132 inode = f.file->f_path.dentry->d_inode;
128 133 if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) {
129 if(!inode || !S_ISCHR(inode->i_mode) || 134 fdput(f);
130 imajor(inode) != CODA_PSDEV_MAJOR) { 135 goto Ebadf;
131 if(file)
132 fput(file);
133
134 printk("coda_read_super: Bad file\n");
135 return -1;
136 } 136 }
137 137
138 idx = iminor(inode); 138 idx = iminor(inode);
139 fput(file); 139 fdput(f);
140 140
141 if(idx < 0 || idx >= MAX_CODADEVS) { 141 if (idx < 0 || idx >= MAX_CODADEVS) {
142 printk("coda_read_super: Bad minor number\n"); 142 printk("coda_read_super: Bad minor number\n");
143 return -1; 143 return -1;
144 } 144 }
145 145
146 return idx; 146 return idx;
147Ebadf:
148 printk("coda_read_super: Bad file\n");
149 return -1;
147} 150}
148 151
149static int coda_fill_super(struct super_block *sb, void *data, int silent) 152static int coda_fill_super(struct super_block *sb, void *data, int silent)
diff --git a/fs/compat.c b/fs/compat.c
index 1bdb350ea5d3..015e1e1f87c6 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -776,16 +776,16 @@ asmlinkage long compat_sys_mount(const char __user * dev_name,
776 char *kernel_type; 776 char *kernel_type;
777 unsigned long data_page; 777 unsigned long data_page;
778 char *kernel_dev; 778 char *kernel_dev;
779 char *dir_page; 779 struct filename *dir;
780 int retval; 780 int retval;
781 781
782 retval = copy_mount_string(type, &kernel_type); 782 retval = copy_mount_string(type, &kernel_type);
783 if (retval < 0) 783 if (retval < 0)
784 goto out; 784 goto out;
785 785
786 dir_page = getname(dir_name); 786 dir = getname(dir_name);
787 retval = PTR_ERR(dir_page); 787 retval = PTR_ERR(dir);
788 if (IS_ERR(dir_page)) 788 if (IS_ERR(dir))
789 goto out1; 789 goto out1;
790 790
791 retval = copy_mount_string(dev_name, &kernel_dev); 791 retval = copy_mount_string(dev_name, &kernel_dev);
@@ -807,7 +807,7 @@ asmlinkage long compat_sys_mount(const char __user * dev_name,
807 } 807 }
808 } 808 }
809 809
810 retval = do_mount(kernel_dev, dir_page, kernel_type, 810 retval = do_mount(kernel_dev, dir->name, kernel_type,
811 flags, (void*)data_page); 811 flags, (void*)data_page);
812 812
813 out4: 813 out4:
@@ -815,7 +815,7 @@ asmlinkage long compat_sys_mount(const char __user * dev_name,
815 out3: 815 out3:
816 kfree(kernel_dev); 816 kfree(kernel_dev);
817 out2: 817 out2:
818 putname(dir_page); 818 putname(dir);
819 out1: 819 out1:
820 kfree(kernel_type); 820 kfree(kernel_type);
821 out: 821 out:
@@ -870,22 +870,20 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd,
870 struct compat_old_linux_dirent __user *dirent, unsigned int count) 870 struct compat_old_linux_dirent __user *dirent, unsigned int count)
871{ 871{
872 int error; 872 int error;
873 struct file *file; 873 struct fd f = fdget(fd);
874 int fput_needed;
875 struct compat_readdir_callback buf; 874 struct compat_readdir_callback buf;
876 875
877 file = fget_light(fd, &fput_needed); 876 if (!f.file)
878 if (!file)
879 return -EBADF; 877 return -EBADF;
880 878
881 buf.result = 0; 879 buf.result = 0;
882 buf.dirent = dirent; 880 buf.dirent = dirent;
883 881
884 error = vfs_readdir(file, compat_fillonedir, &buf); 882 error = vfs_readdir(f.file, compat_fillonedir, &buf);
885 if (buf.result) 883 if (buf.result)
886 error = buf.result; 884 error = buf.result;
887 885
888 fput_light(file, fput_needed); 886 fdput(f);
889 return error; 887 return error;
890} 888}
891 889
@@ -949,17 +947,16 @@ efault:
949asmlinkage long compat_sys_getdents(unsigned int fd, 947asmlinkage long compat_sys_getdents(unsigned int fd,
950 struct compat_linux_dirent __user *dirent, unsigned int count) 948 struct compat_linux_dirent __user *dirent, unsigned int count)
951{ 949{
952 struct file * file; 950 struct fd f;
953 struct compat_linux_dirent __user * lastdirent; 951 struct compat_linux_dirent __user * lastdirent;
954 struct compat_getdents_callback buf; 952 struct compat_getdents_callback buf;
955 int fput_needed;
956 int error; 953 int error;
957 954
958 if (!access_ok(VERIFY_WRITE, dirent, count)) 955 if (!access_ok(VERIFY_WRITE, dirent, count))
959 return -EFAULT; 956 return -EFAULT;
960 957
961 file = fget_light(fd, &fput_needed); 958 f = fdget(fd);
962 if (!file) 959 if (!f.file)
963 return -EBADF; 960 return -EBADF;
964 961
965 buf.current_dir = dirent; 962 buf.current_dir = dirent;
@@ -967,17 +964,17 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
967 buf.count = count; 964 buf.count = count;
968 buf.error = 0; 965 buf.error = 0;
969 966
970 error = vfs_readdir(file, compat_filldir, &buf); 967 error = vfs_readdir(f.file, compat_filldir, &buf);
971 if (error >= 0) 968 if (error >= 0)
972 error = buf.error; 969 error = buf.error;
973 lastdirent = buf.previous; 970 lastdirent = buf.previous;
974 if (lastdirent) { 971 if (lastdirent) {
975 if (put_user(file->f_pos, &lastdirent->d_off)) 972 if (put_user(f.file->f_pos, &lastdirent->d_off))
976 error = -EFAULT; 973 error = -EFAULT;
977 else 974 else
978 error = count - buf.count; 975 error = count - buf.count;
979 } 976 }
980 fput_light(file, fput_needed); 977 fdput(f);
981 return error; 978 return error;
982} 979}
983 980
@@ -1035,17 +1032,16 @@ efault:
1035asmlinkage long compat_sys_getdents64(unsigned int fd, 1032asmlinkage long compat_sys_getdents64(unsigned int fd,
1036 struct linux_dirent64 __user * dirent, unsigned int count) 1033 struct linux_dirent64 __user * dirent, unsigned int count)
1037{ 1034{
1038 struct file * file; 1035 struct fd f;
1039 struct linux_dirent64 __user * lastdirent; 1036 struct linux_dirent64 __user * lastdirent;
1040 struct compat_getdents_callback64 buf; 1037 struct compat_getdents_callback64 buf;
1041 int fput_needed;
1042 int error; 1038 int error;
1043 1039
1044 if (!access_ok(VERIFY_WRITE, dirent, count)) 1040 if (!access_ok(VERIFY_WRITE, dirent, count))
1045 return -EFAULT; 1041 return -EFAULT;
1046 1042
1047 file = fget_light(fd, &fput_needed); 1043 f = fdget(fd);
1048 if (!file) 1044 if (!f.file)
1049 return -EBADF; 1045 return -EBADF;
1050 1046
1051 buf.current_dir = dirent; 1047 buf.current_dir = dirent;
@@ -1053,18 +1049,18 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
1053 buf.count = count; 1049 buf.count = count;
1054 buf.error = 0; 1050 buf.error = 0;
1055 1051
1056 error = vfs_readdir(file, compat_filldir64, &buf); 1052 error = vfs_readdir(f.file, compat_filldir64, &buf);
1057 if (error >= 0) 1053 if (error >= 0)
1058 error = buf.error; 1054 error = buf.error;
1059 lastdirent = buf.previous; 1055 lastdirent = buf.previous;
1060 if (lastdirent) { 1056 if (lastdirent) {
1061 typeof(lastdirent->d_off) d_off = file->f_pos; 1057 typeof(lastdirent->d_off) d_off = f.file->f_pos;
1062 if (__put_user_unaligned(d_off, &lastdirent->d_off)) 1058 if (__put_user_unaligned(d_off, &lastdirent->d_off))
1063 error = -EFAULT; 1059 error = -EFAULT;
1064 else 1060 else
1065 error = count - buf.count; 1061 error = count - buf.count;
1066 } 1062 }
1067 fput_light(file, fput_needed); 1063 fdput(f);
1068 return error; 1064 return error;
1069} 1065}
1070#endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */ 1066#endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */
@@ -1152,18 +1148,16 @@ asmlinkage ssize_t
1152compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, 1148compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec,
1153 unsigned long vlen) 1149 unsigned long vlen)
1154{ 1150{
1155 struct file *file; 1151 struct fd f = fdget(fd);
1156 int fput_needed;
1157 ssize_t ret; 1152 ssize_t ret;
1158 loff_t pos; 1153 loff_t pos;
1159 1154
1160 file = fget_light(fd, &fput_needed); 1155 if (!f.file)
1161 if (!file)
1162 return -EBADF; 1156 return -EBADF;
1163 pos = file->f_pos; 1157 pos = f.file->f_pos;
1164 ret = compat_readv(file, vec, vlen, &pos); 1158 ret = compat_readv(f.file, vec, vlen, &pos);
1165 file->f_pos = pos; 1159 f.file->f_pos = pos;
1166 fput_light(file, fput_needed); 1160 fdput(f);
1167 return ret; 1161 return ret;
1168} 1162}
1169 1163
@@ -1171,19 +1165,18 @@ asmlinkage ssize_t
1171compat_sys_preadv64(unsigned long fd, const struct compat_iovec __user *vec, 1165compat_sys_preadv64(unsigned long fd, const struct compat_iovec __user *vec,
1172 unsigned long vlen, loff_t pos) 1166 unsigned long vlen, loff_t pos)
1173{ 1167{
1174 struct file *file; 1168 struct fd f;
1175 int fput_needed;
1176 ssize_t ret; 1169 ssize_t ret;
1177 1170
1178 if (pos < 0) 1171 if (pos < 0)
1179 return -EINVAL; 1172 return -EINVAL;
1180 file = fget_light(fd, &fput_needed); 1173 f = fdget(fd);
1181 if (!file) 1174 if (!f.file)
1182 return -EBADF; 1175 return -EBADF;
1183 ret = -ESPIPE; 1176 ret = -ESPIPE;
1184 if (file->f_mode & FMODE_PREAD) 1177 if (f.file->f_mode & FMODE_PREAD)
1185 ret = compat_readv(file, vec, vlen, &pos); 1178 ret = compat_readv(f.file, vec, vlen, &pos);
1186 fput_light(file, fput_needed); 1179 fdput(f);
1187 return ret; 1180 return ret;
1188} 1181}
1189 1182
@@ -1221,18 +1214,16 @@ asmlinkage ssize_t
1221compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, 1214compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec,
1222 unsigned long vlen) 1215 unsigned long vlen)
1223{ 1216{
1224 struct file *file; 1217 struct fd f = fdget(fd);
1225 int fput_needed;
1226 ssize_t ret; 1218 ssize_t ret;
1227 loff_t pos; 1219 loff_t pos;
1228 1220
1229 file = fget_light(fd, &fput_needed); 1221 if (!f.file)
1230 if (!file)
1231 return -EBADF; 1222 return -EBADF;
1232 pos = file->f_pos; 1223 pos = f.file->f_pos;
1233 ret = compat_writev(file, vec, vlen, &pos); 1224 ret = compat_writev(f.file, vec, vlen, &pos);
1234 file->f_pos = pos; 1225 f.file->f_pos = pos;
1235 fput_light(file, fput_needed); 1226 fdput(f);
1236 return ret; 1227 return ret;
1237} 1228}
1238 1229
@@ -1240,19 +1231,18 @@ asmlinkage ssize_t
1240compat_sys_pwritev64(unsigned long fd, const struct compat_iovec __user *vec, 1231compat_sys_pwritev64(unsigned long fd, const struct compat_iovec __user *vec,
1241 unsigned long vlen, loff_t pos) 1232 unsigned long vlen, loff_t pos)
1242{ 1233{
1243 struct file *file; 1234 struct fd f;
1244 int fput_needed;
1245 ssize_t ret; 1235 ssize_t ret;
1246 1236
1247 if (pos < 0) 1237 if (pos < 0)
1248 return -EINVAL; 1238 return -EINVAL;
1249 file = fget_light(fd, &fput_needed); 1239 f = fdget(fd);
1250 if (!file) 1240 if (!f.file)
1251 return -EBADF; 1241 return -EBADF;
1252 ret = -ESPIPE; 1242 ret = -ESPIPE;
1253 if (file->f_mode & FMODE_PWRITE) 1243 if (f.file->f_mode & FMODE_PWRITE)
1254 ret = compat_writev(file, vec, vlen, &pos); 1244 ret = compat_writev(f.file, vec, vlen, &pos);
1255 fput_light(file, fput_needed); 1245 fdput(f);
1256 return ret; 1246 return ret;
1257} 1247}
1258 1248
@@ -1802,3 +1792,25 @@ compat_sys_open_by_handle_at(int mountdirfd,
1802 return do_handle_open(mountdirfd, handle, flags); 1792 return do_handle_open(mountdirfd, handle, flags);
1803} 1793}
1804#endif 1794#endif
1795
1796#ifdef __ARCH_WANT_COMPAT_SYS_SENDFILE
1797asmlinkage long compat_sys_sendfile(int out_fd, int in_fd,
1798 compat_off_t __user *offset, compat_size_t count)
1799{
1800 loff_t pos;
1801 off_t off;
1802 ssize_t ret;
1803
1804 if (offset) {
1805 if (unlikely(get_user(off, offset)))
1806 return -EFAULT;
1807 pos = off;
1808 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1809 if (unlikely(put_user(pos, offset)))
1810 return -EFAULT;
1811 return ret;
1812 }
1813
1814 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1815}
1816#endif /* __ARCH_WANT_COMPAT_SYS_SENDFILE */
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 112e45a17e99..a81147e2e4ef 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -38,6 +38,13 @@
38#define elf_addr_t Elf32_Addr 38#define elf_addr_t Elf32_Addr
39 39
40/* 40/*
41 * Some data types as stored in coredump.
42 */
43#define user_long_t compat_long_t
44#define user_siginfo_t compat_siginfo_t
45#define copy_siginfo_to_user copy_siginfo_to_user32
46
47/*
41 * The machine-dependent core note format types are defined in elfcore-compat.h, 48 * The machine-dependent core note format types are defined in elfcore-compat.h,
42 * which requires asm/elf.h to define compat_elf_gregset_t et al. 49 * which requires asm/elf.h to define compat_elf_gregset_t et al.
43 */ 50 */
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index debdfe0fc809..f5054025f9da 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -866,6 +866,12 @@ COMPATIBLE_IOCTL(TIOCGPTN)
866COMPATIBLE_IOCTL(TIOCSPTLCK) 866COMPATIBLE_IOCTL(TIOCSPTLCK)
867COMPATIBLE_IOCTL(TIOCSERGETLSR) 867COMPATIBLE_IOCTL(TIOCSERGETLSR)
868COMPATIBLE_IOCTL(TIOCSIG) 868COMPATIBLE_IOCTL(TIOCSIG)
869#ifdef TIOCSRS485
870COMPATIBLE_IOCTL(TIOCSRS485)
871#endif
872#ifdef TIOCGRS485
873COMPATIBLE_IOCTL(TIOCGRS485)
874#endif
869#ifdef TCGETS2 875#ifdef TCGETS2
870COMPATIBLE_IOCTL(TCGETS2) 876COMPATIBLE_IOCTL(TCGETS2)
871COMPATIBLE_IOCTL(TCSETS2) 877COMPATIBLE_IOCTL(TCSETS2)
@@ -897,6 +903,8 @@ COMPATIBLE_IOCTL(KDGKBSENT)
897COMPATIBLE_IOCTL(KDSKBSENT) 903COMPATIBLE_IOCTL(KDSKBSENT)
898COMPATIBLE_IOCTL(KDGKBDIACR) 904COMPATIBLE_IOCTL(KDGKBDIACR)
899COMPATIBLE_IOCTL(KDSKBDIACR) 905COMPATIBLE_IOCTL(KDSKBDIACR)
906COMPATIBLE_IOCTL(KDGKBDIACRUC)
907COMPATIBLE_IOCTL(KDSKBDIACRUC)
900COMPATIBLE_IOCTL(KDKBDREP) 908COMPATIBLE_IOCTL(KDKBDREP)
901COMPATIBLE_IOCTL(KDGKBLED) 909COMPATIBLE_IOCTL(KDGKBLED)
902COMPATIBLE_IOCTL(KDGETLED) 910COMPATIBLE_IOCTL(KDGETLED)
@@ -1531,16 +1539,13 @@ static int compat_ioctl_check_table(unsigned int xcmd)
1531asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, 1539asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
1532 unsigned long arg) 1540 unsigned long arg)
1533{ 1541{
1534 struct file *filp; 1542 struct fd f = fdget(fd);
1535 int error = -EBADF; 1543 int error = -EBADF;
1536 int fput_needed; 1544 if (!f.file)
1537
1538 filp = fget_light(fd, &fput_needed);
1539 if (!filp)
1540 goto out; 1545 goto out;
1541 1546
1542 /* RED-PEN how should LSM module know it's handling 32bit? */ 1547 /* RED-PEN how should LSM module know it's handling 32bit? */
1543 error = security_file_ioctl(filp, cmd, arg); 1548 error = security_file_ioctl(f.file, cmd, arg);
1544 if (error) 1549 if (error)
1545 goto out_fput; 1550 goto out_fput;
1546 1551
@@ -1560,30 +1565,30 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
1560#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 1565#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
1561 case FS_IOC_RESVSP_32: 1566 case FS_IOC_RESVSP_32:
1562 case FS_IOC_RESVSP64_32: 1567 case FS_IOC_RESVSP64_32:
1563 error = compat_ioctl_preallocate(filp, compat_ptr(arg)); 1568 error = compat_ioctl_preallocate(f.file, compat_ptr(arg));
1564 goto out_fput; 1569 goto out_fput;
1565#else 1570#else
1566 case FS_IOC_RESVSP: 1571 case FS_IOC_RESVSP:
1567 case FS_IOC_RESVSP64: 1572 case FS_IOC_RESVSP64:
1568 error = ioctl_preallocate(filp, compat_ptr(arg)); 1573 error = ioctl_preallocate(f.file, compat_ptr(arg));
1569 goto out_fput; 1574 goto out_fput;
1570#endif 1575#endif
1571 1576
1572 case FIBMAP: 1577 case FIBMAP:
1573 case FIGETBSZ: 1578 case FIGETBSZ:
1574 case FIONREAD: 1579 case FIONREAD:
1575 if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) 1580 if (S_ISREG(f.file->f_path.dentry->d_inode->i_mode))
1576 break; 1581 break;
1577 /*FALL THROUGH*/ 1582 /*FALL THROUGH*/
1578 1583
1579 default: 1584 default:
1580 if (filp->f_op && filp->f_op->compat_ioctl) { 1585 if (f.file->f_op && f.file->f_op->compat_ioctl) {
1581 error = filp->f_op->compat_ioctl(filp, cmd, arg); 1586 error = f.file->f_op->compat_ioctl(f.file, cmd, arg);
1582 if (error != -ENOIOCTLCMD) 1587 if (error != -ENOIOCTLCMD)
1583 goto out_fput; 1588 goto out_fput;
1584 } 1589 }
1585 1590
1586 if (!filp->f_op || !filp->f_op->unlocked_ioctl) 1591 if (!f.file->f_op || !f.file->f_op->unlocked_ioctl)
1587 goto do_ioctl; 1592 goto do_ioctl;
1588 break; 1593 break;
1589 } 1594 }
@@ -1591,7 +1596,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
1591 if (compat_ioctl_check_table(XFORM(cmd))) 1596 if (compat_ioctl_check_table(XFORM(cmd)))
1592 goto found_handler; 1597 goto found_handler;
1593 1598
1594 error = do_ioctl_trans(fd, cmd, arg, filp); 1599 error = do_ioctl_trans(fd, cmd, arg, f.file);
1595 if (error == -ENOIOCTLCMD) 1600 if (error == -ENOIOCTLCMD)
1596 error = -ENOTTY; 1601 error = -ENOTTY;
1597 1602
@@ -1600,9 +1605,9 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
1600 found_handler: 1605 found_handler:
1601 arg = (unsigned long)compat_ptr(arg); 1606 arg = (unsigned long)compat_ptr(arg);
1602 do_ioctl: 1607 do_ioctl:
1603 error = do_vfs_ioctl(filp, fd, cmd, arg); 1608 error = do_vfs_ioctl(f.file, fd, cmd, arg);
1604 out_fput: 1609 out_fput:
1605 fput_light(filp, fput_needed); 1610 fdput(f);
1606 out: 1611 out:
1607 return error; 1612 return error;
1608} 1613}
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 0074362d9f7f..a9d35b0e06cf 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -79,8 +79,8 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
79 return -ENOMEM; 79 return -ENOMEM;
80 /* assign default attributes */ 80 /* assign default attributes */
81 sd_iattr->ia_mode = sd->s_mode; 81 sd_iattr->ia_mode = sd->s_mode;
82 sd_iattr->ia_uid = 0; 82 sd_iattr->ia_uid = GLOBAL_ROOT_UID;
83 sd_iattr->ia_gid = 0; 83 sd_iattr->ia_gid = GLOBAL_ROOT_GID;
84 sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; 84 sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
85 sd->s_iattr = sd_iattr; 85 sd->s_iattr = sd_iattr;
86 } 86 }
diff --git a/fs/coredump.c b/fs/coredump.c
new file mode 100644
index 000000000000..ce47379bfa61
--- /dev/null
+++ b/fs/coredump.c
@@ -0,0 +1,693 @@
1#include <linux/slab.h>
2#include <linux/file.h>
3#include <linux/fdtable.h>
4#include <linux/mm.h>
5#include <linux/stat.h>
6#include <linux/fcntl.h>
7#include <linux/swap.h>
8#include <linux/string.h>
9#include <linux/init.h>
10#include <linux/pagemap.h>
11#include <linux/perf_event.h>
12#include <linux/highmem.h>
13#include <linux/spinlock.h>
14#include <linux/key.h>
15#include <linux/personality.h>
16#include <linux/binfmts.h>
17#include <linux/coredump.h>
18#include <linux/utsname.h>
19#include <linux/pid_namespace.h>
20#include <linux/module.h>
21#include <linux/namei.h>
22#include <linux/mount.h>
23#include <linux/security.h>
24#include <linux/syscalls.h>
25#include <linux/tsacct_kern.h>
26#include <linux/cn_proc.h>
27#include <linux/audit.h>
28#include <linux/tracehook.h>
29#include <linux/kmod.h>
30#include <linux/fsnotify.h>
31#include <linux/fs_struct.h>
32#include <linux/pipe_fs_i.h>
33#include <linux/oom.h>
34#include <linux/compat.h>
35
36#include <asm/uaccess.h>
37#include <asm/mmu_context.h>
38#include <asm/tlb.h>
39#include <asm/exec.h>
40
41#include <trace/events/task.h>
42#include "internal.h"
43#include "coredump.h"
44
45#include <trace/events/sched.h>
46
/* Sysctl-tunable coredump knobs (their table entries live in sysctl.c). */
int core_uses_pid;				/* append ".<pid>" when core_pattern has no %p */
char core_pattern[CORENAME_MAX_SIZE] = "core";	/* name pattern; leading '|' pipes to a helper */
unsigned int core_pipe_limit;			/* max concurrent pipe dumps (0 = unlimited) */

/* Dynamically grown buffer used while expanding core_pattern. */
struct core_name {
	char *corename;		/* heap buffer holding the expanded name */
	int used, size;		/* bytes used so far / total capacity */
};
static atomic_t call_count = ATOMIC_INIT(1);	/* scales growth in expand_corename() */
56
57/* The maximal length of core_pattern is also specified in sysctl.c */
58
59static int expand_corename(struct core_name *cn)
60{
61 char *old_corename = cn->corename;
62
63 cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
64 cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
65
66 if (!cn->corename) {
67 kfree(old_corename);
68 return -ENOMEM;
69 }
70
71 return 0;
72}
73
74static int cn_printf(struct core_name *cn, const char *fmt, ...)
75{
76 char *cur;
77 int need;
78 int ret;
79 va_list arg;
80
81 va_start(arg, fmt);
82 need = vsnprintf(NULL, 0, fmt, arg);
83 va_end(arg);
84
85 if (likely(need < cn->size - cn->used - 1))
86 goto out_printf;
87
88 ret = expand_corename(cn);
89 if (ret)
90 goto expand_fail;
91
92out_printf:
93 cur = cn->corename + cn->used;
94 va_start(arg, fmt);
95 vsnprintf(cur, need + 1, fmt, arg);
96 va_end(arg);
97 cn->used += need;
98 return 0;
99
100expand_fail:
101 return ret;
102}
103
/*
 * Replace every '/' in @str with '!' so a pattern fragment (comm,
 * hostname, exe path) cannot introduce extra path components.
 */
static void cn_escape(char *str)
{
	char *p = str;

	while (*p != '\0') {
		if (*p == '/')
			*p = '!';
		p++;
	}
}
/*
 * Append the path of the dumping process's executable to the core name.
 * Falls back to "comm (path unknown)" when the mm has no exe_file.
 * Returns 0 on success or a negative errno (e.g. -ENOMEM).
 */
static int cn_print_exe_file(struct core_name *cn)
{
	struct file *exe_file;
	char *pathbuf, *path;
	int ret;

	exe_file = get_mm_exe_file(current->mm);
	if (!exe_file) {
		/* Escape after printing: comm itself may contain '/'. */
		char *commstart = cn->corename + cn->used;
		ret = cn_printf(cn, "%s (path unknown)", current->comm);
		cn_escape(commstart);
		return ret;
	}

	pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
	if (!pathbuf) {
		ret = -ENOMEM;
		goto put_exe_file;
	}

	/* d_path() returns a pointer inside pathbuf, or an ERR_PTR. */
	path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
	if (IS_ERR(path)) {
		ret = PTR_ERR(path);
		goto free_buf;
	}

	cn_escape(path);

	ret = cn_printf(cn, "%s", path);

free_buf:
	kfree(pathbuf);
put_exe_file:
	fput(exe_file);
	return ret;
}
147
/*
 * format_corename - expand core_pattern into cn->corename.
 *
 * Walks core_pattern, copying literal characters and substituting the
 * %-escapes (pid, uid, gid, signal, time, hostname, comm, exe path,
 * core limit).  The result lands in a freshly kmalloc'ed cn->corename,
 * which the caller must kfree() in all cases.
 *
 * Returns 1 when the pattern designates a pipe helper ('|' prefix),
 * 0 for a plain file name, or a negative errno on failure.
 */
static int format_corename(struct core_name *cn, struct coredump_params *cprm)
{
	const struct cred *cred = current_cred();
	const char *pat_ptr = core_pattern;
	int ispipe = (*pat_ptr == '|');
	int pid_in_pattern = 0;
	int err = 0;

	/* Size the initial buffer by how often it has had to grow before. */
	cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
	cn->corename = kmalloc(cn->size, GFP_KERNEL);
	cn->used = 0;

	if (!cn->corename)
		return -ENOMEM;

	/* Repeat as long as we have more pattern to process and more output
	   space */
	while (*pat_ptr) {
		if (*pat_ptr != '%') {
			if (*pat_ptr == 0)
				goto out;
			err = cn_printf(cn, "%c", *pat_ptr++);
		} else {
			switch (*++pat_ptr) {
			/* single % at the end, drop that */
			case 0:
				goto out;
			/* Double percent, output one percent */
			case '%':
				err = cn_printf(cn, "%c", '%');
				break;
			/* pid */
			case 'p':
				pid_in_pattern = 1;
				err = cn_printf(cn, "%d",
					      task_tgid_vnr(current));
				break;
			/* uid */
			case 'u':
				/* NOTE(review): cred->uid/gid are kuid_t/kgid_t
				 * in this tree — printing with %d assumes the
				 * raw value; confirm against from_kuid(). */
				err = cn_printf(cn, "%d", cred->uid);
				break;
			/* gid */
			case 'g':
				err = cn_printf(cn, "%d", cred->gid);
				break;
			/* dumpability mode of the mm */
			case 'd':
				err = cn_printf(cn, "%d",
					__get_dumpable(cprm->mm_flags));
				break;
			/* signal that caused the coredump */
			case 's':
				err = cn_printf(cn, "%ld", cprm->siginfo->si_signo);
				break;
			/* UNIX time of coredump */
			case 't': {
				struct timeval tv;
				do_gettimeofday(&tv);
				err = cn_printf(cn, "%lu", tv.tv_sec);
				break;
			}
			/* hostname */
			case 'h': {
				/* escape afterwards: nodename may contain '/' */
				char *namestart = cn->corename + cn->used;
				down_read(&uts_sem);
				err = cn_printf(cn, "%s",
					      utsname()->nodename);
				up_read(&uts_sem);
				cn_escape(namestart);
				break;
			}
			/* executable (comm, not the full path — see %E) */
			case 'e': {
				char *commstart = cn->corename + cn->used;
				err = cn_printf(cn, "%s", current->comm);
				cn_escape(commstart);
				break;
			}
			/* full path of the executable */
			case 'E':
				err = cn_print_exe_file(cn);
				break;
			/* core limit size */
			case 'c':
				err = cn_printf(cn, "%lu",
					      rlimit(RLIMIT_CORE));
				break;
			/* unknown escapes are silently dropped */
			default:
				break;
			}
			++pat_ptr;
		}

		if (err)
			return err;
	}

	/* Backward compatibility with core_uses_pid:
	 *
	 * If core_pattern does not include a %p (as is the default)
	 * and core_uses_pid is set, then .%pid will be appended to
	 * the filename. Do not do this for piped commands. */
	if (!ispipe && !pid_in_pattern && core_uses_pid) {
		err = cn_printf(cn, ".%d", task_tgid_vnr(current));
		if (err)
			return err;
	}
out:
	return ispipe;
}
260
/*
 * Mark @start's thread group as exiting with @exit_code and SIGKILL every
 * other mm-using thread in it so they stop for the dump.  Called with the
 * group's siglock held (see zap_threads()).  Returns the number of
 * threads that were kicked and still need to park.
 */
static int zap_process(struct task_struct *start, int exit_code)
{
	struct task_struct *t;
	int nr = 0;

	start->signal->flags = SIGNAL_GROUP_EXIT;
	start->signal->group_exit_code = exit_code;
	start->signal->group_stop_count = 0;

	t = start;
	do {
		/* Pending job-control stops are moot once the group dies. */
		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
		if (t != current && t->mm) {
			sigaddset(&t->pending.signal, SIGKILL);
			signal_wake_up(t, 1);
			nr++;
		}
	} while_each_thread(start, t);

	return nr;
}
282
/*
 * Kill every task that shares @mm (the dumper's own group first, then any
 * CLONE_VM processes found via a lockless tasklist walk) and publish
 * @core_state in mm->core_state.  Returns the number of threads that must
 * reach the coredump wait point, or -EAGAIN if the group is already
 * exiting (another dump/exit won the race).
 */
static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
				struct core_state *core_state, int exit_code)
{
	struct task_struct *g, *p;
	unsigned long flags;
	int nr = -EAGAIN;

	spin_lock_irq(&tsk->sighand->siglock);
	if (!signal_group_exit(tsk->signal)) {
		mm->core_state = core_state;
		nr = zap_process(tsk, exit_code);
	}
	spin_unlock_irq(&tsk->sighand->siglock);
	if (unlikely(nr < 0))
		return nr;

	/* Fast path: nobody outside our group shares the mm. */
	if (atomic_read(&mm->mm_users) == nr + 1)
		goto done;
	/*
	 * We should find and kill all tasks which use this mm, and we should
	 * count them correctly into ->nr_threads. We don't take tasklist
	 * lock, but this is safe wrt:
	 *
	 * fork:
	 *	None of sub-threads can fork after zap_process(leader). All
	 *	processes which were created before this point should be
	 *	visible to zap_threads() because copy_process() adds the new
	 *	process to the tail of init_task.tasks list, and lock/unlock
	 *	of ->siglock provides a memory barrier.
	 *
	 * do_exit:
	 *	The caller holds mm->mmap_sem. This means that the task which
	 *	uses this mm can't pass exit_mm(), so it can't exit or clear
	 *	its ->mm.
	 *
	 * de_thread:
	 *	It does list_replace_rcu(&leader->tasks, &current->tasks),
	 *	we must see either old or new leader, this does not matter.
	 *	However, it can change p->sighand, so lock_task_sighand(p)
	 *	must be used. Since p->mm != NULL and we hold ->mmap_sem
	 *	it can't fail.
	 *
	 *	Note also that "g" can be the old leader with ->mm == NULL
	 *	and already unhashed and thus removed from ->thread_group.
	 *	This is OK, __unhash_process()->list_del_rcu() does not
	 *	clear the ->next pointer, we will find the new leader via
	 *	next_thread().
	 */
	rcu_read_lock();
	for_each_process(g) {
		if (g == tsk->group_leader)
			continue;
		if (g->flags & PF_KTHREAD)
			continue;
		p = g;
		do {
			if (p->mm) {
				if (unlikely(p->mm == mm)) {
					lock_task_sighand(p, &flags);
					nr += zap_process(p, exit_code);
					unlock_task_sighand(p, &flags);
				}
				/* one thread with ->mm is enough to decide */
				break;
			}
		} while_each_thread(g, p);
	}
	rcu_read_unlock();
done:
	atomic_set(&core_state->nr_threads, nr);
	return nr;
}
354
/*
 * Stop all other users of current->mm and wait until each of them has
 * parked (reached the coredump wait point and gone inactive) so their
 * register/FPU state is settled in memory before dumping.  Returns the
 * number of waiting threads (>= 0) on success, or a negative errno
 * (-EBUSY if a dump is already in progress, -EAGAIN from zap_threads()).
 */
static int coredump_wait(int exit_code, struct core_state *core_state)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	int core_waiters = -EBUSY;

	init_completion(&core_state->startup);
	core_state->dumper.task = tsk;
	core_state->dumper.next = NULL;

	/* mmap_sem serializes against exit_mm() of the mm's other users. */
	down_write(&mm->mmap_sem);
	if (!mm->core_state)
		core_waiters = zap_threads(tsk, mm, core_state, exit_code);
	up_write(&mm->mmap_sem);

	if (core_waiters > 0) {
		struct core_thread *ptr;

		wait_for_completion(&core_state->startup);
		/*
		 * Wait for all the threads to become inactive, so that
		 * all the thread context (extended register state, like
		 * fpu etc) gets copied to the memory.
		 */
		ptr = core_state->dumper.next;
		while (ptr != NULL) {
			wait_task_inactive(ptr->task, 0);
			ptr = ptr->next;
		}
	}

	return core_waiters;
}
388
/*
 * Release every thread parked in the mm's core_state list and clear
 * mm->core_state so exiting threads can proceed past exit_mm().
 */
static void coredump_finish(struct mm_struct *mm)
{
	struct core_thread *curr, *next;
	struct task_struct *task;

	next = mm->core_state->dumper.next;
	while ((curr = next) != NULL) {
		/* read ->next before publishing ->task = NULL (barrier below) */
		next = curr->next;
		task = curr->task;
		/*
		 * see exit_mm(), curr->task must not see
		 * ->task == NULL before we read ->next.
		 */
		smp_mb();
		curr->task = NULL;
		wake_up_process(task);
	}

	mm->core_state = NULL;
}
409
/*
 * Block until the usermode pipe helper has finished consuming the dump
 * (we become the only remaining reader) or the caller is signalled.
 * We temporarily count ourselves as a reader instead of a writer so the
 * helper's exit drops ->readers and wakes us.
 */
static void wait_for_dump_helpers(struct file *file)
{
	struct pipe_inode_info *pipe;

	pipe = file->f_path.dentry->d_inode->i_pipe;

	pipe_lock(pipe);
	pipe->readers++;
	pipe->writers--;

	while ((pipe->readers > 1) && (!signal_pending(current))) {
		wake_up_interruptible_sync(&pipe->wait);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		pipe_wait(pipe);
	}

	/* restore our writer role before the file is closed */
	pipe->readers--;
	pipe->writers++;
	pipe_unlock(pipe);

}
431
/*
 * umh_pipe_setup
 * helper function to customize the process used
 * to collect the core in userspace. Specifically
 * it sets up a pipe and installs it as fd 0 (stdin)
 * for the process.  Returns 0 on success, or
 * PTR_ERR on failure.
 * Note that it also sets the core limit to 1.  This
 * is a special value that we use to trap recursive
 * core dumps
 */
static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
{
	struct file *files[2];
	struct coredump_params *cp = (struct coredump_params *)info->data;
	int err = create_pipe_files(files, 0);
	if (err)
		return err;

	/* write end goes back to do_coredump() as the dump target */
	cp->file = files[1];

	/* read end becomes the helper's stdin */
	err = replace_fd(0, files[0], 0);
	fput(files[0]);
	/* and disallow core files too */
	current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};

	return err;
}
460
/*
 * do_coredump - produce a core dump for the current process.
 * @siginfo: signal that triggered the dump (si_signo is recorded/audited)
 * @regs:    user register state at the time of the fault
 *
 * Expands core_pattern, stops all other users of the mm via
 * coredump_wait(), then either pipes the dump to a usermode helper
 * ('|' pattern) or writes it to a file, delegating the actual format
 * to mm->binfmt->core_dump().  Cleans up and returns on any failure;
 * errors are only reported via printk and group_exit_code.
 */
void do_coredump(siginfo_t *siginfo, struct pt_regs *regs)
{
	struct core_state core_state;
	struct core_name cn;
	struct mm_struct *mm = current->mm;
	struct linux_binfmt * binfmt;
	const struct cred *old_cred;
	struct cred *cred;
	int retval = 0;
	int flag = 0;
	int ispipe;
	struct files_struct *displaced;
	bool need_nonrelative = false;
	static atomic_t core_dump_count = ATOMIC_INIT(0);
	struct coredump_params cprm = {
		.siginfo = siginfo,
		.regs = regs,
		.limit = rlimit(RLIMIT_CORE),
		/*
		 * We must use the same mm->flags while dumping core to avoid
		 * inconsistency of bit flags, since this flag is not protected
		 * by any locks.
		 */
		.mm_flags = mm->flags,
	};

	audit_core_dumps(siginfo->si_signo);

	binfmt = mm->binfmt;
	if (!binfmt || !binfmt->core_dump)
		goto fail;
	if (!__get_dumpable(cprm.mm_flags))
		goto fail;

	cred = prepare_creds();
	if (!cred)
		goto fail;
	/*
	 * We cannot trust fsuid as being the "true" uid of the process
	 * nor do we know its entire history. We only know it was tainted
	 * so we dump it as root in mode 2, and only into a controlled
	 * environment (pipe handler or fully qualified path).
	 */
	if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) {
		/* Setuid core dump mode */
		flag = O_EXCL;		/* Stop rewrite attacks */
		cred->fsuid = GLOBAL_ROOT_UID;	/* Dump root private */
		need_nonrelative = true;
	}

	retval = coredump_wait(siginfo->si_signo, &core_state);
	if (retval < 0)
		goto fail_creds;

	old_cred = override_creds(cred);

	/*
	 * Clear any false indication of pending signals that might
	 * be seen by the filesystem code called to write the core file.
	 */
	clear_thread_flag(TIF_SIGPENDING);

	ispipe = format_corename(&cn, &cprm);

	if (ispipe) {
		int dump_count;
		char **helper_argv;

		if (ispipe < 0) {
			printk(KERN_WARNING "format_corename failed\n");
			printk(KERN_WARNING "Aborting core\n");
			goto fail_corename;
		}

		if (cprm.limit == 1) {
			/* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
			 *
			 * Normally core limits are irrelevant to pipes, since
			 * we're not writing to the file system, but we use
			 * cprm.limit of 1 here as a special value, this is a
			 * consistent way to catch recursive crashes.
			 * We can still crash if the core_pattern binary sets
			 * RLIM_CORE = !1, but it runs as root, and can do
			 * lots of stupid things.
			 *
			 * Note that we use task_tgid_vnr here to grab the pid
			 * of the process group leader. That way we get the
			 * right pid if a thread in a multi-threaded
			 * core_pattern process dies.
			 */
			printk(KERN_WARNING
				"Process %d(%s) has RLIMIT_CORE set to 1\n",
				task_tgid_vnr(current), current->comm);
			printk(KERN_WARNING "Aborting core\n");
			goto fail_unlock;
		}
		cprm.limit = RLIM_INFINITY;

		dump_count = atomic_inc_return(&core_dump_count);
		if (core_pipe_limit && (core_pipe_limit < dump_count)) {
			printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
			       task_tgid_vnr(current), current->comm);
			printk(KERN_WARNING "Skipping core dump\n");
			goto fail_dropcount;
		}

		/* corename+1 skips the leading '|' of the pipe pattern */
		helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
		if (!helper_argv) {
			printk(KERN_WARNING "%s failed to allocate memory\n",
			       __func__);
			goto fail_dropcount;
		}

		retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
					NULL, UMH_WAIT_EXEC, umh_pipe_setup,
					NULL, &cprm);
		argv_free(helper_argv);
		if (retval) {
			printk(KERN_INFO "Core dump to %s pipe failed\n",
			       cn.corename);
			goto close_fail;
		}
	} else {
		struct inode *inode;

		if (cprm.limit < binfmt->min_coredump)
			goto fail_unlock;

		if (need_nonrelative && cn.corename[0] != '/') {
			printk(KERN_WARNING "Pid %d(%s) can only dump core "\
				"to fully qualified path!\n",
				task_tgid_vnr(current), current->comm);
			printk(KERN_WARNING "Skipping core dump\n");
			goto fail_unlock;
		}

		/* NOTE(review): the bare 2 is O_RDWR — spelling it out would
		 * be clearer. */
		cprm.file = filp_open(cn.corename,
				 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
				 0600);
		if (IS_ERR(cprm.file))
			goto fail_unlock;

		inode = cprm.file->f_path.dentry->d_inode;
		if (inode->i_nlink > 1)
			goto close_fail;
		if (d_unhashed(cprm.file->f_path.dentry))
			goto close_fail;
		/*
		 * AK: actually i see no reason to not allow this for named
		 * pipes etc, but keep the previous behaviour for now.
		 */
		if (!S_ISREG(inode->i_mode))
			goto close_fail;
		/*
		 * Dont allow local users get cute and trick others to coredump
		 * into their pre-created files.
		 */
		if (!uid_eq(inode->i_uid, current_fsuid()))
			goto close_fail;
		if (!cprm.file->f_op || !cprm.file->f_op->write)
			goto close_fail;
		if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
			goto close_fail;
	}

	/* get us an unshared descriptor table; almost always a no-op */
	retval = unshare_files(&displaced);
	if (retval)
		goto close_fail;
	if (displaced)
		put_files_struct(displaced);
	retval = binfmt->core_dump(&cprm);
	if (retval)
		current->signal->group_exit_code |= 0x80;

	if (ispipe && core_pipe_limit)
		wait_for_dump_helpers(cprm.file);
close_fail:
	if (cprm.file)
		filp_close(cprm.file, NULL);
fail_dropcount:
	if (ispipe)
		atomic_dec(&core_dump_count);
fail_unlock:
	kfree(cn.corename);
fail_corename:
	coredump_finish(mm);
	revert_creds(old_cred);
fail_creds:
	put_cred(cred);
fail:
	return;
}
654
655/*
656 * Core dumping helper functions. These are the only things you should
657 * do on a core-file: use only these functions to write out all the
658 * necessary info.
659 */
660int dump_write(struct file *file, const void *addr, int nr)
661{
662 return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
663}
664EXPORT_SYMBOL(dump_write);
665
/*
 * Advance the core file position by @off bytes.  Uses llseek when the
 * file supports it (leaving a sparse hole); otherwise emulates the gap
 * by writing zeroed pages.  Returns 1 on success, 0 on failure.
 */
int dump_seek(struct file *file, loff_t off)
{
	int ret = 1;

	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
		if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
			return 0;
	} else {
		/* no seek support (e.g. pipe): pad with zeroes instead */
		char *buf = (char *)get_zeroed_page(GFP_KERNEL);

		if (!buf)
			return 0;
		while (off > 0) {
			unsigned long n = off;

			if (n > PAGE_SIZE)
				n = PAGE_SIZE;
			if (!dump_write(file, buf, n)) {
				ret = 0;
				break;
			}
			off -= n;
		}
		free_page((unsigned long)buf);
	}
	return ret;
}
EXPORT_SYMBOL(dump_seek);
diff --git a/fs/coredump.h b/fs/coredump.h
new file mode 100644
index 000000000000..e39ff072110d
--- /dev/null
+++ b/fs/coredump.h
@@ -0,0 +1,6 @@
#ifndef _FS_COREDUMP_H
#define _FS_COREDUMP_H

/* Decode the dumpable state from mm->flags; defined in fs/exec.c. */
extern int __get_dumpable(unsigned long mm_flags);

#endif
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 28cca01ca9c9..c6c3f91ecf06 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -90,8 +90,8 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
90 } 90 }
91 91
92 inode->i_mode = cramfs_inode->mode; 92 inode->i_mode = cramfs_inode->mode;
93 inode->i_uid = cramfs_inode->uid; 93 i_uid_write(inode, cramfs_inode->uid);
94 inode->i_gid = cramfs_inode->gid; 94 i_gid_write(inode, cramfs_inode->gid);
95 95
96 /* if the lower 2 bits are zero, the inode contains data */ 96 /* if the lower 2 bits are zero, the inode contains data */
97 if (!(inode->i_ino & 3)) { 97 if (!(inode->i_ino & 3)) {
diff --git a/fs/dcache.c b/fs/dcache.c
index 8086636bf796..3a463d0c4fe8 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -389,7 +389,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
389 * Inform try_to_ascend() that we are no longer attached to the 389 * Inform try_to_ascend() that we are no longer attached to the
390 * dentry tree 390 * dentry tree
391 */ 391 */
392 dentry->d_flags |= DCACHE_DISCONNECTED; 392 dentry->d_flags |= DCACHE_DENTRY_KILLED;
393 if (parent) 393 if (parent)
394 spin_unlock(&parent->d_lock); 394 spin_unlock(&parent->d_lock);
395 dentry_iput(dentry); 395 dentry_iput(dentry);
@@ -1048,7 +1048,7 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq
1048 * or deletion 1048 * or deletion
1049 */ 1049 */
1050 if (new != old->d_parent || 1050 if (new != old->d_parent ||
1051 (old->d_flags & DCACHE_DISCONNECTED) || 1051 (old->d_flags & DCACHE_DENTRY_KILLED) ||
1052 (!locked && read_seqretry(&rename_lock, seq))) { 1052 (!locked && read_seqretry(&rename_lock, seq))) {
1053 spin_unlock(&new->d_lock); 1053 spin_unlock(&new->d_lock);
1054 new = NULL; 1054 new = NULL;
@@ -1134,6 +1134,8 @@ positive:
1134 return 1; 1134 return 1;
1135 1135
1136rename_retry: 1136rename_retry:
1137 if (locked)
1138 goto again;
1137 locked = 1; 1139 locked = 1;
1138 write_seqlock(&rename_lock); 1140 write_seqlock(&rename_lock);
1139 goto again; 1141 goto again;
@@ -1141,7 +1143,7 @@ rename_retry:
1141EXPORT_SYMBOL(have_submounts); 1143EXPORT_SYMBOL(have_submounts);
1142 1144
1143/* 1145/*
1144 * Search the dentry child list for the specified parent, 1146 * Search the dentry child list of the specified parent,
1145 * and move any unused dentries to the end of the unused 1147 * and move any unused dentries to the end of the unused
1146 * list for prune_dcache(). We descend to the next level 1148 * list for prune_dcache(). We descend to the next level
1147 * whenever the d_subdirs list is non-empty and continue 1149 * whenever the d_subdirs list is non-empty and continue
@@ -1236,6 +1238,8 @@ out:
1236rename_retry: 1238rename_retry:
1237 if (found) 1239 if (found)
1238 return found; 1240 return found;
1241 if (locked)
1242 goto again;
1239 locked = 1; 1243 locked = 1;
1240 write_seqlock(&rename_lock); 1244 write_seqlock(&rename_lock);
1241 goto again; 1245 goto again;
@@ -2109,7 +2113,7 @@ again:
2109 inode = dentry->d_inode; 2113 inode = dentry->d_inode;
2110 isdir = S_ISDIR(inode->i_mode); 2114 isdir = S_ISDIR(inode->i_mode);
2111 if (dentry->d_count == 1) { 2115 if (dentry->d_count == 1) {
2112 if (inode && !spin_trylock(&inode->i_lock)) { 2116 if (!spin_trylock(&inode->i_lock)) {
2113 spin_unlock(&dentry->d_lock); 2117 spin_unlock(&dentry->d_lock);
2114 cpu_relax(); 2118 cpu_relax();
2115 goto again; 2119 goto again;
@@ -3035,6 +3039,8 @@ resume:
3035 return; 3039 return;
3036 3040
3037rename_retry: 3041rename_retry:
3042 if (locked)
3043 goto again;
3038 locked = 1; 3044 locked = 1;
3039 write_seqlock(&rename_lock); 3045 write_seqlock(&rename_lock);
3040 goto again; 3046 goto again;
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 2340f6978d6e..c5ca6ae5a30c 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -526,73 +526,51 @@ struct array_data {
526 u32 elements; 526 u32 elements;
527}; 527};
528 528
529static int u32_array_open(struct inode *inode, struct file *file) 529static size_t u32_format_array(char *buf, size_t bufsize,
530{ 530 u32 *array, int array_size)
531 file->private_data = NULL;
532 return nonseekable_open(inode, file);
533}
534
535static size_t format_array(char *buf, size_t bufsize, const char *fmt,
536 u32 *array, u32 array_size)
537{ 531{
538 size_t ret = 0; 532 size_t ret = 0;
539 u32 i;
540 533
541 for (i = 0; i < array_size; i++) { 534 while (--array_size >= 0) {
542 size_t len; 535 size_t len;
536 char term = array_size ? ' ' : '\n';
543 537
544 len = snprintf(buf, bufsize, fmt, array[i]); 538 len = snprintf(buf, bufsize, "%u%c", *array++, term);
545 len++; /* ' ' or '\n' */
546 ret += len; 539 ret += len;
547 540
548 if (buf) { 541 buf += len;
549 buf += len; 542 bufsize -= len;
550 bufsize -= len;
551 buf[-1] = (i == array_size-1) ? '\n' : ' ';
552 }
553 } 543 }
554
555 ret++; /* \0 */
556 if (buf)
557 *buf = '\0';
558
559 return ret; 544 return ret;
560} 545}
561 546
562static char *format_array_alloc(const char *fmt, u32 *array, 547static int u32_array_open(struct inode *inode, struct file *file)
563 u32 array_size)
564{ 548{
565 size_t len = format_array(NULL, 0, fmt, array, array_size); 549 struct array_data *data = inode->i_private;
566 char *ret; 550 int size, elements = data->elements;
567 551 char *buf;
568 ret = kmalloc(len, GFP_KERNEL); 552
569 if (ret == NULL) 553 /*
570 return NULL; 554 * Max size:
555 * - 10 digits + ' '/'\n' = 11 bytes per number
556 * - terminating NUL character
557 */
558 size = elements*11;
559 buf = kmalloc(size+1, GFP_KERNEL);
560 if (!buf)
561 return -ENOMEM;
562 buf[size] = 0;
563
564 file->private_data = buf;
565 u32_format_array(buf, size, data->array, data->elements);
571 566
572 format_array(ret, len, fmt, array, array_size); 567 return nonseekable_open(inode, file);
573 return ret;
574} 568}
575 569
576static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, 570static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
577 loff_t *ppos) 571 loff_t *ppos)
578{ 572{
579 struct inode *inode = file->f_path.dentry->d_inode; 573 size_t size = strlen(file->private_data);
580 struct array_data *data = inode->i_private;
581 size_t size;
582
583 if (*ppos == 0) {
584 if (file->private_data) {
585 kfree(file->private_data);
586 file->private_data = NULL;
587 }
588
589 file->private_data = format_array_alloc("%u", data->array,
590 data->elements);
591 }
592
593 size = 0;
594 if (file->private_data)
595 size = strlen(file->private_data);
596 574
597 return simple_read_from_buffer(buf, len, ppos, 575 return simple_read_from_buffer(buf, len, ppos,
598 file->private_data, size); 576 file->private_data, size);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 4733eab34a23..b607d92cdf24 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -28,7 +28,7 @@
28#include <linux/magic.h> 28#include <linux/magic.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30 30
31#define DEBUGFS_DEFAULT_MODE 0755 31#define DEBUGFS_DEFAULT_MODE 0700
32 32
33static struct vfsmount *debugfs_mount; 33static struct vfsmount *debugfs_mount;
34static int debugfs_mount_count; 34static int debugfs_mount_count;
@@ -128,8 +128,8 @@ static inline int debugfs_positive(struct dentry *dentry)
128} 128}
129 129
130struct debugfs_mount_opts { 130struct debugfs_mount_opts {
131 uid_t uid; 131 kuid_t uid;
132 gid_t gid; 132 kgid_t gid;
133 umode_t mode; 133 umode_t mode;
134}; 134};
135 135
@@ -156,6 +156,8 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts)
156 substring_t args[MAX_OPT_ARGS]; 156 substring_t args[MAX_OPT_ARGS];
157 int option; 157 int option;
158 int token; 158 int token;
159 kuid_t uid;
160 kgid_t gid;
159 char *p; 161 char *p;
160 162
161 opts->mode = DEBUGFS_DEFAULT_MODE; 163 opts->mode = DEBUGFS_DEFAULT_MODE;
@@ -169,12 +171,18 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts)
169 case Opt_uid: 171 case Opt_uid:
170 if (match_int(&args[0], &option)) 172 if (match_int(&args[0], &option))
171 return -EINVAL; 173 return -EINVAL;
172 opts->uid = option; 174 uid = make_kuid(current_user_ns(), option);
175 if (!uid_valid(uid))
176 return -EINVAL;
177 opts->uid = uid;
173 break; 178 break;
174 case Opt_gid: 179 case Opt_gid:
175 if (match_octal(&args[0], &option)) 180 if (match_octal(&args[0], &option))
176 return -EINVAL; 181 return -EINVAL;
177 opts->gid = option; 182 gid = make_kgid(current_user_ns(), option);
183 if (!gid_valid(gid))
184 return -EINVAL;
185 opts->gid = gid;
178 break; 186 break;
179 case Opt_mode: 187 case Opt_mode:
180 if (match_octal(&args[0], &option)) 188 if (match_octal(&args[0], &option))
@@ -226,10 +234,12 @@ static int debugfs_show_options(struct seq_file *m, struct dentry *root)
226 struct debugfs_fs_info *fsi = root->d_sb->s_fs_info; 234 struct debugfs_fs_info *fsi = root->d_sb->s_fs_info;
227 struct debugfs_mount_opts *opts = &fsi->mount_opts; 235 struct debugfs_mount_opts *opts = &fsi->mount_opts;
228 236
229 if (opts->uid != 0) 237 if (!uid_eq(opts->uid, GLOBAL_ROOT_UID))
230 seq_printf(m, ",uid=%u", opts->uid); 238 seq_printf(m, ",uid=%u",
231 if (opts->gid != 0) 239 from_kuid_munged(&init_user_ns, opts->uid));
232 seq_printf(m, ",gid=%u", opts->gid); 240 if (!gid_eq(opts->gid, GLOBAL_ROOT_GID))
241 seq_printf(m, ",gid=%u",
242 from_kgid_munged(&init_user_ns, opts->gid));
233 if (opts->mode != DEBUGFS_DEFAULT_MODE) 243 if (opts->mode != DEBUGFS_DEFAULT_MODE)
234 seq_printf(m, ",mode=%o", opts->mode); 244 seq_printf(m, ",mode=%o", opts->mode);
235 245
@@ -291,9 +301,9 @@ static struct file_system_type debug_fs_type = {
291 .kill_sb = kill_litter_super, 301 .kill_sb = kill_litter_super,
292}; 302};
293 303
294struct dentry *__create_file(const char *name, umode_t mode, 304static struct dentry *__create_file(const char *name, umode_t mode,
295 struct dentry *parent, void *data, 305 struct dentry *parent, void *data,
296 const struct file_operations *fops) 306 const struct file_operations *fops)
297{ 307{
298 struct dentry *dentry = NULL; 308 struct dentry *dentry = NULL;
299 int error; 309 int error;
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 63dc19c54d5a..27a6ba9aaeec 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -15,8 +15,8 @@
15#include "lock.h" 15#include "lock.h"
16#include "user.h" 16#include "user.h"
17 17
18static uint64_t dlm_cb_seq; 18static uint64_t dlm_cb_seq;
19static spinlock_t dlm_cb_seq_spin; 19static DEFINE_SPINLOCK(dlm_cb_seq_spin);
20 20
21static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb) 21static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb)
22{ 22{
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 9ccf7346834a..a0387dd8b1f0 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -750,6 +750,7 @@ static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
750static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len) 750static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
751{ 751{
752 struct sockaddr_storage *addr; 752 struct sockaddr_storage *addr;
753 int rv;
753 754
754 if (len != sizeof(struct sockaddr_storage)) 755 if (len != sizeof(struct sockaddr_storage))
755 return -EINVAL; 756 return -EINVAL;
@@ -762,6 +763,13 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
762 return -ENOMEM; 763 return -ENOMEM;
763 764
764 memcpy(addr, buf, len); 765 memcpy(addr, buf, len);
766
767 rv = dlm_lowcomms_addr(cm->nodeid, addr, len);
768 if (rv) {
769 kfree(addr);
770 return rv;
771 }
772
765 cm->addr[cm->addr_count++] = addr; 773 cm->addr[cm->addr_count++] = addr;
766 return len; 774 return len;
767} 775}
@@ -878,34 +886,7 @@ static void put_space(struct dlm_space *sp)
878 config_item_put(&sp->group.cg_item); 886 config_item_put(&sp->group.cg_item);
879} 887}
880 888
881static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y) 889static struct dlm_comm *get_comm(int nodeid)
882{
883 switch (x->ss_family) {
884 case AF_INET: {
885 struct sockaddr_in *sinx = (struct sockaddr_in *)x;
886 struct sockaddr_in *siny = (struct sockaddr_in *)y;
887 if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr)
888 return 0;
889 if (sinx->sin_port != siny->sin_port)
890 return 0;
891 break;
892 }
893 case AF_INET6: {
894 struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x;
895 struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y;
896 if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr))
897 return 0;
898 if (sinx->sin6_port != siny->sin6_port)
899 return 0;
900 break;
901 }
902 default:
903 return 0;
904 }
905 return 1;
906}
907
908static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr)
909{ 890{
910 struct config_item *i; 891 struct config_item *i;
911 struct dlm_comm *cm = NULL; 892 struct dlm_comm *cm = NULL;
@@ -919,19 +900,11 @@ static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr)
919 list_for_each_entry(i, &comm_list->cg_children, ci_entry) { 900 list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
920 cm = config_item_to_comm(i); 901 cm = config_item_to_comm(i);
921 902
922 if (nodeid) { 903 if (cm->nodeid != nodeid)
923 if (cm->nodeid != nodeid) 904 continue;
924 continue; 905 found = 1;
925 found = 1; 906 config_item_get(i);
926 config_item_get(i); 907 break;
927 break;
928 } else {
929 if (!cm->addr_count || !addr_compare(cm->addr[0], addr))
930 continue;
931 found = 1;
932 config_item_get(i);
933 break;
934 }
935 } 908 }
936 mutex_unlock(&clusters_root.subsys.su_mutex); 909 mutex_unlock(&clusters_root.subsys.su_mutex);
937 910
@@ -995,7 +968,7 @@ int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
995 968
996int dlm_comm_seq(int nodeid, uint32_t *seq) 969int dlm_comm_seq(int nodeid, uint32_t *seq)
997{ 970{
998 struct dlm_comm *cm = get_comm(nodeid, NULL); 971 struct dlm_comm *cm = get_comm(nodeid);
999 if (!cm) 972 if (!cm)
1000 return -EEXIST; 973 return -EEXIST;
1001 *seq = cm->seq; 974 *seq = cm->seq;
@@ -1003,28 +976,6 @@ int dlm_comm_seq(int nodeid, uint32_t *seq)
1003 return 0; 976 return 0;
1004} 977}
1005 978
1006int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
1007{
1008 struct dlm_comm *cm = get_comm(nodeid, NULL);
1009 if (!cm)
1010 return -EEXIST;
1011 if (!cm->addr_count)
1012 return -ENOENT;
1013 memcpy(addr, cm->addr[0], sizeof(*addr));
1014 put_comm(cm);
1015 return 0;
1016}
1017
1018int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
1019{
1020 struct dlm_comm *cm = get_comm(0, addr);
1021 if (!cm)
1022 return -EEXIST;
1023 *nodeid = cm->nodeid;
1024 put_comm(cm);
1025 return 0;
1026}
1027
1028int dlm_our_nodeid(void) 979int dlm_our_nodeid(void)
1029{ 980{
1030 return local_comm ? local_comm->nodeid : 0; 981 return local_comm ? local_comm->nodeid : 0;
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index dbd35a08f3a5..f30697bc2780 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -46,8 +46,6 @@ void dlm_config_exit(void);
46int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, 46int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
47 int *count_out); 47 int *count_out);
48int dlm_comm_seq(int nodeid, uint32_t *seq); 48int dlm_comm_seq(int nodeid, uint32_t *seq);
49int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
50int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
51int dlm_our_nodeid(void); 49int dlm_our_nodeid(void);
52int dlm_our_addr(struct sockaddr_storage *addr, int num); 50int dlm_our_addr(struct sockaddr_storage *addr, int num);
53 51
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 9d3e485f88c8..871c1abf6029 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -604,6 +604,7 @@ struct dlm_ls {
604 struct idr ls_recover_idr; 604 struct idr ls_recover_idr;
605 spinlock_t ls_recover_idr_lock; 605 spinlock_t ls_recover_idr_lock;
606 wait_queue_head_t ls_wait_general; 606 wait_queue_head_t ls_wait_general;
607 wait_queue_head_t ls_recover_lock_wait;
607 struct mutex ls_clear_proc_locks; 608 struct mutex ls_clear_proc_locks;
608 609
609 struct list_head ls_root_list; /* root resources */ 610 struct list_head ls_root_list; /* root resources */
@@ -616,15 +617,40 @@ struct dlm_ls {
616 char ls_name[1]; 617 char ls_name[1];
617}; 618};
618 619
619#define LSFL_WORK 0 620/*
620#define LSFL_RUNNING 1 621 * LSFL_RECOVER_STOP - dlm_ls_stop() sets this to tell dlm recovery routines
621#define LSFL_RECOVERY_STOP 2 622 * that they should abort what they're doing so new recovery can be started.
622#define LSFL_RCOM_READY 3 623 *
623#define LSFL_RCOM_WAIT 4 624 * LSFL_RECOVER_DOWN - dlm_ls_stop() sets this to tell dlm_recoverd that it
624#define LSFL_UEVENT_WAIT 5 625 * should do down_write() on the in_recovery rw_semaphore. (doing down_write
625#define LSFL_TIMEWARN 6 626 * within dlm_ls_stop causes complaints about the lock acquired/released
626#define LSFL_CB_DELAY 7 627 * in different contexts.)
627#define LSFL_NODIR 8 628 *
629 * LSFL_RECOVER_LOCK - dlm_recoverd holds the in_recovery rw_semaphore.
630 * It sets this after it is done with down_write() on the in_recovery
631 * rw_semaphore and clears it after it has released the rw_semaphore.
632 *
633 * LSFL_RECOVER_WORK - dlm_ls_start() sets this to tell dlm_recoverd that it
634 * should begin recovery of the lockspace.
635 *
636 * LSFL_RUNNING - set when normal locking activity is enabled.
637 * dlm_ls_stop() clears this to tell dlm locking routines that they should
638 * quit what they are doing so recovery can run. dlm_recoverd sets
639 * this after recovery is finished.
640 */
641
642#define LSFL_RECOVER_STOP 0
643#define LSFL_RECOVER_DOWN 1
644#define LSFL_RECOVER_LOCK 2
645#define LSFL_RECOVER_WORK 3
646#define LSFL_RUNNING 4
647
648#define LSFL_RCOM_READY 5
649#define LSFL_RCOM_WAIT 6
650#define LSFL_UEVENT_WAIT 7
651#define LSFL_TIMEWARN 8
652#define LSFL_CB_DELAY 9
653#define LSFL_NODIR 10
628 654
629/* much of this is just saving user space pointers associated with the 655/* much of this is just saving user space pointers associated with the
630 lock that we pass back to the user lib with an ast */ 656 lock that we pass back to the user lib with an ast */
@@ -667,7 +693,7 @@ static inline int dlm_locking_stopped(struct dlm_ls *ls)
667 693
668static inline int dlm_recovery_stopped(struct dlm_ls *ls) 694static inline int dlm_recovery_stopped(struct dlm_ls *ls)
669{ 695{
670 return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); 696 return test_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
671} 697}
672 698
673static inline int dlm_no_directory(struct dlm_ls *ls) 699static inline int dlm_no_directory(struct dlm_ls *ls)
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 952557d00ccd..2e99fb0c9737 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -582,8 +582,6 @@ static int new_lockspace(const char *name, const char *cluster,
582 INIT_LIST_HEAD(&ls->ls_root_list); 582 INIT_LIST_HEAD(&ls->ls_root_list);
583 init_rwsem(&ls->ls_root_sem); 583 init_rwsem(&ls->ls_root_sem);
584 584
585 down_write(&ls->ls_in_recovery);
586
587 spin_lock(&lslist_lock); 585 spin_lock(&lslist_lock);
588 ls->ls_create_count = 1; 586 ls->ls_create_count = 1;
589 list_add(&ls->ls_list, &lslist); 587 list_add(&ls->ls_list, &lslist);
@@ -597,13 +595,24 @@ static int new_lockspace(const char *name, const char *cluster,
597 } 595 }
598 } 596 }
599 597
600 /* needs to find ls in lslist */ 598 init_waitqueue_head(&ls->ls_recover_lock_wait);
599
600 /*
601 * Once started, dlm_recoverd first looks for ls in lslist, then
602 * initializes ls_in_recovery as locked in "down" mode. We need
603 * to wait for the wakeup from dlm_recoverd because in_recovery
604 * has to start out in down mode.
605 */
606
601 error = dlm_recoverd_start(ls); 607 error = dlm_recoverd_start(ls);
602 if (error) { 608 if (error) {
603 log_error(ls, "can't start dlm_recoverd %d", error); 609 log_error(ls, "can't start dlm_recoverd %d", error);
604 goto out_callback; 610 goto out_callback;
605 } 611 }
606 612
613 wait_event(ls->ls_recover_lock_wait,
614 test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags));
615
607 ls->ls_kobj.kset = dlm_kset; 616 ls->ls_kobj.kset = dlm_kset;
608 error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL, 617 error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL,
609 "%s", ls->ls_name); 618 "%s", ls->ls_name);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 5c1b0e38c7a4..331ea4f94efd 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -140,6 +140,16 @@ struct writequeue_entry {
140 struct connection *con; 140 struct connection *con;
141}; 141};
142 142
143struct dlm_node_addr {
144 struct list_head list;
145 int nodeid;
146 int addr_count;
147 struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
148};
149
150static LIST_HEAD(dlm_node_addrs);
151static DEFINE_SPINLOCK(dlm_node_addrs_spin);
152
143static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT]; 153static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
144static int dlm_local_count; 154static int dlm_local_count;
145static int dlm_allow_conn; 155static int dlm_allow_conn;
@@ -264,31 +274,146 @@ static struct connection *assoc2con(int assoc_id)
264 return NULL; 274 return NULL;
265} 275}
266 276
267static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr) 277static struct dlm_node_addr *find_node_addr(int nodeid)
278{
279 struct dlm_node_addr *na;
280
281 list_for_each_entry(na, &dlm_node_addrs, list) {
282 if (na->nodeid == nodeid)
283 return na;
284 }
285 return NULL;
286}
287
288static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y)
289{
290 switch (x->ss_family) {
291 case AF_INET: {
292 struct sockaddr_in *sinx = (struct sockaddr_in *)x;
293 struct sockaddr_in *siny = (struct sockaddr_in *)y;
294 if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr)
295 return 0;
296 if (sinx->sin_port != siny->sin_port)
297 return 0;
298 break;
299 }
300 case AF_INET6: {
301 struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x;
302 struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y;
303 if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr))
304 return 0;
305 if (sinx->sin6_port != siny->sin6_port)
306 return 0;
307 break;
308 }
309 default:
310 return 0;
311 }
312 return 1;
313}
314
315static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
316 struct sockaddr *sa_out)
268{ 317{
269 struct sockaddr_storage addr; 318 struct sockaddr_storage sas;
270 int error; 319 struct dlm_node_addr *na;
271 320
272 if (!dlm_local_count) 321 if (!dlm_local_count)
273 return -1; 322 return -1;
274 323
275 error = dlm_nodeid_to_addr(nodeid, &addr); 324 spin_lock(&dlm_node_addrs_spin);
276 if (error) 325 na = find_node_addr(nodeid);
277 return error; 326 if (na && na->addr_count)
327 memcpy(&sas, na->addr[0], sizeof(struct sockaddr_storage));
328 spin_unlock(&dlm_node_addrs_spin);
329
330 if (!na)
331 return -EEXIST;
332
333 if (!na->addr_count)
334 return -ENOENT;
335
336 if (sas_out)
337 memcpy(sas_out, &sas, sizeof(struct sockaddr_storage));
338
339 if (!sa_out)
340 return 0;
278 341
279 if (dlm_local_addr[0]->ss_family == AF_INET) { 342 if (dlm_local_addr[0]->ss_family == AF_INET) {
280 struct sockaddr_in *in4 = (struct sockaddr_in *) &addr; 343 struct sockaddr_in *in4 = (struct sockaddr_in *) &sas;
281 struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr; 344 struct sockaddr_in *ret4 = (struct sockaddr_in *) sa_out;
282 ret4->sin_addr.s_addr = in4->sin_addr.s_addr; 345 ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
283 } else { 346 } else {
284 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr; 347 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &sas;
285 struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr; 348 struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) sa_out;
286 ret6->sin6_addr = in6->sin6_addr; 349 ret6->sin6_addr = in6->sin6_addr;
287 } 350 }
288 351
289 return 0; 352 return 0;
290} 353}
291 354
355static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
356{
357 struct dlm_node_addr *na;
358 int rv = -EEXIST;
359
360 spin_lock(&dlm_node_addrs_spin);
361 list_for_each_entry(na, &dlm_node_addrs, list) {
362 if (!na->addr_count)
363 continue;
364
365 if (!addr_compare(na->addr[0], addr))
366 continue;
367
368 *nodeid = na->nodeid;
369 rv = 0;
370 break;
371 }
372 spin_unlock(&dlm_node_addrs_spin);
373 return rv;
374}
375
376int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
377{
378 struct sockaddr_storage *new_addr;
379 struct dlm_node_addr *new_node, *na;
380
381 new_node = kzalloc(sizeof(struct dlm_node_addr), GFP_NOFS);
382 if (!new_node)
383 return -ENOMEM;
384
385 new_addr = kzalloc(sizeof(struct sockaddr_storage), GFP_NOFS);
386 if (!new_addr) {
387 kfree(new_node);
388 return -ENOMEM;
389 }
390
391 memcpy(new_addr, addr, len);
392
393 spin_lock(&dlm_node_addrs_spin);
394 na = find_node_addr(nodeid);
395 if (!na) {
396 new_node->nodeid = nodeid;
397 new_node->addr[0] = new_addr;
398 new_node->addr_count = 1;
399 list_add(&new_node->list, &dlm_node_addrs);
400 spin_unlock(&dlm_node_addrs_spin);
401 return 0;
402 }
403
404 if (na->addr_count >= DLM_MAX_ADDR_COUNT) {
405 spin_unlock(&dlm_node_addrs_spin);
406 kfree(new_addr);
407 kfree(new_node);
408 return -ENOSPC;
409 }
410
411 na->addr[na->addr_count++] = new_addr;
412 spin_unlock(&dlm_node_addrs_spin);
413 kfree(new_node);
414 return 0;
415}
416
292/* Data available on socket or listen socket received a connect */ 417/* Data available on socket or listen socket received a connect */
293static void lowcomms_data_ready(struct sock *sk, int count_unused) 418static void lowcomms_data_ready(struct sock *sk, int count_unused)
294{ 419{
@@ -348,7 +473,7 @@ int dlm_lowcomms_connect_node(int nodeid)
348} 473}
349 474
350/* Make a socket active */ 475/* Make a socket active */
351static int add_sock(struct socket *sock, struct connection *con) 476static void add_sock(struct socket *sock, struct connection *con)
352{ 477{
353 con->sock = sock; 478 con->sock = sock;
354 479
@@ -358,7 +483,6 @@ static int add_sock(struct socket *sock, struct connection *con)
358 con->sock->sk->sk_state_change = lowcomms_state_change; 483 con->sock->sk->sk_state_change = lowcomms_state_change;
359 con->sock->sk->sk_user_data = con; 484 con->sock->sk->sk_user_data = con;
360 con->sock->sk->sk_allocation = GFP_NOFS; 485 con->sock->sk->sk_allocation = GFP_NOFS;
361 return 0;
362} 486}
363 487
364/* Add the port number to an IPv6 or 4 sockaddr and return the address 488/* Add the port number to an IPv6 or 4 sockaddr and return the address
@@ -510,7 +634,7 @@ static void process_sctp_notification(struct connection *con,
510 return; 634 return;
511 } 635 }
512 make_sockaddr(&prim.ssp_addr, 0, &addr_len); 636 make_sockaddr(&prim.ssp_addr, 0, &addr_len);
513 if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) { 637 if (addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
514 unsigned char *b=(unsigned char *)&prim.ssp_addr; 638 unsigned char *b=(unsigned char *)&prim.ssp_addr;
515 log_print("reject connect from unknown addr"); 639 log_print("reject connect from unknown addr");
516 print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, 640 print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE,
@@ -747,7 +871,7 @@ static int tcp_accept_from_sock(struct connection *con)
747 871
748 /* Get the new node's NODEID */ 872 /* Get the new node's NODEID */
749 make_sockaddr(&peeraddr, 0, &len); 873 make_sockaddr(&peeraddr, 0, &len);
750 if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) { 874 if (addr_to_nodeid(&peeraddr, &nodeid)) {
751 unsigned char *b=(unsigned char *)&peeraddr; 875 unsigned char *b=(unsigned char *)&peeraddr;
752 log_print("connect from non cluster node"); 876 log_print("connect from non cluster node");
753 print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, 877 print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE,
@@ -862,7 +986,7 @@ static void sctp_init_assoc(struct connection *con)
862 if (con->retries++ > MAX_CONNECT_RETRIES) 986 if (con->retries++ > MAX_CONNECT_RETRIES)
863 return; 987 return;
864 988
865 if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) { 989 if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr)) {
866 log_print("no address for nodeid %d", con->nodeid); 990 log_print("no address for nodeid %d", con->nodeid);
867 return; 991 return;
868 } 992 }
@@ -928,11 +1052,11 @@ static void sctp_init_assoc(struct connection *con)
928/* Connect a new socket to its peer */ 1052/* Connect a new socket to its peer */
929static void tcp_connect_to_sock(struct connection *con) 1053static void tcp_connect_to_sock(struct connection *con)
930{ 1054{
931 int result = -EHOSTUNREACH;
932 struct sockaddr_storage saddr, src_addr; 1055 struct sockaddr_storage saddr, src_addr;
933 int addr_len; 1056 int addr_len;
934 struct socket *sock = NULL; 1057 struct socket *sock = NULL;
935 int one = 1; 1058 int one = 1;
1059 int result;
936 1060
937 if (con->nodeid == 0) { 1061 if (con->nodeid == 0) {
938 log_print("attempt to connect sock 0 foiled"); 1062 log_print("attempt to connect sock 0 foiled");
@@ -944,10 +1068,8 @@ static void tcp_connect_to_sock(struct connection *con)
944 goto out; 1068 goto out;
945 1069
946 /* Some odd races can cause double-connects, ignore them */ 1070 /* Some odd races can cause double-connects, ignore them */
947 if (con->sock) { 1071 if (con->sock)
948 result = 0;
949 goto out; 1072 goto out;
950 }
951 1073
952 /* Create a socket to communicate with */ 1074 /* Create a socket to communicate with */
953 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, 1075 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
@@ -956,8 +1078,11 @@ static void tcp_connect_to_sock(struct connection *con)
956 goto out_err; 1078 goto out_err;
957 1079
958 memset(&saddr, 0, sizeof(saddr)); 1080 memset(&saddr, 0, sizeof(saddr));
959 if (dlm_nodeid_to_addr(con->nodeid, &saddr)) 1081 result = nodeid_to_addr(con->nodeid, &saddr, NULL);
1082 if (result < 0) {
1083 log_print("no address for nodeid %d", con->nodeid);
960 goto out_err; 1084 goto out_err;
1085 }
961 1086
962 sock->sk->sk_user_data = con; 1087 sock->sk->sk_user_data = con;
963 con->rx_action = receive_from_sock; 1088 con->rx_action = receive_from_sock;
@@ -983,8 +1108,7 @@ static void tcp_connect_to_sock(struct connection *con)
983 kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, 1108 kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
984 sizeof(one)); 1109 sizeof(one));
985 1110
986 result = 1111 result = sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
987 sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
988 O_NONBLOCK); 1112 O_NONBLOCK);
989 if (result == -EINPROGRESS) 1113 if (result == -EINPROGRESS)
990 result = 0; 1114 result = 0;
@@ -1002,11 +1126,17 @@ out_err:
1002 * Some errors are fatal and this list might need adjusting. For other 1126 * Some errors are fatal and this list might need adjusting. For other
1003 * errors we try again until the max number of retries is reached. 1127 * errors we try again until the max number of retries is reached.
1004 */ 1128 */
1005 if (result != -EHOSTUNREACH && result != -ENETUNREACH && 1129 if (result != -EHOSTUNREACH &&
1006 result != -ENETDOWN && result != -EINVAL 1130 result != -ENETUNREACH &&
1007 && result != -EPROTONOSUPPORT) { 1131 result != -ENETDOWN &&
1132 result != -EINVAL &&
1133 result != -EPROTONOSUPPORT) {
1134 log_print("connect %d try %d error %d", con->nodeid,
1135 con->retries, result);
1136 mutex_unlock(&con->sock_mutex);
1137 msleep(1000);
1008 lowcomms_connect_sock(con); 1138 lowcomms_connect_sock(con);
1009 result = 0; 1139 return;
1010 } 1140 }
1011out: 1141out:
1012 mutex_unlock(&con->sock_mutex); 1142 mutex_unlock(&con->sock_mutex);
@@ -1044,10 +1174,8 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
1044 if (result < 0) { 1174 if (result < 0) {
1045 log_print("Failed to set SO_REUSEADDR on socket: %d", result); 1175 log_print("Failed to set SO_REUSEADDR on socket: %d", result);
1046 } 1176 }
1047 sock->sk->sk_user_data = con;
1048 con->rx_action = tcp_accept_from_sock; 1177 con->rx_action = tcp_accept_from_sock;
1049 con->connect_action = tcp_connect_to_sock; 1178 con->connect_action = tcp_connect_to_sock;
1050 con->sock = sock;
1051 1179
1052 /* Bind to our port */ 1180 /* Bind to our port */
1053 make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len); 1181 make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len);
@@ -1358,8 +1486,7 @@ static void send_to_sock(struct connection *con)
1358 } 1486 }
1359 cond_resched(); 1487 cond_resched();
1360 goto out; 1488 goto out;
1361 } 1489 } else if (ret < 0)
1362 if (ret <= 0)
1363 goto send_error; 1490 goto send_error;
1364 } 1491 }
1365 1492
@@ -1376,7 +1503,6 @@ static void send_to_sock(struct connection *con)
1376 if (e->len == 0 && e->users == 0) { 1503 if (e->len == 0 && e->users == 0) {
1377 list_del(&e->list); 1504 list_del(&e->list);
1378 free_entry(e); 1505 free_entry(e);
1379 continue;
1380 } 1506 }
1381 } 1507 }
1382 spin_unlock(&con->writequeue_lock); 1508 spin_unlock(&con->writequeue_lock);
@@ -1394,7 +1520,6 @@ out_connect:
1394 mutex_unlock(&con->sock_mutex); 1520 mutex_unlock(&con->sock_mutex);
1395 if (!test_bit(CF_INIT_PENDING, &con->flags)) 1521 if (!test_bit(CF_INIT_PENDING, &con->flags))
1396 lowcomms_connect_sock(con); 1522 lowcomms_connect_sock(con);
1397 return;
1398} 1523}
1399 1524
1400static void clean_one_writequeue(struct connection *con) 1525static void clean_one_writequeue(struct connection *con)
@@ -1414,6 +1539,7 @@ static void clean_one_writequeue(struct connection *con)
1414int dlm_lowcomms_close(int nodeid) 1539int dlm_lowcomms_close(int nodeid)
1415{ 1540{
1416 struct connection *con; 1541 struct connection *con;
1542 struct dlm_node_addr *na;
1417 1543
1418 log_print("closing connection to node %d", nodeid); 1544 log_print("closing connection to node %d", nodeid);
1419 con = nodeid2con(nodeid, 0); 1545 con = nodeid2con(nodeid, 0);
@@ -1428,6 +1554,17 @@ int dlm_lowcomms_close(int nodeid)
1428 clean_one_writequeue(con); 1554 clean_one_writequeue(con);
1429 close_connection(con, true); 1555 close_connection(con, true);
1430 } 1556 }
1557
1558 spin_lock(&dlm_node_addrs_spin);
1559 na = find_node_addr(nodeid);
1560 if (na) {
1561 list_del(&na->list);
1562 while (na->addr_count--)
1563 kfree(na->addr[na->addr_count]);
1564 kfree(na);
1565 }
1566 spin_unlock(&dlm_node_addrs_spin);
1567
1431 return 0; 1568 return 0;
1432} 1569}
1433 1570
@@ -1577,3 +1714,17 @@ fail_destroy:
1577fail: 1714fail:
1578 return error; 1715 return error;
1579} 1716}
1717
1718void dlm_lowcomms_exit(void)
1719{
1720 struct dlm_node_addr *na, *safe;
1721
1722 spin_lock(&dlm_node_addrs_spin);
1723 list_for_each_entry_safe(na, safe, &dlm_node_addrs, list) {
1724 list_del(&na->list);
1725 while (na->addr_count--)
1726 kfree(na->addr[na->addr_count]);
1727 kfree(na);
1728 }
1729 spin_unlock(&dlm_node_addrs_spin);
1730}
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
index 1311e6426287..67462e54fc2f 100644
--- a/fs/dlm/lowcomms.h
+++ b/fs/dlm/lowcomms.h
@@ -16,10 +16,12 @@
16 16
17int dlm_lowcomms_start(void); 17int dlm_lowcomms_start(void);
18void dlm_lowcomms_stop(void); 18void dlm_lowcomms_stop(void);
19void dlm_lowcomms_exit(void);
19int dlm_lowcomms_close(int nodeid); 20int dlm_lowcomms_close(int nodeid);
20void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc); 21void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc);
21void dlm_lowcomms_commit_buffer(void *mh); 22void dlm_lowcomms_commit_buffer(void *mh);
22int dlm_lowcomms_connect_node(int nodeid); 23int dlm_lowcomms_connect_node(int nodeid);
24int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len);
23 25
24#endif /* __LOWCOMMS_DOT_H__ */ 26#endif /* __LOWCOMMS_DOT_H__ */
25 27
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index 5a59efa0bb46..079c0bd71ab7 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -17,6 +17,7 @@
17#include "user.h" 17#include "user.h"
18#include "memory.h" 18#include "memory.h"
19#include "config.h" 19#include "config.h"
20#include "lowcomms.h"
20 21
21static int __init init_dlm(void) 22static int __init init_dlm(void)
22{ 23{
@@ -78,6 +79,7 @@ static void __exit exit_dlm(void)
78 dlm_config_exit(); 79 dlm_config_exit();
79 dlm_memory_exit(); 80 dlm_memory_exit();
80 dlm_lockspace_exit(); 81 dlm_lockspace_exit();
82 dlm_lowcomms_exit();
81 dlm_unregister_debugfs(); 83 dlm_unregister_debugfs();
82} 84}
83 85
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 862640a36d5c..476557b54921 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -616,13 +616,13 @@ int dlm_ls_stop(struct dlm_ls *ls)
616 down_write(&ls->ls_recv_active); 616 down_write(&ls->ls_recv_active);
617 617
618 /* 618 /*
619 * Abort any recovery that's in progress (see RECOVERY_STOP, 619 * Abort any recovery that's in progress (see RECOVER_STOP,
620 * dlm_recovery_stopped()) and tell any other threads running in the 620 * dlm_recovery_stopped()) and tell any other threads running in the
621 * dlm to quit any processing (see RUNNING, dlm_locking_stopped()). 621 * dlm to quit any processing (see RUNNING, dlm_locking_stopped()).
622 */ 622 */
623 623
624 spin_lock(&ls->ls_recover_lock); 624 spin_lock(&ls->ls_recover_lock);
625 set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); 625 set_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
626 new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags); 626 new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
627 ls->ls_recover_seq++; 627 ls->ls_recover_seq++;
628 spin_unlock(&ls->ls_recover_lock); 628 spin_unlock(&ls->ls_recover_lock);
@@ -642,12 +642,16 @@ int dlm_ls_stop(struct dlm_ls *ls)
642 * when recovery is complete. 642 * when recovery is complete.
643 */ 643 */
644 644
645 if (new) 645 if (new) {
646 down_write(&ls->ls_in_recovery); 646 set_bit(LSFL_RECOVER_DOWN, &ls->ls_flags);
647 wake_up_process(ls->ls_recoverd_task);
648 wait_event(ls->ls_recover_lock_wait,
649 test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags));
650 }
647 651
648 /* 652 /*
649 * The recoverd suspend/resume makes sure that dlm_recoverd (if 653 * The recoverd suspend/resume makes sure that dlm_recoverd (if
650 * running) has noticed RECOVERY_STOP above and quit processing the 654 * running) has noticed RECOVER_STOP above and quit processing the
651 * previous recovery. 655 * previous recovery.
652 */ 656 */
653 657
@@ -709,7 +713,8 @@ int dlm_ls_start(struct dlm_ls *ls)
709 kfree(rv_old); 713 kfree(rv_old);
710 } 714 }
711 715
712 dlm_recoverd_kick(ls); 716 set_bit(LSFL_RECOVER_WORK, &ls->ls_flags);
717 wake_up_process(ls->ls_recoverd_task);
713 return 0; 718 return 0;
714 719
715 fail: 720 fail:
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index ef17e0169da1..60a327863b11 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -14,7 +14,7 @@
14#include "dlm_internal.h" 14#include "dlm_internal.h"
15 15
16static uint32_t dlm_nl_seqnum; 16static uint32_t dlm_nl_seqnum;
17static uint32_t listener_nlpid; 17static uint32_t listener_nlportid;
18 18
19static struct genl_family family = { 19static struct genl_family family = {
20 .id = GENL_ID_GENERATE, 20 .id = GENL_ID_GENERATE,
@@ -64,13 +64,13 @@ static int send_data(struct sk_buff *skb)
64 return rv; 64 return rv;
65 } 65 }
66 66
67 return genlmsg_unicast(&init_net, skb, listener_nlpid); 67 return genlmsg_unicast(&init_net, skb, listener_nlportid);
68} 68}
69 69
70static int user_cmd(struct sk_buff *skb, struct genl_info *info) 70static int user_cmd(struct sk_buff *skb, struct genl_info *info)
71{ 71{
72 listener_nlpid = info->snd_pid; 72 listener_nlportid = info->snd_portid;
73 printk("user_cmd nlpid %u\n", listener_nlpid); 73 printk("user_cmd nlpid %u\n", listener_nlportid);
74 return 0; 74 return 0;
75} 75}
76 76
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 87f1a56eab32..9d61947d473a 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -581,7 +581,7 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
581 581
582 spin_lock(&ls->ls_recover_lock); 582 spin_lock(&ls->ls_recover_lock);
583 status = ls->ls_recover_status; 583 status = ls->ls_recover_status;
584 stop = test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); 584 stop = test_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
585 seq = ls->ls_recover_seq; 585 seq = ls->ls_recover_seq;
586 spin_unlock(&ls->ls_recover_lock); 586 spin_unlock(&ls->ls_recover_lock);
587 587
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 88ce65ff021e..32f9f8926ec3 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -41,6 +41,7 @@ static int enable_locking(struct dlm_ls *ls, uint64_t seq)
41 set_bit(LSFL_RUNNING, &ls->ls_flags); 41 set_bit(LSFL_RUNNING, &ls->ls_flags);
42 /* unblocks processes waiting to enter the dlm */ 42 /* unblocks processes waiting to enter the dlm */
43 up_write(&ls->ls_in_recovery); 43 up_write(&ls->ls_in_recovery);
44 clear_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
44 error = 0; 45 error = 0;
45 } 46 }
46 spin_unlock(&ls->ls_recover_lock); 47 spin_unlock(&ls->ls_recover_lock);
@@ -262,7 +263,7 @@ static void do_ls_recovery(struct dlm_ls *ls)
262 rv = ls->ls_recover_args; 263 rv = ls->ls_recover_args;
263 ls->ls_recover_args = NULL; 264 ls->ls_recover_args = NULL;
264 if (rv && ls->ls_recover_seq == rv->seq) 265 if (rv && ls->ls_recover_seq == rv->seq)
265 clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); 266 clear_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
266 spin_unlock(&ls->ls_recover_lock); 267 spin_unlock(&ls->ls_recover_lock);
267 268
268 if (rv) { 269 if (rv) {
@@ -282,26 +283,34 @@ static int dlm_recoverd(void *arg)
282 return -1; 283 return -1;
283 } 284 }
284 285
286 down_write(&ls->ls_in_recovery);
287 set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
288 wake_up(&ls->ls_recover_lock_wait);
289
285 while (!kthread_should_stop()) { 290 while (!kthread_should_stop()) {
286 set_current_state(TASK_INTERRUPTIBLE); 291 set_current_state(TASK_INTERRUPTIBLE);
287 if (!test_bit(LSFL_WORK, &ls->ls_flags)) 292 if (!test_bit(LSFL_RECOVER_WORK, &ls->ls_flags) &&
293 !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags))
288 schedule(); 294 schedule();
289 set_current_state(TASK_RUNNING); 295 set_current_state(TASK_RUNNING);
290 296
291 if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags)) 297 if (test_and_clear_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) {
298 down_write(&ls->ls_in_recovery);
299 set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
300 wake_up(&ls->ls_recover_lock_wait);
301 }
302
303 if (test_and_clear_bit(LSFL_RECOVER_WORK, &ls->ls_flags))
292 do_ls_recovery(ls); 304 do_ls_recovery(ls);
293 } 305 }
294 306
307 if (test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags))
308 up_write(&ls->ls_in_recovery);
309
295 dlm_put_lockspace(ls); 310 dlm_put_lockspace(ls);
296 return 0; 311 return 0;
297} 312}
298 313
299void dlm_recoverd_kick(struct dlm_ls *ls)
300{
301 set_bit(LSFL_WORK, &ls->ls_flags);
302 wake_up_process(ls->ls_recoverd_task);
303}
304
305int dlm_recoverd_start(struct dlm_ls *ls) 314int dlm_recoverd_start(struct dlm_ls *ls)
306{ 315{
307 struct task_struct *p; 316 struct task_struct *p;
diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h
index 866657c5d69d..8856079733fa 100644
--- a/fs/dlm/recoverd.h
+++ b/fs/dlm/recoverd.h
@@ -14,7 +14,6 @@
14#ifndef __RECOVERD_DOT_H__ 14#ifndef __RECOVERD_DOT_H__
15#define __RECOVERD_DOT_H__ 15#define __RECOVERD_DOT_H__
16 16
17void dlm_recoverd_kick(struct dlm_ls *ls);
18void dlm_recoverd_stop(struct dlm_ls *ls); 17void dlm_recoverd_stop(struct dlm_ls *ls);
19int dlm_recoverd_start(struct dlm_ls *ls); 18int dlm_recoverd_start(struct dlm_ls *ls);
20void dlm_recoverd_suspend(struct dlm_ls *ls); 19void dlm_recoverd_suspend(struct dlm_ls *ls);
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index eb4ed9ba3098..7ff49852b0cb 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -503,6 +503,13 @@ static ssize_t device_write(struct file *file, const char __user *buf,
503#endif 503#endif
504 return -EINVAL; 504 return -EINVAL;
505 505
506#ifdef CONFIG_COMPAT
507 if (count > sizeof(struct dlm_write_request32) + DLM_RESNAME_MAXLEN)
508#else
509 if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN)
510#endif
511 return -EINVAL;
512
506 kbuf = kzalloc(count + 1, GFP_NOFS); 513 kbuf = kzalloc(count + 1, GFP_NOFS);
507 if (!kbuf) 514 if (!kbuf)
508 return -ENOMEM; 515 return -ENOMEM;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 44ce5c6a541d..d45ba4568128 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -275,8 +275,14 @@ out:
275 275
276static int ecryptfs_flush(struct file *file, fl_owner_t td) 276static int ecryptfs_flush(struct file *file, fl_owner_t td)
277{ 277{
278 return file->f_mode & FMODE_WRITE 278 struct file *lower_file = ecryptfs_file_to_lower(file);
279 ? filemap_write_and_wait(file->f_mapping) : 0; 279
280 if (lower_file->f_op && lower_file->f_op->flush) {
281 filemap_write_and_wait(file->f_mapping);
282 return lower_file->f_op->flush(lower_file, td);
283 }
284
285 return 0;
280} 286}
281 287
282static int ecryptfs_release(struct inode *inode, struct file *file) 288static int ecryptfs_release(struct inode *inode, struct file *file)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 534b129ea676..cc7709e7c508 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -619,6 +619,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
619 struct dentry *lower_old_dir_dentry; 619 struct dentry *lower_old_dir_dentry;
620 struct dentry *lower_new_dir_dentry; 620 struct dentry *lower_new_dir_dentry;
621 struct dentry *trap = NULL; 621 struct dentry *trap = NULL;
622 struct inode *target_inode;
622 623
623 lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); 624 lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
624 lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); 625 lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
@@ -626,6 +627,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
626 dget(lower_new_dentry); 627 dget(lower_new_dentry);
627 lower_old_dir_dentry = dget_parent(lower_old_dentry); 628 lower_old_dir_dentry = dget_parent(lower_old_dentry);
628 lower_new_dir_dentry = dget_parent(lower_new_dentry); 629 lower_new_dir_dentry = dget_parent(lower_new_dentry);
630 target_inode = new_dentry->d_inode;
629 trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); 631 trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
630 /* source should not be ancestor of target */ 632 /* source should not be ancestor of target */
631 if (trap == lower_old_dentry) { 633 if (trap == lower_old_dentry) {
@@ -641,6 +643,9 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
641 lower_new_dir_dentry->d_inode, lower_new_dentry); 643 lower_new_dir_dentry->d_inode, lower_new_dentry);
642 if (rc) 644 if (rc)
643 goto out_lock; 645 goto out_lock;
646 if (target_inode)
647 fsstack_copy_attr_all(target_inode,
648 ecryptfs_inode_to_lower(target_inode));
644 fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode); 649 fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode);
645 if (new_dir != old_dir) 650 if (new_dir != old_dir)
646 fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode); 651 fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 2768138eefee..4e0886c9e5c4 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -162,6 +162,7 @@ void ecryptfs_put_lower_file(struct inode *inode)
162 inode_info = ecryptfs_inode_to_private(inode); 162 inode_info = ecryptfs_inode_to_private(inode);
163 if (atomic_dec_and_mutex_lock(&inode_info->lower_file_count, 163 if (atomic_dec_and_mutex_lock(&inode_info->lower_file_count,
164 &inode_info->lower_file_mutex)) { 164 &inode_info->lower_file_mutex)) {
165 filemap_write_and_wait(inode->i_mapping);
165 fput(inode_info->lower_file); 166 fput(inode_info->lower_file);
166 inode_info->lower_file = NULL; 167 inode_info->lower_file = NULL;
167 mutex_unlock(&inode_info->lower_file_mutex); 168 mutex_unlock(&inode_info->lower_file_mutex);
@@ -544,11 +545,12 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
544 goto out_free; 545 goto out_free;
545 } 546 }
546 547
547 if (check_ruid && path.dentry->d_inode->i_uid != current_uid()) { 548 if (check_ruid && !uid_eq(path.dentry->d_inode->i_uid, current_uid())) {
548 rc = -EPERM; 549 rc = -EPERM;
549 printk(KERN_ERR "Mount of device (uid: %d) not owned by " 550 printk(KERN_ERR "Mount of device (uid: %d) not owned by "
550 "requested user (uid: %d)\n", 551 "requested user (uid: %d)\n",
551 path.dentry->d_inode->i_uid, current_uid()); 552 i_uid_read(path.dentry->d_inode),
553 from_kuid(&init_user_ns, current_uid()));
552 goto out_free; 554 goto out_free;
553 } 555 }
554 556
@@ -709,6 +711,12 @@ static void ecryptfs_free_kmem_caches(void)
709{ 711{
710 int i; 712 int i;
711 713
714 /*
715 * Make sure all delayed rcu free inodes are flushed before we
716 * destroy cache.
717 */
718 rcu_barrier();
719
712 for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) { 720 for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) {
713 struct ecryptfs_cache_info *info; 721 struct ecryptfs_cache_info *info;
714 722
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index b29bb8bfa8d9..5fa2471796c2 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -33,7 +33,7 @@ static struct hlist_head *ecryptfs_daemon_hash;
33struct mutex ecryptfs_daemon_hash_mux; 33struct mutex ecryptfs_daemon_hash_mux;
34static int ecryptfs_hash_bits; 34static int ecryptfs_hash_bits;
35#define ecryptfs_current_euid_hash(uid) \ 35#define ecryptfs_current_euid_hash(uid) \
36 hash_long((unsigned long)current_euid(), ecryptfs_hash_bits) 36 hash_long((unsigned long)from_kuid(&init_user_ns, current_euid()), ecryptfs_hash_bits)
37 37
38static u32 ecryptfs_msg_counter; 38static u32 ecryptfs_msg_counter;
39static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr; 39static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr;
@@ -121,8 +121,7 @@ int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon)
121 hlist_for_each_entry(*daemon, elem, 121 hlist_for_each_entry(*daemon, elem,
122 &ecryptfs_daemon_hash[ecryptfs_current_euid_hash()], 122 &ecryptfs_daemon_hash[ecryptfs_current_euid_hash()],
123 euid_chain) { 123 euid_chain) {
124 if ((*daemon)->file->f_cred->euid == current_euid() && 124 if (uid_eq((*daemon)->file->f_cred->euid, current_euid())) {
125 (*daemon)->file->f_cred->user_ns == current_user_ns()) {
126 rc = 0; 125 rc = 0;
127 goto out; 126 goto out;
128 } 127 }
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index bc84f365d75c..f3913eb2c474 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -97,8 +97,8 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
97 97
98 inode->i_mode = be16_to_cpu(efs_inode->di_mode); 98 inode->i_mode = be16_to_cpu(efs_inode->di_mode);
99 set_nlink(inode, be16_to_cpu(efs_inode->di_nlink)); 99 set_nlink(inode, be16_to_cpu(efs_inode->di_nlink));
100 inode->i_uid = (uid_t)be16_to_cpu(efs_inode->di_uid); 100 i_uid_write(inode, (uid_t)be16_to_cpu(efs_inode->di_uid));
101 inode->i_gid = (gid_t)be16_to_cpu(efs_inode->di_gid); 101 i_gid_write(inode, (gid_t)be16_to_cpu(efs_inode->di_gid));
102 inode->i_size = be32_to_cpu(efs_inode->di_size); 102 inode->i_size = be32_to_cpu(efs_inode->di_size);
103 inode->i_atime.tv_sec = be32_to_cpu(efs_inode->di_atime); 103 inode->i_atime.tv_sec = be32_to_cpu(efs_inode->di_atime);
104 inode->i_mtime.tv_sec = be32_to_cpu(efs_inode->di_mtime); 104 inode->i_mtime.tv_sec = be32_to_cpu(efs_inode->di_mtime);
diff --git a/fs/efs/super.c b/fs/efs/super.c
index e755ec746c69..2002431ef9a0 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -96,6 +96,11 @@ static int init_inodecache(void)
96 96
97static void destroy_inodecache(void) 97static void destroy_inodecache(void)
98{ 98{
99 /*
100 * Make sure all delayed rcu free inodes are flushed before we
101 * destroy cache.
102 */
103 rcu_barrier();
99 kmem_cache_destroy(efs_inode_cachep); 104 kmem_cache_destroy(efs_inode_cachep);
100} 105}
101 106
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index eedec84c1809..da72250ddc1c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -346,7 +346,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
346/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ 346/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
347static inline int ep_op_has_event(int op) 347static inline int ep_op_has_event(int op)
348{ 348{
349 return op != EPOLL_CTL_DEL; 349 return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD;
350} 350}
351 351
352/* Initialize the poll safe wake up structure */ 352/* Initialize the poll safe wake up structure */
@@ -676,6 +676,34 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
676 return 0; 676 return 0;
677} 677}
678 678
679/*
680 * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item
681 * had no event flags set, indicating that another thread may be currently
682 * handling that item's events (in the case that EPOLLONESHOT was being
683 * used). Otherwise a zero result indicates that the item has been disabled
684 * from receiving events. A disabled item may be re-enabled via
685 * EPOLL_CTL_MOD. Must be called with "mtx" held.
686 */
687static int ep_disable(struct eventpoll *ep, struct epitem *epi)
688{
689 int result = 0;
690 unsigned long flags;
691
692 spin_lock_irqsave(&ep->lock, flags);
693 if (epi->event.events & ~EP_PRIVATE_BITS) {
694 if (ep_is_linked(&epi->rdllink))
695 list_del_init(&epi->rdllink);
696 /* Ensure ep_poll_callback will not add epi back onto ready
697 list: */
698 epi->event.events &= EP_PRIVATE_BITS;
699 }
700 else
701 result = -EBUSY;
702 spin_unlock_irqrestore(&ep->lock, flags);
703
704 return result;
705}
706
679static void ep_free(struct eventpoll *ep) 707static void ep_free(struct eventpoll *ep)
680{ 708{
681 struct rb_node *rbp; 709 struct rb_node *rbp;
@@ -1020,8 +1048,6 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
1020 rb_insert_color(&epi->rbn, &ep->rbr); 1048 rb_insert_color(&epi->rbn, &ep->rbr);
1021} 1049}
1022 1050
1023
1024
1025#define PATH_ARR_SIZE 5 1051#define PATH_ARR_SIZE 5
1026/* 1052/*
1027 * These are the number paths of length 1 to 5, that we are allowing to emanate 1053 * These are the number paths of length 1 to 5, that we are allowing to emanate
@@ -1787,6 +1813,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1787 } else 1813 } else
1788 error = -ENOENT; 1814 error = -ENOENT;
1789 break; 1815 break;
1816 case EPOLL_CTL_DISABLE:
1817 if (epi)
1818 error = ep_disable(ep, epi);
1819 else
1820 error = -ENOENT;
1821 break;
1790 } 1822 }
1791 mutex_unlock(&ep->mtx); 1823 mutex_unlock(&ep->mtx);
1792 1824
@@ -1810,7 +1842,7 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
1810 int, maxevents, int, timeout) 1842 int, maxevents, int, timeout)
1811{ 1843{
1812 int error; 1844 int error;
1813 struct file *file; 1845 struct fd f;
1814 struct eventpoll *ep; 1846 struct eventpoll *ep;
1815 1847
1816 /* The maximum number of event must be greater than zero */ 1848 /* The maximum number of event must be greater than zero */
@@ -1818,38 +1850,33 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
1818 return -EINVAL; 1850 return -EINVAL;
1819 1851
1820 /* Verify that the area passed by the user is writeable */ 1852 /* Verify that the area passed by the user is writeable */
1821 if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) { 1853 if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
1822 error = -EFAULT; 1854 return -EFAULT;
1823 goto error_return;
1824 }
1825 1855
1826 /* Get the "struct file *" for the eventpoll file */ 1856 /* Get the "struct file *" for the eventpoll file */
1827 error = -EBADF; 1857 f = fdget(epfd);
1828 file = fget(epfd); 1858 if (!f.file)
1829 if (!file) 1859 return -EBADF;
1830 goto error_return;
1831 1860
1832 /* 1861 /*
1833 * We have to check that the file structure underneath the fd 1862 * We have to check that the file structure underneath the fd
1834 * the user passed to us _is_ an eventpoll file. 1863 * the user passed to us _is_ an eventpoll file.
1835 */ 1864 */
1836 error = -EINVAL; 1865 error = -EINVAL;
1837 if (!is_file_epoll(file)) 1866 if (!is_file_epoll(f.file))
1838 goto error_fput; 1867 goto error_fput;
1839 1868
1840 /* 1869 /*
1841 * At this point it is safe to assume that the "private_data" contains 1870 * At this point it is safe to assume that the "private_data" contains
1842 * our own data structure. 1871 * our own data structure.
1843 */ 1872 */
1844 ep = file->private_data; 1873 ep = f.file->private_data;
1845 1874
1846 /* Time to fish for events ... */ 1875 /* Time to fish for events ... */
1847 error = ep_poll(ep, events, maxevents, timeout); 1876 error = ep_poll(ep, events, maxevents, timeout);
1848 1877
1849error_fput: 1878error_fput:
1850 fput(file); 1879 fdput(f);
1851error_return:
1852
1853 return error; 1880 return error;
1854} 1881}
1855 1882
diff --git a/fs/exec.c b/fs/exec.c
index 574cf4de4ec3..8b9011b67041 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -59,26 +59,15 @@
59#include <asm/uaccess.h> 59#include <asm/uaccess.h>
60#include <asm/mmu_context.h> 60#include <asm/mmu_context.h>
61#include <asm/tlb.h> 61#include <asm/tlb.h>
62#include <asm/exec.h>
63 62
64#include <trace/events/task.h> 63#include <trace/events/task.h>
65#include "internal.h" 64#include "internal.h"
65#include "coredump.h"
66 66
67#include <trace/events/sched.h> 67#include <trace/events/sched.h>
68 68
69int core_uses_pid;
70char core_pattern[CORENAME_MAX_SIZE] = "core";
71unsigned int core_pipe_limit;
72int suid_dumpable = 0; 69int suid_dumpable = 0;
73 70
74struct core_name {
75 char *corename;
76 int used, size;
77};
78static atomic_t call_count = ATOMIC_INIT(1);
79
80/* The maximal length of core_pattern is also specified in sysctl.c */
81
82static LIST_HEAD(formats); 71static LIST_HEAD(formats);
83static DEFINE_RWLOCK(binfmt_lock); 72static DEFINE_RWLOCK(binfmt_lock);
84 73
@@ -116,7 +105,7 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
116SYSCALL_DEFINE1(uselib, const char __user *, library) 105SYSCALL_DEFINE1(uselib, const char __user *, library)
117{ 106{
118 struct file *file; 107 struct file *file;
119 char *tmp = getname(library); 108 struct filename *tmp = getname(library);
120 int error = PTR_ERR(tmp); 109 int error = PTR_ERR(tmp);
121 static const struct open_flags uselib_flags = { 110 static const struct open_flags uselib_flags = {
122 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 111 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
@@ -402,7 +391,7 @@ struct user_arg_ptr {
402 union { 391 union {
403 const char __user *const __user *native; 392 const char __user *const __user *native;
404#ifdef CONFIG_COMPAT 393#ifdef CONFIG_COMPAT
405 compat_uptr_t __user *compat; 394 const compat_uptr_t __user *compat;
406#endif 395#endif
407 } ptr; 396 } ptr;
408}; 397};
@@ -613,7 +602,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
613 * process cleanup to remove whatever mess we made. 602 * process cleanup to remove whatever mess we made.
614 */ 603 */
615 if (length != move_page_tables(vma, old_start, 604 if (length != move_page_tables(vma, old_start,
616 vma, new_start, length)) 605 vma, new_start, length, false))
617 return -ENOMEM; 606 return -ENOMEM;
618 607
619 lru_add_drain(); 608 lru_add_drain();
@@ -762,13 +751,14 @@ struct file *open_exec(const char *name)
762{ 751{
763 struct file *file; 752 struct file *file;
764 int err; 753 int err;
754 struct filename tmp = { .name = name };
765 static const struct open_flags open_exec_flags = { 755 static const struct open_flags open_exec_flags = {
766 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 756 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
767 .acc_mode = MAY_EXEC | MAY_OPEN, 757 .acc_mode = MAY_EXEC | MAY_OPEN,
768 .intent = LOOKUP_OPEN 758 .intent = LOOKUP_OPEN
769 }; 759 };
770 760
771 file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW); 761 file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags, LOOKUP_FOLLOW);
772 if (IS_ERR(file)) 762 if (IS_ERR(file))
773 goto out; 763 goto out;
774 764
@@ -888,9 +878,11 @@ static int de_thread(struct task_struct *tsk)
888 sig->notify_count--; 878 sig->notify_count--;
889 879
890 while (sig->notify_count) { 880 while (sig->notify_count) {
891 __set_current_state(TASK_UNINTERRUPTIBLE); 881 __set_current_state(TASK_KILLABLE);
892 spin_unlock_irq(lock); 882 spin_unlock_irq(lock);
893 schedule(); 883 schedule();
884 if (unlikely(__fatal_signal_pending(tsk)))
885 goto killed;
894 spin_lock_irq(lock); 886 spin_lock_irq(lock);
895 } 887 }
896 spin_unlock_irq(lock); 888 spin_unlock_irq(lock);
@@ -908,9 +900,11 @@ static int de_thread(struct task_struct *tsk)
908 write_lock_irq(&tasklist_lock); 900 write_lock_irq(&tasklist_lock);
909 if (likely(leader->exit_state)) 901 if (likely(leader->exit_state))
910 break; 902 break;
911 __set_current_state(TASK_UNINTERRUPTIBLE); 903 __set_current_state(TASK_KILLABLE);
912 write_unlock_irq(&tasklist_lock); 904 write_unlock_irq(&tasklist_lock);
913 schedule(); 905 schedule();
906 if (unlikely(__fatal_signal_pending(tsk)))
907 goto killed;
914 } 908 }
915 909
916 /* 910 /*
@@ -1004,40 +998,14 @@ no_thread_group:
1004 998
1005 BUG_ON(!thread_group_leader(tsk)); 999 BUG_ON(!thread_group_leader(tsk));
1006 return 0; 1000 return 0;
1007}
1008
1009/*
1010 * These functions flushes out all traces of the currently running executable
1011 * so that a new one can be started
1012 */
1013static void flush_old_files(struct files_struct * files)
1014{
1015 long j = -1;
1016 struct fdtable *fdt;
1017
1018 spin_lock(&files->file_lock);
1019 for (;;) {
1020 unsigned long set, i;
1021 1001
1022 j++; 1002killed:
1023 i = j * BITS_PER_LONG; 1003 /* protects against exit_notify() and __exit_signal() */
1024 fdt = files_fdtable(files); 1004 read_lock(&tasklist_lock);
1025 if (i >= fdt->max_fds) 1005 sig->group_exit_task = NULL;
1026 break; 1006 sig->notify_count = 0;
1027 set = fdt->close_on_exec[j]; 1007 read_unlock(&tasklist_lock);
1028 if (!set) 1008 return -EAGAIN;
1029 continue;
1030 fdt->close_on_exec[j] = 0;
1031 spin_unlock(&files->file_lock);
1032 for ( ; set ; i++,set >>= 1) {
1033 if (set & 1) {
1034 sys_close(i);
1035 }
1036 }
1037 spin_lock(&files->file_lock);
1038
1039 }
1040 spin_unlock(&files->file_lock);
1041} 1009}
1042 1010
1043char *get_task_comm(char *buf, struct task_struct *tsk) 1011char *get_task_comm(char *buf, struct task_struct *tsk)
@@ -1050,6 +1018,11 @@ char *get_task_comm(char *buf, struct task_struct *tsk)
1050} 1018}
1051EXPORT_SYMBOL_GPL(get_task_comm); 1019EXPORT_SYMBOL_GPL(get_task_comm);
1052 1020
1021/*
1022 * These functions flushes out all traces of the currently running executable
1023 * so that a new one can be started
1024 */
1025
1053void set_task_comm(struct task_struct *tsk, char *buf) 1026void set_task_comm(struct task_struct *tsk, char *buf)
1054{ 1027{
1055 task_lock(tsk); 1028 task_lock(tsk);
@@ -1136,7 +1109,7 @@ void setup_new_exec(struct linux_binprm * bprm)
1136 current->sas_ss_sp = current->sas_ss_size = 0; 1109 current->sas_ss_sp = current->sas_ss_size = 0;
1137 1110
1138 if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid())) 1111 if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))
1139 set_dumpable(current->mm, 1); 1112 set_dumpable(current->mm, SUID_DUMPABLE_ENABLED);
1140 else 1113 else
1141 set_dumpable(current->mm, suid_dumpable); 1114 set_dumpable(current->mm, suid_dumpable);
1142 1115
@@ -1171,7 +1144,7 @@ void setup_new_exec(struct linux_binprm * bprm)
1171 current->self_exec_id++; 1144 current->self_exec_id++;
1172 1145
1173 flush_signal_handlers(current, 0); 1146 flush_signal_handlers(current, 0);
1174 flush_old_files(current->files); 1147 do_close_on_exec(current->files);
1175} 1148}
1176EXPORT_SYMBOL(setup_new_exec); 1149EXPORT_SYMBOL(setup_new_exec);
1177 1150
@@ -1601,9 +1574,9 @@ int do_execve(const char *filename,
1601} 1574}
1602 1575
1603#ifdef CONFIG_COMPAT 1576#ifdef CONFIG_COMPAT
1604int compat_do_execve(char *filename, 1577int compat_do_execve(const char *filename,
1605 compat_uptr_t __user *__argv, 1578 const compat_uptr_t __user *__argv,
1606 compat_uptr_t __user *__envp, 1579 const compat_uptr_t __user *__envp,
1607 struct pt_regs *regs) 1580 struct pt_regs *regs)
1608{ 1581{
1609 struct user_arg_ptr argv = { 1582 struct user_arg_ptr argv = {
@@ -1632,353 +1605,6 @@ void set_binfmt(struct linux_binfmt *new)
1632 1605
1633EXPORT_SYMBOL(set_binfmt); 1606EXPORT_SYMBOL(set_binfmt);
1634 1607
1635static int expand_corename(struct core_name *cn)
1636{
1637 char *old_corename = cn->corename;
1638
1639 cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
1640 cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
1641
1642 if (!cn->corename) {
1643 kfree(old_corename);
1644 return -ENOMEM;
1645 }
1646
1647 return 0;
1648}
1649
1650static int cn_printf(struct core_name *cn, const char *fmt, ...)
1651{
1652 char *cur;
1653 int need;
1654 int ret;
1655 va_list arg;
1656
1657 va_start(arg, fmt);
1658 need = vsnprintf(NULL, 0, fmt, arg);
1659 va_end(arg);
1660
1661 if (likely(need < cn->size - cn->used - 1))
1662 goto out_printf;
1663
1664 ret = expand_corename(cn);
1665 if (ret)
1666 goto expand_fail;
1667
1668out_printf:
1669 cur = cn->corename + cn->used;
1670 va_start(arg, fmt);
1671 vsnprintf(cur, need + 1, fmt, arg);
1672 va_end(arg);
1673 cn->used += need;
1674 return 0;
1675
1676expand_fail:
1677 return ret;
1678}
1679
1680static void cn_escape(char *str)
1681{
1682 for (; *str; str++)
1683 if (*str == '/')
1684 *str = '!';
1685}
1686
1687static int cn_print_exe_file(struct core_name *cn)
1688{
1689 struct file *exe_file;
1690 char *pathbuf, *path;
1691 int ret;
1692
1693 exe_file = get_mm_exe_file(current->mm);
1694 if (!exe_file) {
1695 char *commstart = cn->corename + cn->used;
1696 ret = cn_printf(cn, "%s (path unknown)", current->comm);
1697 cn_escape(commstart);
1698 return ret;
1699 }
1700
1701 pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
1702 if (!pathbuf) {
1703 ret = -ENOMEM;
1704 goto put_exe_file;
1705 }
1706
1707 path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
1708 if (IS_ERR(path)) {
1709 ret = PTR_ERR(path);
1710 goto free_buf;
1711 }
1712
1713 cn_escape(path);
1714
1715 ret = cn_printf(cn, "%s", path);
1716
1717free_buf:
1718 kfree(pathbuf);
1719put_exe_file:
1720 fput(exe_file);
1721 return ret;
1722}
1723
1724/* format_corename will inspect the pattern parameter, and output a
1725 * name into corename, which must have space for at least
1726 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
1727 */
1728static int format_corename(struct core_name *cn, long signr)
1729{
1730 const struct cred *cred = current_cred();
1731 const char *pat_ptr = core_pattern;
1732 int ispipe = (*pat_ptr == '|');
1733 int pid_in_pattern = 0;
1734 int err = 0;
1735
1736 cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
1737 cn->corename = kmalloc(cn->size, GFP_KERNEL);
1738 cn->used = 0;
1739
1740 if (!cn->corename)
1741 return -ENOMEM;
1742
1743 /* Repeat as long as we have more pattern to process and more output
1744 space */
1745 while (*pat_ptr) {
1746 if (*pat_ptr != '%') {
1747 if (*pat_ptr == 0)
1748 goto out;
1749 err = cn_printf(cn, "%c", *pat_ptr++);
1750 } else {
1751 switch (*++pat_ptr) {
1752 /* single % at the end, drop that */
1753 case 0:
1754 goto out;
1755 /* Double percent, output one percent */
1756 case '%':
1757 err = cn_printf(cn, "%c", '%');
1758 break;
1759 /* pid */
1760 case 'p':
1761 pid_in_pattern = 1;
1762 err = cn_printf(cn, "%d",
1763 task_tgid_vnr(current));
1764 break;
1765 /* uid */
1766 case 'u':
1767 err = cn_printf(cn, "%d", cred->uid);
1768 break;
1769 /* gid */
1770 case 'g':
1771 err = cn_printf(cn, "%d", cred->gid);
1772 break;
1773 /* signal that caused the coredump */
1774 case 's':
1775 err = cn_printf(cn, "%ld", signr);
1776 break;
1777 /* UNIX time of coredump */
1778 case 't': {
1779 struct timeval tv;
1780 do_gettimeofday(&tv);
1781 err = cn_printf(cn, "%lu", tv.tv_sec);
1782 break;
1783 }
1784 /* hostname */
1785 case 'h': {
1786 char *namestart = cn->corename + cn->used;
1787 down_read(&uts_sem);
1788 err = cn_printf(cn, "%s",
1789 utsname()->nodename);
1790 up_read(&uts_sem);
1791 cn_escape(namestart);
1792 break;
1793 }
1794 /* executable */
1795 case 'e': {
1796 char *commstart = cn->corename + cn->used;
1797 err = cn_printf(cn, "%s", current->comm);
1798 cn_escape(commstart);
1799 break;
1800 }
1801 case 'E':
1802 err = cn_print_exe_file(cn);
1803 break;
1804 /* core limit size */
1805 case 'c':
1806 err = cn_printf(cn, "%lu",
1807 rlimit(RLIMIT_CORE));
1808 break;
1809 default:
1810 break;
1811 }
1812 ++pat_ptr;
1813 }
1814
1815 if (err)
1816 return err;
1817 }
1818
1819 /* Backward compatibility with core_uses_pid:
1820 *
1821 * If core_pattern does not include a %p (as is the default)
1822 * and core_uses_pid is set, then .%pid will be appended to
1823 * the filename. Do not do this for piped commands. */
1824 if (!ispipe && !pid_in_pattern && core_uses_pid) {
1825 err = cn_printf(cn, ".%d", task_tgid_vnr(current));
1826 if (err)
1827 return err;
1828 }
1829out:
1830 return ispipe;
1831}
1832
1833static int zap_process(struct task_struct *start, int exit_code)
1834{
1835 struct task_struct *t;
1836 int nr = 0;
1837
1838 start->signal->flags = SIGNAL_GROUP_EXIT;
1839 start->signal->group_exit_code = exit_code;
1840 start->signal->group_stop_count = 0;
1841
1842 t = start;
1843 do {
1844 task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
1845 if (t != current && t->mm) {
1846 sigaddset(&t->pending.signal, SIGKILL);
1847 signal_wake_up(t, 1);
1848 nr++;
1849 }
1850 } while_each_thread(start, t);
1851
1852 return nr;
1853}
1854
1855static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
1856 struct core_state *core_state, int exit_code)
1857{
1858 struct task_struct *g, *p;
1859 unsigned long flags;
1860 int nr = -EAGAIN;
1861
1862 spin_lock_irq(&tsk->sighand->siglock);
1863 if (!signal_group_exit(tsk->signal)) {
1864 mm->core_state = core_state;
1865 nr = zap_process(tsk, exit_code);
1866 }
1867 spin_unlock_irq(&tsk->sighand->siglock);
1868 if (unlikely(nr < 0))
1869 return nr;
1870
1871 if (atomic_read(&mm->mm_users) == nr + 1)
1872 goto done;
1873 /*
1874 * We should find and kill all tasks which use this mm, and we should
1875 * count them correctly into ->nr_threads. We don't take tasklist
1876 * lock, but this is safe wrt:
1877 *
1878 * fork:
1879 * None of sub-threads can fork after zap_process(leader). All
1880 * processes which were created before this point should be
1881 * visible to zap_threads() because copy_process() adds the new
1882 * process to the tail of init_task.tasks list, and lock/unlock
1883 * of ->siglock provides a memory barrier.
1884 *
1885 * do_exit:
1886 * The caller holds mm->mmap_sem. This means that the task which
1887 * uses this mm can't pass exit_mm(), so it can't exit or clear
1888 * its ->mm.
1889 *
1890 * de_thread:
1891 * It does list_replace_rcu(&leader->tasks, &current->tasks),
1892 * we must see either old or new leader, this does not matter.
1893 * However, it can change p->sighand, so lock_task_sighand(p)
1894 * must be used. Since p->mm != NULL and we hold ->mmap_sem
1895 * it can't fail.
1896 *
1897 * Note also that "g" can be the old leader with ->mm == NULL
1898 * and already unhashed and thus removed from ->thread_group.
1899 * This is OK, __unhash_process()->list_del_rcu() does not
1900 * clear the ->next pointer, we will find the new leader via
1901 * next_thread().
1902 */
1903 rcu_read_lock();
1904 for_each_process(g) {
1905 if (g == tsk->group_leader)
1906 continue;
1907 if (g->flags & PF_KTHREAD)
1908 continue;
1909 p = g;
1910 do {
1911 if (p->mm) {
1912 if (unlikely(p->mm == mm)) {
1913 lock_task_sighand(p, &flags);
1914 nr += zap_process(p, exit_code);
1915 unlock_task_sighand(p, &flags);
1916 }
1917 break;
1918 }
1919 } while_each_thread(g, p);
1920 }
1921 rcu_read_unlock();
1922done:
1923 atomic_set(&core_state->nr_threads, nr);
1924 return nr;
1925}
1926
1927static int coredump_wait(int exit_code, struct core_state *core_state)
1928{
1929 struct task_struct *tsk = current;
1930 struct mm_struct *mm = tsk->mm;
1931 int core_waiters = -EBUSY;
1932
1933 init_completion(&core_state->startup);
1934 core_state->dumper.task = tsk;
1935 core_state->dumper.next = NULL;
1936
1937 down_write(&mm->mmap_sem);
1938 if (!mm->core_state)
1939 core_waiters = zap_threads(tsk, mm, core_state, exit_code);
1940 up_write(&mm->mmap_sem);
1941
1942 if (core_waiters > 0) {
1943 struct core_thread *ptr;
1944
1945 wait_for_completion(&core_state->startup);
1946 /*
1947 * Wait for all the threads to become inactive, so that
1948 * all the thread context (extended register state, like
1949 * fpu etc) gets copied to the memory.
1950 */
1951 ptr = core_state->dumper.next;
1952 while (ptr != NULL) {
1953 wait_task_inactive(ptr->task, 0);
1954 ptr = ptr->next;
1955 }
1956 }
1957
1958 return core_waiters;
1959}
1960
1961static void coredump_finish(struct mm_struct *mm)
1962{
1963 struct core_thread *curr, *next;
1964 struct task_struct *task;
1965
1966 next = mm->core_state->dumper.next;
1967 while ((curr = next) != NULL) {
1968 next = curr->next;
1969 task = curr->task;
1970 /*
1971 * see exit_mm(), curr->task must not see
1972 * ->task == NULL before we read ->next.
1973 */
1974 smp_mb();
1975 curr->task = NULL;
1976 wake_up_process(task);
1977 }
1978
1979 mm->core_state = NULL;
1980}
1981
1982/* 1608/*
1983 * set_dumpable converts traditional three-value dumpable to two flags and 1609 * set_dumpable converts traditional three-value dumpable to two flags and
1984 * stores them into mm->flags. It modifies lower two bits of mm->flags, but 1610 * stores them into mm->flags. It modifies lower two bits of mm->flags, but
@@ -2020,7 +1646,7 @@ void set_dumpable(struct mm_struct *mm, int value)
2020 } 1646 }
2021} 1647}
2022 1648
2023static int __get_dumpable(unsigned long mm_flags) 1649int __get_dumpable(unsigned long mm_flags)
2024{ 1650{
2025 int ret; 1651 int ret;
2026 1652
@@ -2033,289 +1659,55 @@ int get_dumpable(struct mm_struct *mm)
2033 return __get_dumpable(mm->flags); 1659 return __get_dumpable(mm->flags);
2034} 1660}
2035 1661
/*
 * Keep the core-dump pipe alive until the usermode helper has drained
 * it: pose as an extra reader (and drop our writer count) and sleep
 * until we are the only reader left, or a signal arrives.
 */
static void wait_for_dump_helpers(struct file *file)
{
	struct pipe_inode_info *pipe;

	pipe = file->f_path.dentry->d_inode->i_pipe;

	pipe_lock(pipe);
	pipe->readers++;
	pipe->writers--;

	/* readers > 1 means the helper still holds its read end open */
	while ((pipe->readers > 1) && (!signal_pending(current))) {
		wake_up_interruptible_sync(&pipe->wait);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		pipe_wait(pipe);
	}

	/* restore the counts we borrowed above */
	pipe->readers--;
	pipe->writers++;
	pipe_unlock(pipe);

}
2057 1676#ifdef CONFIG_COMPAT
2058 1677asmlinkage long compat_sys_execve(const char __user * filename,
/*
 * umh_pipe_setup
 * helper function to customize the process used
 * to collect the core in userspace.  Specifically
 * it sets up a pipe and installs it as fd 0 (stdin)
 * for the process.  Returns 0 on success, or
 * PTR_ERR on failure.
 * Note that it also sets the core limit to 1.  This
 * is a special value that we use to trap recursive
 * core dumps
 */
static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
{
	struct file *files[2];
	struct fdtable *fdt;
	struct coredump_params *cp = (struct coredump_params *)info->data;
	struct files_struct *cf = current->files;
	int err = create_pipe_files(files, 0);
	if (err)
		return err;

	/* files[1] is the write end: the kernel dumps the core into it */
	cp->file = files[1];

	/* install the read end as the helper's stdin (fd 0) */
	sys_close(0);
	fd_install(0, files[0]);
	spin_lock(&cf->file_lock);
	fdt = files_fdtable(cf);
	/* mark fd 0 open and strip close-on-exec so the helper inherits it */
	__set_open_fd(0, fdt);
	__clear_close_on_exec(0, fdt);
	spin_unlock(&cf->file_lock);

	/* and disallow core files too */
	current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};

	return 0;
}
1690#endif
1691#endif
2095 1692
2096void do_coredump(long signr, int exit_code, struct pt_regs *regs) 1693#ifdef __ARCH_WANT_KERNEL_EXECVE
1694int kernel_execve(const char *filename,
1695 const char *const argv[],
1696 const char *const envp[])
2097{ 1697{
2098 struct core_state core_state; 1698 struct pt_regs *p = current_pt_regs();
2099 struct core_name cn; 1699 int ret;
2100 struct mm_struct *mm = current->mm;
2101 struct linux_binfmt * binfmt;
2102 const struct cred *old_cred;
2103 struct cred *cred;
2104 int retval = 0;
2105 int flag = 0;
2106 int ispipe;
2107 bool need_nonrelative = false;
2108 static atomic_t core_dump_count = ATOMIC_INIT(0);
2109 struct coredump_params cprm = {
2110 .signr = signr,
2111 .regs = regs,
2112 .limit = rlimit(RLIMIT_CORE),
2113 /*
2114 * We must use the same mm->flags while dumping core to avoid
2115 * inconsistency of bit flags, since this flag is not protected
2116 * by any locks.
2117 */
2118 .mm_flags = mm->flags,
2119 };
2120
2121 audit_core_dumps(signr);
2122
2123 binfmt = mm->binfmt;
2124 if (!binfmt || !binfmt->core_dump)
2125 goto fail;
2126 if (!__get_dumpable(cprm.mm_flags))
2127 goto fail;
2128
2129 cred = prepare_creds();
2130 if (!cred)
2131 goto fail;
2132 /*
2133 * We cannot trust fsuid as being the "true" uid of the process
2134 * nor do we know its entire history. We only know it was tainted
2135 * so we dump it as root in mode 2, and only into a controlled
2136 * environment (pipe handler or fully qualified path).
2137 */
2138 if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) {
2139 /* Setuid core dump mode */
2140 flag = O_EXCL; /* Stop rewrite attacks */
2141 cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */
2142 need_nonrelative = true;
2143 }
2144
2145 retval = coredump_wait(exit_code, &core_state);
2146 if (retval < 0)
2147 goto fail_creds;
2148 1700
2149 old_cred = override_creds(cred); 1701 ret = do_execve(filename,
1702 (const char __user *const __user *)argv,
1703 (const char __user *const __user *)envp, p);
1704 if (ret < 0)
1705 return ret;
2150 1706
2151 /* 1707 /*
2152 * Clear any false indication of pending signals that might 1708 * We were successful. We won't be returning to our caller, but
2153 * be seen by the filesystem code called to write the core file. 1709 * instead to user space by manipulating the kernel stack.
2154 */ 1710 */
2155 clear_thread_flag(TIF_SIGPENDING); 1711 ret_from_kernel_execve(p);
2156
2157 ispipe = format_corename(&cn, signr);
2158
2159 if (ispipe) {
2160 int dump_count;
2161 char **helper_argv;
2162
2163 if (ispipe < 0) {
2164 printk(KERN_WARNING "format_corename failed\n");
2165 printk(KERN_WARNING "Aborting core\n");
2166 goto fail_corename;
2167 }
2168
2169 if (cprm.limit == 1) {
2170 /* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
2171 *
2172 * Normally core limits are irrelevant to pipes, since
2173 * we're not writing to the file system, but we use
2174 * cprm.limit of 1 here as a speacial value, this is a
2175 * consistent way to catch recursive crashes.
2176 * We can still crash if the core_pattern binary sets
2177 * RLIM_CORE = !1, but it runs as root, and can do
2178 * lots of stupid things.
2179 *
2180 * Note that we use task_tgid_vnr here to grab the pid
2181 * of the process group leader. That way we get the
2182 * right pid if a thread in a multi-threaded
2183 * core_pattern process dies.
2184 */
2185 printk(KERN_WARNING
2186 "Process %d(%s) has RLIMIT_CORE set to 1\n",
2187 task_tgid_vnr(current), current->comm);
2188 printk(KERN_WARNING "Aborting core\n");
2189 goto fail_unlock;
2190 }
2191 cprm.limit = RLIM_INFINITY;
2192
2193 dump_count = atomic_inc_return(&core_dump_count);
2194 if (core_pipe_limit && (core_pipe_limit < dump_count)) {
2195 printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
2196 task_tgid_vnr(current), current->comm);
2197 printk(KERN_WARNING "Skipping core dump\n");
2198 goto fail_dropcount;
2199 }
2200
2201 helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
2202 if (!helper_argv) {
2203 printk(KERN_WARNING "%s failed to allocate memory\n",
2204 __func__);
2205 goto fail_dropcount;
2206 }
2207
2208 retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
2209 NULL, UMH_WAIT_EXEC, umh_pipe_setup,
2210 NULL, &cprm);
2211 argv_free(helper_argv);
2212 if (retval) {
2213 printk(KERN_INFO "Core dump to %s pipe failed\n",
2214 cn.corename);
2215 goto close_fail;
2216 }
2217 } else {
2218 struct inode *inode;
2219
2220 if (cprm.limit < binfmt->min_coredump)
2221 goto fail_unlock;
2222
2223 if (need_nonrelative && cn.corename[0] != '/') {
2224 printk(KERN_WARNING "Pid %d(%s) can only dump core "\
2225 "to fully qualified path!\n",
2226 task_tgid_vnr(current), current->comm);
2227 printk(KERN_WARNING "Skipping core dump\n");
2228 goto fail_unlock;
2229 }
2230
2231 cprm.file = filp_open(cn.corename,
2232 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
2233 0600);
2234 if (IS_ERR(cprm.file))
2235 goto fail_unlock;
2236
2237 inode = cprm.file->f_path.dentry->d_inode;
2238 if (inode->i_nlink > 1)
2239 goto close_fail;
2240 if (d_unhashed(cprm.file->f_path.dentry))
2241 goto close_fail;
2242 /*
2243 * AK: actually i see no reason to not allow this for named
2244 * pipes etc, but keep the previous behaviour for now.
2245 */
2246 if (!S_ISREG(inode->i_mode))
2247 goto close_fail;
2248 /*
2249 * Dont allow local users get cute and trick others to coredump
2250 * into their pre-created files.
2251 */
2252 if (!uid_eq(inode->i_uid, current_fsuid()))
2253 goto close_fail;
2254 if (!cprm.file->f_op || !cprm.file->f_op->write)
2255 goto close_fail;
2256 if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
2257 goto close_fail;
2258 }
2259
2260 retval = binfmt->core_dump(&cprm);
2261 if (retval)
2262 current->signal->group_exit_code |= 0x80;
2263
2264 if (ispipe && core_pipe_limit)
2265 wait_for_dump_helpers(cprm.file);
2266close_fail:
2267 if (cprm.file)
2268 filp_close(cprm.file, NULL);
2269fail_dropcount:
2270 if (ispipe)
2271 atomic_dec(&core_dump_count);
2272fail_unlock:
2273 kfree(cn.corename);
2274fail_corename:
2275 coredump_finish(mm);
2276 revert_creds(old_cred);
2277fail_creds:
2278 put_cred(cred);
2279fail:
2280 return;
2281}
2282
2283/*
2284 * Core dumping helper functions. These are the only things you should
2285 * do on a core-file: use only these functions to write out all the
2286 * necessary info.
2287 */
2288int dump_write(struct file *file, const void *addr, int nr)
2289{
2290 return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
2291} 1712}
2292EXPORT_SYMBOL(dump_write); 1713#endif
2293
2294int dump_seek(struct file *file, loff_t off)
2295{
2296 int ret = 1;
2297
2298 if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
2299 if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
2300 return 0;
2301 } else {
2302 char *buf = (char *)get_zeroed_page(GFP_KERNEL);
2303
2304 if (!buf)
2305 return 0;
2306 while (off > 0) {
2307 unsigned long n = off;
2308
2309 if (n > PAGE_SIZE)
2310 n = PAGE_SIZE;
2311 if (!dump_write(file, buf, n)) {
2312 ret = 0;
2313 break;
2314 }
2315 off -= n;
2316 }
2317 free_page((unsigned long)buf);
2318 }
2319 return ret;
2320}
2321EXPORT_SYMBOL(dump_seek);
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 1562c27a2fab..b56181047751 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1172,8 +1172,8 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
1172 1172
1173 /* copy stuff from on-disk struct to in-memory struct */ 1173 /* copy stuff from on-disk struct to in-memory struct */
1174 inode->i_mode = le16_to_cpu(fcb.i_mode); 1174 inode->i_mode = le16_to_cpu(fcb.i_mode);
1175 inode->i_uid = le32_to_cpu(fcb.i_uid); 1175 i_uid_write(inode, le32_to_cpu(fcb.i_uid));
1176 inode->i_gid = le32_to_cpu(fcb.i_gid); 1176 i_gid_write(inode, le32_to_cpu(fcb.i_gid));
1177 set_nlink(inode, le16_to_cpu(fcb.i_links_count)); 1177 set_nlink(inode, le16_to_cpu(fcb.i_links_count));
1178 inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime); 1178 inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
1179 inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime); 1179 inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
@@ -1385,8 +1385,8 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1385 fcb = &args->fcb; 1385 fcb = &args->fcb;
1386 1386
1387 fcb->i_mode = cpu_to_le16(inode->i_mode); 1387 fcb->i_mode = cpu_to_le16(inode->i_mode);
1388 fcb->i_uid = cpu_to_le32(inode->i_uid); 1388 fcb->i_uid = cpu_to_le32(i_uid_read(inode));
1389 fcb->i_gid = cpu_to_le32(inode->i_gid); 1389 fcb->i_gid = cpu_to_le32(i_gid_read(inode));
1390 fcb->i_links_count = cpu_to_le16(inode->i_nlink); 1390 fcb->i_links_count = cpu_to_le16(inode->i_nlink);
1391 fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 1391 fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
1392 fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec); 1392 fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 1585db1aa365..f936cb50dc0d 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -814,8 +814,8 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
814 struct bio *bio; 814 struct bio *bio;
815 815
816 if (per_dev != master_dev) { 816 if (per_dev != master_dev) {
817 bio = bio_kmalloc(GFP_KERNEL, 817 bio = bio_clone_kmalloc(master_dev->bio,
818 master_dev->bio->bi_max_vecs); 818 GFP_KERNEL);
819 if (unlikely(!bio)) { 819 if (unlikely(!bio)) {
820 ORE_DBGMSG( 820 ORE_DBGMSG(
821 "Failed to allocate BIO size=%u\n", 821 "Failed to allocate BIO size=%u\n",
@@ -824,7 +824,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
824 goto out; 824 goto out;
825 } 825 }
826 826
827 __bio_clone(bio, master_dev->bio);
828 bio->bi_bdev = NULL; 827 bio->bi_bdev = NULL;
829 bio->bi_next = NULL; 828 bio->bi_next = NULL;
830 per_dev->offset = master_dev->offset; 829 per_dev->offset = master_dev->offset;
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index 5f376d14fdcc..b963f38ac298 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -203,7 +203,7 @@ static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d)
203 203
204static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) 204static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
205{ 205{
206 unsigned p; 206 int p;
207 207
208 for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { 208 for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
209 struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; 209 struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index dde41a75c7c8..5e59280d42d7 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -206,6 +206,11 @@ static int init_inodecache(void)
206 */ 206 */
207static void destroy_inodecache(void) 207static void destroy_inodecache(void)
208{ 208{
209 /*
210 * Make sure all delayed rcu free inodes are flushed before we
211 * destroy cache.
212 */
213 rcu_barrier();
209 kmem_cache_destroy(exofs_inode_cachep); 214 kmem_cache_destroy(exofs_inode_cachep);
210} 215}
211 216
@@ -384,8 +389,6 @@ static int exofs_sync_fs(struct super_block *sb, int wait)
384 if (unlikely(ret)) 389 if (unlikely(ret))
385 goto out; 390 goto out;
386 391
387 lock_super(sb);
388
389 ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); 392 ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
390 memset(fscb, 0, ios->length); 393 memset(fscb, 0, ios->length);
391 fscb->s_nextid = cpu_to_le64(sbi->s_nextid); 394 fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
@@ -401,8 +404,6 @@ static int exofs_sync_fs(struct super_block *sb, int wait)
401 if (unlikely(ret)) 404 if (unlikely(ret))
402 EXOFS_ERR("%s: ore_write failed.\n", __func__); 405 EXOFS_ERR("%s: ore_write failed.\n", __func__);
403 406
404
405 unlock_super(sb);
406out: 407out:
407 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); 408 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
408 ore_put_io_state(ios); 409 ore_put_io_state(ios);
diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c
index 5a7b691e748b..1b4f2f95fc37 100644
--- a/fs/exofs/sys.c
+++ b/fs/exofs/sys.c
@@ -80,8 +80,13 @@ static ssize_t uri_show(struct exofs_dev *edp, char *buf)
80 80
81static ssize_t uri_store(struct exofs_dev *edp, const char *buf, size_t len) 81static ssize_t uri_store(struct exofs_dev *edp, const char *buf, size_t len)
82{ 82{
83 uint8_t *new_uri;
84
83 edp->urilen = strlen(buf) + 1; 85 edp->urilen = strlen(buf) + 1;
84 edp->uri = krealloc(edp->uri, edp->urilen, GFP_KERNEL); 86 new_uri = krealloc(edp->uri, edp->urilen, GFP_KERNEL);
87 if (new_uri == NULL)
88 return -ENOMEM;
89 edp->uri = new_uri;
85 strncpy(edp->uri, buf, edp->urilen); 90 strncpy(edp->uri, buf, edp->urilen);
86 return edp->urilen; 91 return edp->urilen;
87} 92}
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 35d6a3cfd9ff..110b6b371a4e 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -53,16 +53,23 @@ ext2_acl_from_disk(const void *value, size_t size)
53 case ACL_OTHER: 53 case ACL_OTHER:
54 value = (char *)value + 54 value = (char *)value +
55 sizeof(ext2_acl_entry_short); 55 sizeof(ext2_acl_entry_short);
56 acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
57 break; 56 break;
58 57
59 case ACL_USER: 58 case ACL_USER:
59 value = (char *)value + sizeof(ext2_acl_entry);
60 if ((char *)value > end)
61 goto fail;
62 acl->a_entries[n].e_uid =
63 make_kuid(&init_user_ns,
64 le32_to_cpu(entry->e_id));
65 break;
60 case ACL_GROUP: 66 case ACL_GROUP:
61 value = (char *)value + sizeof(ext2_acl_entry); 67 value = (char *)value + sizeof(ext2_acl_entry);
62 if ((char *)value > end) 68 if ((char *)value > end)
63 goto fail; 69 goto fail;
64 acl->a_entries[n].e_id = 70 acl->a_entries[n].e_gid =
65 le32_to_cpu(entry->e_id); 71 make_kgid(&init_user_ns,
72 le32_to_cpu(entry->e_id));
66 break; 73 break;
67 74
68 default: 75 default:
@@ -96,14 +103,19 @@ ext2_acl_to_disk(const struct posix_acl *acl, size_t *size)
96 ext_acl->a_version = cpu_to_le32(EXT2_ACL_VERSION); 103 ext_acl->a_version = cpu_to_le32(EXT2_ACL_VERSION);
97 e = (char *)ext_acl + sizeof(ext2_acl_header); 104 e = (char *)ext_acl + sizeof(ext2_acl_header);
98 for (n=0; n < acl->a_count; n++) { 105 for (n=0; n < acl->a_count; n++) {
106 const struct posix_acl_entry *acl_e = &acl->a_entries[n];
99 ext2_acl_entry *entry = (ext2_acl_entry *)e; 107 ext2_acl_entry *entry = (ext2_acl_entry *)e;
100 entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); 108 entry->e_tag = cpu_to_le16(acl_e->e_tag);
101 entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); 109 entry->e_perm = cpu_to_le16(acl_e->e_perm);
102 switch(acl->a_entries[n].e_tag) { 110 switch(acl_e->e_tag) {
103 case ACL_USER: 111 case ACL_USER:
112 entry->e_id = cpu_to_le32(
113 from_kuid(&init_user_ns, acl_e->e_uid));
114 e += sizeof(ext2_acl_entry);
115 break;
104 case ACL_GROUP: 116 case ACL_GROUP:
105 entry->e_id = 117 entry->e_id = cpu_to_le32(
106 cpu_to_le32(acl->a_entries[n].e_id); 118 from_kgid(&init_user_ns, acl_e->e_gid));
107 e += sizeof(ext2_acl_entry); 119 e += sizeof(ext2_acl_entry);
108 break; 120 break;
109 121
@@ -350,7 +362,7 @@ ext2_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
350 return PTR_ERR(acl); 362 return PTR_ERR(acl);
351 if (acl == NULL) 363 if (acl == NULL)
352 return -ENODATA; 364 return -ENODATA;
353 error = posix_acl_to_xattr(acl, buffer, size); 365 error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
354 posix_acl_release(acl); 366 posix_acl_release(acl);
355 367
356 return error; 368 return error;
@@ -371,7 +383,7 @@ ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
371 return -EPERM; 383 return -EPERM;
372 384
373 if (value) { 385 if (value) {
374 acl = posix_acl_from_xattr(value, size); 386 acl = posix_acl_from_xattr(&init_user_ns, value, size);
375 if (IS_ERR(acl)) 387 if (IS_ERR(acl))
376 return PTR_ERR(acl); 388 return PTR_ERR(acl);
377 else if (acl) { 389 else if (acl) {
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 376aa77f3ca7..2616d0ea5c5c 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -479,7 +479,7 @@ void ext2_discard_reservation(struct inode *inode)
479/** 479/**
480 * ext2_free_blocks() -- Free given blocks and update quota and i_blocks 480 * ext2_free_blocks() -- Free given blocks and update quota and i_blocks
481 * @inode: inode 481 * @inode: inode
482 * @block: start physcial block to free 482 * @block: start physical block to free
483 * @count: number of blocks to free 483 * @count: number of blocks to free
484 */ 484 */
485void ext2_free_blocks (struct inode * inode, unsigned long block, 485void ext2_free_blocks (struct inode * inode, unsigned long block,
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index af74d9e27b71..fa04d023177e 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -206,6 +206,11 @@ static int init_inodecache(void)
206 206
207static void destroy_inodecache(void) 207static void destroy_inodecache(void)
208{ 208{
209 /*
210 * Make sure all delayed rcu free inodes are flushed before we
211 * destroy cache.
212 */
213 rcu_barrier();
209 kmem_cache_destroy(ext2_inode_cachep); 214 kmem_cache_destroy(ext2_inode_cachep);
210} 215}
211 216
@@ -464,7 +469,7 @@ static int parse_options(char *options, struct super_block *sb)
464 uid = make_kuid(current_user_ns(), option); 469 uid = make_kuid(current_user_ns(), option);
465 if (!uid_valid(uid)) { 470 if (!uid_valid(uid)) {
466 ext2_msg(sb, KERN_ERR, "Invalid uid value %d", option); 471 ext2_msg(sb, KERN_ERR, "Invalid uid value %d", option);
467 return -1; 472 return 0;
468 473
469 } 474 }
470 sbi->s_resuid = uid; 475 sbi->s_resuid = uid;
@@ -475,7 +480,7 @@ static int parse_options(char *options, struct super_block *sb)
475 gid = make_kgid(current_user_ns(), option); 480 gid = make_kgid(current_user_ns(), option);
476 if (!gid_valid(gid)) { 481 if (!gid_valid(gid)) {
477 ext2_msg(sb, KERN_ERR, "Invalid gid value %d", option); 482 ext2_msg(sb, KERN_ERR, "Invalid gid value %d", option);
478 return -1; 483 return 0;
479 } 484 }
480 sbi->s_resgid = gid; 485 sbi->s_resgid = gid;
481 break; 486 break;
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index c76832c8d192..dbb5ad59a7fc 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -48,16 +48,23 @@ ext3_acl_from_disk(const void *value, size_t size)
48 case ACL_OTHER: 48 case ACL_OTHER:
49 value = (char *)value + 49 value = (char *)value +
50 sizeof(ext3_acl_entry_short); 50 sizeof(ext3_acl_entry_short);
51 acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
52 break; 51 break;
53 52
54 case ACL_USER: 53 case ACL_USER:
54 value = (char *)value + sizeof(ext3_acl_entry);
55 if ((char *)value > end)
56 goto fail;
57 acl->a_entries[n].e_uid =
58 make_kuid(&init_user_ns,
59 le32_to_cpu(entry->e_id));
60 break;
55 case ACL_GROUP: 61 case ACL_GROUP:
56 value = (char *)value + sizeof(ext3_acl_entry); 62 value = (char *)value + sizeof(ext3_acl_entry);
57 if ((char *)value > end) 63 if ((char *)value > end)
58 goto fail; 64 goto fail;
59 acl->a_entries[n].e_id = 65 acl->a_entries[n].e_gid =
60 le32_to_cpu(entry->e_id); 66 make_kgid(&init_user_ns,
67 le32_to_cpu(entry->e_id));
61 break; 68 break;
62 69
63 default: 70 default:
@@ -91,14 +98,19 @@ ext3_acl_to_disk(const struct posix_acl *acl, size_t *size)
91 ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION); 98 ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION);
92 e = (char *)ext_acl + sizeof(ext3_acl_header); 99 e = (char *)ext_acl + sizeof(ext3_acl_header);
93 for (n=0; n < acl->a_count; n++) { 100 for (n=0; n < acl->a_count; n++) {
101 const struct posix_acl_entry *acl_e = &acl->a_entries[n];
94 ext3_acl_entry *entry = (ext3_acl_entry *)e; 102 ext3_acl_entry *entry = (ext3_acl_entry *)e;
95 entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); 103 entry->e_tag = cpu_to_le16(acl_e->e_tag);
96 entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); 104 entry->e_perm = cpu_to_le16(acl_e->e_perm);
97 switch(acl->a_entries[n].e_tag) { 105 switch(acl_e->e_tag) {
98 case ACL_USER: 106 case ACL_USER:
107 entry->e_id = cpu_to_le32(
108 from_kuid(&init_user_ns, acl_e->e_uid));
109 e += sizeof(ext3_acl_entry);
110 break;
99 case ACL_GROUP: 111 case ACL_GROUP:
100 entry->e_id = 112 entry->e_id = cpu_to_le32(
101 cpu_to_le32(acl->a_entries[n].e_id); 113 from_kgid(&init_user_ns, acl_e->e_gid));
102 e += sizeof(ext3_acl_entry); 114 e += sizeof(ext3_acl_entry);
103 break; 115 break;
104 116
@@ -369,7 +381,7 @@ ext3_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
369 return PTR_ERR(acl); 381 return PTR_ERR(acl);
370 if (acl == NULL) 382 if (acl == NULL)
371 return -ENODATA; 383 return -ENODATA;
372 error = posix_acl_to_xattr(acl, buffer, size); 384 error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
373 posix_acl_release(acl); 385 posix_acl_release(acl);
374 386
375 return error; 387 return error;
@@ -392,7 +404,7 @@ ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
392 return -EPERM; 404 return -EPERM;
393 405
394 if (value) { 406 if (value) {
395 acl = posix_acl_from_xattr(value, size); 407 acl = posix_acl_from_xattr(&init_user_ns, value, size);
396 if (IS_ERR(acl)) 408 if (IS_ERR(acl))
397 return PTR_ERR(acl); 409 return PTR_ERR(acl);
398 else if (acl) { 410 else if (acl) {
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 90d901f0486b..7320a66e958f 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -483,7 +483,7 @@ void ext3_discard_reservation(struct inode *inode)
483 * ext3_free_blocks_sb() -- Free given blocks and update quota 483 * ext3_free_blocks_sb() -- Free given blocks and update quota
484 * @handle: handle to this transaction 484 * @handle: handle to this transaction
485 * @sb: super block 485 * @sb: super block
486 * @block: start physcial block to free 486 * @block: start physical block to free
487 * @count: number of blocks to free 487 * @count: number of blocks to free
488 * @pdquot_freed_blocks: pointer to quota 488 * @pdquot_freed_blocks: pointer to quota
489 */ 489 */
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index a07597307fd1..7e87e37a372a 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3072,6 +3072,8 @@ static int ext3_do_update_inode(handle_t *handle,
3072 struct ext3_inode_info *ei = EXT3_I(inode); 3072 struct ext3_inode_info *ei = EXT3_I(inode);
3073 struct buffer_head *bh = iloc->bh; 3073 struct buffer_head *bh = iloc->bh;
3074 int err = 0, rc, block; 3074 int err = 0, rc, block;
3075 int need_datasync = 0;
3076 __le32 disksize;
3075 uid_t i_uid; 3077 uid_t i_uid;
3076 gid_t i_gid; 3078 gid_t i_gid;
3077 3079
@@ -3113,7 +3115,11 @@ again:
3113 raw_inode->i_gid_high = 0; 3115 raw_inode->i_gid_high = 0;
3114 } 3116 }
3115 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 3117 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
3116 raw_inode->i_size = cpu_to_le32(ei->i_disksize); 3118 disksize = cpu_to_le32(ei->i_disksize);
3119 if (disksize != raw_inode->i_size) {
3120 need_datasync = 1;
3121 raw_inode->i_size = disksize;
3122 }
3117 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); 3123 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
3118 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 3124 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
3119 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); 3125 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
@@ -3129,8 +3135,11 @@ again:
3129 if (!S_ISREG(inode->i_mode)) { 3135 if (!S_ISREG(inode->i_mode)) {
3130 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); 3136 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
3131 } else { 3137 } else {
3132 raw_inode->i_size_high = 3138 disksize = cpu_to_le32(ei->i_disksize >> 32);
3133 cpu_to_le32(ei->i_disksize >> 32); 3139 if (disksize != raw_inode->i_size_high) {
3140 raw_inode->i_size_high = disksize;
3141 need_datasync = 1;
3142 }
3134 if (ei->i_disksize > 0x7fffffffULL) { 3143 if (ei->i_disksize > 0x7fffffffULL) {
3135 struct super_block *sb = inode->i_sb; 3144 struct super_block *sb = inode->i_sb;
3136 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, 3145 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
@@ -3183,6 +3192,8 @@ again:
3183 ext3_clear_inode_state(inode, EXT3_STATE_NEW); 3192 ext3_clear_inode_state(inode, EXT3_STATE_NEW);
3184 3193
3185 atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); 3194 atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
3195 if (need_datasync)
3196 atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
3186out_brelse: 3197out_brelse:
3187 brelse (bh); 3198 brelse (bh);
3188 ext3_std_error(inode->i_sb, err); 3199 ext3_std_error(inode->i_sb, err);
@@ -3196,7 +3207,7 @@ out_brelse:
3196 * 3207 *
3197 * - Within generic_file_write() for O_SYNC files. 3208 * - Within generic_file_write() for O_SYNC files.
3198 * Here, there will be no transaction running. We wait for any running 3209 * Here, there will be no transaction running. We wait for any running
3199 * trasnaction to commit. 3210 * transaction to commit.
3200 * 3211 *
3201 * - Within sys_sync(), kupdate and such. 3212 * - Within sys_sync(), kupdate and such.
3202 * We wait on commit, if tol to. 3213 * We wait on commit, if tol to.
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 8f4fddac01a6..890b8947c546 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -46,8 +46,7 @@ static struct buffer_head *ext3_append(handle_t *handle,
46 46
47 *block = inode->i_size >> inode->i_sb->s_blocksize_bits; 47 *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
48 48
49 bh = ext3_bread(handle, inode, *block, 1, err); 49 if ((bh = ext3_dir_bread(handle, inode, *block, 1, err))) {
50 if (bh) {
51 inode->i_size += inode->i_sb->s_blocksize; 50 inode->i_size += inode->i_sb->s_blocksize;
52 EXT3_I(inode)->i_disksize = inode->i_size; 51 EXT3_I(inode)->i_disksize = inode->i_size;
53 *err = ext3_journal_get_write_access(handle, bh); 52 *err = ext3_journal_get_write_access(handle, bh);
@@ -339,8 +338,10 @@ dx_probe(struct qstr *entry, struct inode *dir,
339 u32 hash; 338 u32 hash;
340 339
341 frame->bh = NULL; 340 frame->bh = NULL;
342 if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) 341 if (!(bh = ext3_dir_bread(NULL, dir, 0, 0, err))) {
342 *err = ERR_BAD_DX_DIR;
343 goto fail; 343 goto fail;
344 }
344 root = (struct dx_root *) bh->b_data; 345 root = (struct dx_root *) bh->b_data;
345 if (root->info.hash_version != DX_HASH_TEA && 346 if (root->info.hash_version != DX_HASH_TEA &&
346 root->info.hash_version != DX_HASH_HALF_MD4 && 347 root->info.hash_version != DX_HASH_HALF_MD4 &&
@@ -436,8 +437,10 @@ dx_probe(struct qstr *entry, struct inode *dir,
436 frame->entries = entries; 437 frame->entries = entries;
437 frame->at = at; 438 frame->at = at;
438 if (!indirect--) return frame; 439 if (!indirect--) return frame;
439 if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) 440 if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(at), 0, err))) {
441 *err = ERR_BAD_DX_DIR;
440 goto fail2; 442 goto fail2;
443 }
441 at = entries = ((struct dx_node *) bh->b_data)->entries; 444 at = entries = ((struct dx_node *) bh->b_data)->entries;
442 if (dx_get_limit(entries) != dx_node_limit (dir)) { 445 if (dx_get_limit(entries) != dx_node_limit (dir)) {
443 ext3_warning(dir->i_sb, __func__, 446 ext3_warning(dir->i_sb, __func__,
@@ -535,8 +538,8 @@ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
535 * block so no check is necessary 538 * block so no check is necessary
536 */ 539 */
537 while (num_frames--) { 540 while (num_frames--) {
538 if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), 541 if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(p->at),
539 0, &err))) 542 0, &err)))
540 return err; /* Failure */ 543 return err; /* Failure */
541 p++; 544 p++;
542 brelse (p->bh); 545 brelse (p->bh);
@@ -559,10 +562,11 @@ static int htree_dirblock_to_tree(struct file *dir_file,
559{ 562{
560 struct buffer_head *bh; 563 struct buffer_head *bh;
561 struct ext3_dir_entry_2 *de, *top; 564 struct ext3_dir_entry_2 *de, *top;
562 int err, count = 0; 565 int err = 0, count = 0;
563 566
564 dxtrace(printk("In htree dirblock_to_tree: block %d\n", block)); 567 dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
565 if (!(bh = ext3_bread (NULL, dir, block, 0, &err))) 568
569 if (!(bh = ext3_dir_bread(NULL, dir, block, 0, &err)))
566 return err; 570 return err;
567 571
568 de = (struct ext3_dir_entry_2 *) bh->b_data; 572 de = (struct ext3_dir_entry_2 *) bh->b_data;
@@ -976,7 +980,7 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
976 return NULL; 980 return NULL;
977 do { 981 do {
978 block = dx_get_block(frame->at); 982 block = dx_get_block(frame->at);
979 if (!(bh = ext3_bread (NULL,dir, block, 0, err))) 983 if (!(bh = ext3_dir_bread (NULL, dir, block, 0, err)))
980 goto errout; 984 goto errout;
981 985
982 retval = search_dirblock(bh, dir, entry, 986 retval = search_dirblock(bh, dir, entry,
@@ -1458,9 +1462,9 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
1458 } 1462 }
1459 blocks = dir->i_size >> sb->s_blocksize_bits; 1463 blocks = dir->i_size >> sb->s_blocksize_bits;
1460 for (block = 0; block < blocks; block++) { 1464 for (block = 0; block < blocks; block++) {
1461 bh = ext3_bread(handle, dir, block, 0, &retval); 1465 if (!(bh = ext3_dir_bread(handle, dir, block, 0, &retval)))
1462 if(!bh)
1463 return retval; 1466 return retval;
1467
1464 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); 1468 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1465 if (retval != -ENOSPC) 1469 if (retval != -ENOSPC)
1466 return retval; 1470 return retval;
@@ -1500,7 +1504,7 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
1500 entries = frame->entries; 1504 entries = frame->entries;
1501 at = frame->at; 1505 at = frame->at;
1502 1506
1503 if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) 1507 if (!(bh = ext3_dir_bread(handle, dir, dx_get_block(frame->at), 0, &err)))
1504 goto cleanup; 1508 goto cleanup;
1505 1509
1506 BUFFER_TRACE(bh, "get_write_access"); 1510 BUFFER_TRACE(bh, "get_write_access");
@@ -1790,8 +1794,7 @@ retry:
1790 inode->i_op = &ext3_dir_inode_operations; 1794 inode->i_op = &ext3_dir_inode_operations;
1791 inode->i_fop = &ext3_dir_operations; 1795 inode->i_fop = &ext3_dir_operations;
1792 inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; 1796 inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1793 dir_block = ext3_bread (handle, inode, 0, 1, &err); 1797 if (!(dir_block = ext3_dir_bread(handle, inode, 0, 1, &err)))
1794 if (!dir_block)
1795 goto out_clear_inode; 1798 goto out_clear_inode;
1796 1799
1797 BUFFER_TRACE(dir_block, "get_write_access"); 1800 BUFFER_TRACE(dir_block, "get_write_access");
@@ -1859,7 +1862,7 @@ static int empty_dir (struct inode * inode)
1859 1862
1860 sb = inode->i_sb; 1863 sb = inode->i_sb;
1861 if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) || 1864 if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) ||
1862 !(bh = ext3_bread (NULL, inode, 0, 0, &err))) { 1865 !(bh = ext3_dir_bread(NULL, inode, 0, 0, &err))) {
1863 if (err) 1866 if (err)
1864 ext3_error(inode->i_sb, __func__, 1867 ext3_error(inode->i_sb, __func__,
1865 "error %d reading directory #%lu offset 0", 1868 "error %d reading directory #%lu offset 0",
@@ -1890,9 +1893,8 @@ static int empty_dir (struct inode * inode)
1890 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { 1893 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
1891 err = 0; 1894 err = 0;
1892 brelse (bh); 1895 brelse (bh);
1893 bh = ext3_bread (NULL, inode, 1896 if (!(bh = ext3_dir_bread (NULL, inode,
1894 offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err); 1897 offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err))) {
1895 if (!bh) {
1896 if (err) 1898 if (err)
1897 ext3_error(sb, __func__, 1899 ext3_error(sb, __func__,
1898 "error %d reading directory" 1900 "error %d reading directory"
@@ -2388,7 +2390,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2388 goto end_rename; 2390 goto end_rename;
2389 } 2391 }
2390 retval = -EIO; 2392 retval = -EIO;
2391 dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval); 2393 dir_bh = ext3_dir_bread(handle, old_inode, 0, 0, &retval);
2392 if (!dir_bh) 2394 if (!dir_bh)
2393 goto end_rename; 2395 goto end_rename;
2394 if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) 2396 if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
diff --git a/fs/ext3/namei.h b/fs/ext3/namei.h
index f2ce2b0065c9..46304d8c9f0a 100644
--- a/fs/ext3/namei.h
+++ b/fs/ext3/namei.h
@@ -6,3 +6,22 @@
6*/ 6*/
7 7
8extern struct dentry *ext3_get_parent(struct dentry *child); 8extern struct dentry *ext3_get_parent(struct dentry *child);
9
10static inline struct buffer_head *ext3_dir_bread(handle_t *handle,
11 struct inode *inode,
12 int block, int create,
13 int *err)
14{
15 struct buffer_head *bh;
16
17 bh = ext3_bread(handle, inode, block, create, err);
18
19 if (!bh && !(*err)) {
20 *err = -EIO;
21 ext3_error(inode->i_sb, __func__,
22 "Directory hole detected on inode %lu\n",
23 inode->i_ino);
24 return NULL;
25 }
26 return bh;
27}
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 8c892e93d8e7..5366393528df 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -532,6 +532,11 @@ static int init_inodecache(void)
532 532
533static void destroy_inodecache(void) 533static void destroy_inodecache(void)
534{ 534{
535 /*
536 * Make sure all delayed rcu free inodes are flushed before we
537 * destroy cache.
538 */
539 rcu_barrier();
535 kmem_cache_destroy(ext3_inode_cachep); 540 kmem_cache_destroy(ext3_inode_cachep);
536} 541}
537 542
@@ -975,7 +980,7 @@ static int parse_options (char *options, struct super_block *sb,
975 * Initialize args struct so we know whether arg was 980 * Initialize args struct so we know whether arg was
976 * found; some options take optional arguments. 981 * found; some options take optional arguments.
977 */ 982 */
978 args[0].to = args[0].from = 0; 983 args[0].to = args[0].from = NULL;
979 token = match_token(p, tokens, args); 984 token = match_token(p, tokens, args);
980 switch (token) { 985 switch (token) {
981 case Opt_bsd_df: 986 case Opt_bsd_df:
@@ -996,7 +1001,7 @@ static int parse_options (char *options, struct super_block *sb,
996 uid = make_kuid(current_user_ns(), option); 1001 uid = make_kuid(current_user_ns(), option);
997 if (!uid_valid(uid)) { 1002 if (!uid_valid(uid)) {
998 ext3_msg(sb, KERN_ERR, "Invalid uid value %d", option); 1003 ext3_msg(sb, KERN_ERR, "Invalid uid value %d", option);
999 return -1; 1004 return 0;
1000 1005
1001 } 1006 }
1002 sbi->s_resuid = uid; 1007 sbi->s_resuid = uid;
@@ -1007,7 +1012,7 @@ static int parse_options (char *options, struct super_block *sb,
1007 gid = make_kgid(current_user_ns(), option); 1012 gid = make_kgid(current_user_ns(), option);
1008 if (!gid_valid(gid)) { 1013 if (!gid_valid(gid)) {
1009 ext3_msg(sb, KERN_ERR, "Invalid gid value %d", option); 1014 ext3_msg(sb, KERN_ERR, "Invalid gid value %d", option);
1010 return -1; 1015 return 0;
1011 } 1016 }
1012 sbi->s_resgid = gid; 1017 sbi->s_resgid = gid;
1013 break; 1018 break;
@@ -1479,10 +1484,12 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1479 } 1484 }
1480 1485
1481 if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) { 1486 if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) {
1482 if (es->s_last_orphan) 1487 /* don't clear list on RO mount w/ errors */
1488 if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
1483 jbd_debug(1, "Errors on filesystem, " 1489 jbd_debug(1, "Errors on filesystem, "
1484 "clearing orphan list.\n"); 1490 "clearing orphan list.\n");
1485 es->s_last_orphan = 0; 1491 es->s_last_orphan = 0;
1492 }
1486 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); 1493 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
1487 return; 1494 return;
1488 } 1495 }
@@ -2571,11 +2578,9 @@ out:
2571static int ext3_unfreeze(struct super_block *sb) 2578static int ext3_unfreeze(struct super_block *sb)
2572{ 2579{
2573 if (!(sb->s_flags & MS_RDONLY)) { 2580 if (!(sb->s_flags & MS_RDONLY)) {
2574 lock_super(sb);
2575 /* Reser the needs_recovery flag before the fs is unlocked. */ 2581 /* Reser the needs_recovery flag before the fs is unlocked. */
2576 EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); 2582 EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2577 ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); 2583 ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
2578 unlock_super(sb);
2579 journal_unlock_updates(EXT3_SB(sb)->s_journal); 2584 journal_unlock_updates(EXT3_SB(sb)->s_journal);
2580 } 2585 }
2581 return 0; 2586 return 0;
@@ -2595,7 +2600,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2595#endif 2600#endif
2596 2601
2597 /* Store the original options */ 2602 /* Store the original options */
2598 lock_super(sb);
2599 old_sb_flags = sb->s_flags; 2603 old_sb_flags = sb->s_flags;
2600 old_opts.s_mount_opt = sbi->s_mount_opt; 2604 old_opts.s_mount_opt = sbi->s_mount_opt;
2601 old_opts.s_resuid = sbi->s_resuid; 2605 old_opts.s_resuid = sbi->s_resuid;
@@ -2701,8 +2705,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2701 old_opts.s_qf_names[i] != sbi->s_qf_names[i]) 2705 old_opts.s_qf_names[i] != sbi->s_qf_names[i])
2702 kfree(old_opts.s_qf_names[i]); 2706 kfree(old_opts.s_qf_names[i]);
2703#endif 2707#endif
2704 unlock_super(sb);
2705
2706 if (enable_quota) 2708 if (enable_quota)
2707 dquot_resume(sb, -1); 2709 dquot_resume(sb, -1);
2708 return 0; 2710 return 0;
@@ -2721,7 +2723,6 @@ restore_opts:
2721 sbi->s_qf_names[i] = old_opts.s_qf_names[i]; 2723 sbi->s_qf_names[i] = old_opts.s_qf_names[i];
2722 } 2724 }
2723#endif 2725#endif
2724 unlock_super(sb);
2725 return err; 2726 return err;
2726} 2727}
2727 2728
@@ -2803,7 +2804,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
2803 2804
2804static inline struct inode *dquot_to_inode(struct dquot *dquot) 2805static inline struct inode *dquot_to_inode(struct dquot *dquot)
2805{ 2806{
2806 return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; 2807 return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
2807} 2808}
2808 2809
2809static int ext3_write_dquot(struct dquot *dquot) 2810static int ext3_write_dquot(struct dquot *dquot)
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index a5c29bb3b835..d3c5b88fd89f 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -55,16 +55,23 @@ ext4_acl_from_disk(const void *value, size_t size)
55 case ACL_OTHER: 55 case ACL_OTHER:
56 value = (char *)value + 56 value = (char *)value +
57 sizeof(ext4_acl_entry_short); 57 sizeof(ext4_acl_entry_short);
58 acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
59 break; 58 break;
60 59
61 case ACL_USER: 60 case ACL_USER:
61 value = (char *)value + sizeof(ext4_acl_entry);
62 if ((char *)value > end)
63 goto fail;
64 acl->a_entries[n].e_uid =
65 make_kuid(&init_user_ns,
66 le32_to_cpu(entry->e_id));
67 break;
62 case ACL_GROUP: 68 case ACL_GROUP:
63 value = (char *)value + sizeof(ext4_acl_entry); 69 value = (char *)value + sizeof(ext4_acl_entry);
64 if ((char *)value > end) 70 if ((char *)value > end)
65 goto fail; 71 goto fail;
66 acl->a_entries[n].e_id = 72 acl->a_entries[n].e_gid =
67 le32_to_cpu(entry->e_id); 73 make_kgid(&init_user_ns,
74 le32_to_cpu(entry->e_id));
68 break; 75 break;
69 76
70 default: 77 default:
@@ -98,13 +105,19 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
98 ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); 105 ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION);
99 e = (char *)ext_acl + sizeof(ext4_acl_header); 106 e = (char *)ext_acl + sizeof(ext4_acl_header);
100 for (n = 0; n < acl->a_count; n++) { 107 for (n = 0; n < acl->a_count; n++) {
108 const struct posix_acl_entry *acl_e = &acl->a_entries[n];
101 ext4_acl_entry *entry = (ext4_acl_entry *)e; 109 ext4_acl_entry *entry = (ext4_acl_entry *)e;
102 entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); 110 entry->e_tag = cpu_to_le16(acl_e->e_tag);
103 entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); 111 entry->e_perm = cpu_to_le16(acl_e->e_perm);
104 switch (acl->a_entries[n].e_tag) { 112 switch (acl_e->e_tag) {
105 case ACL_USER: 113 case ACL_USER:
114 entry->e_id = cpu_to_le32(
115 from_kuid(&init_user_ns, acl_e->e_uid));
116 e += sizeof(ext4_acl_entry);
117 break;
106 case ACL_GROUP: 118 case ACL_GROUP:
107 entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); 119 entry->e_id = cpu_to_le32(
120 from_kgid(&init_user_ns, acl_e->e_gid));
108 e += sizeof(ext4_acl_entry); 121 e += sizeof(ext4_acl_entry);
109 break; 122 break;
110 123
@@ -374,7 +387,7 @@ ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
374 return PTR_ERR(acl); 387 return PTR_ERR(acl);
375 if (acl == NULL) 388 if (acl == NULL)
376 return -ENODATA; 389 return -ENODATA;
377 error = posix_acl_to_xattr(acl, buffer, size); 390 error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
378 posix_acl_release(acl); 391 posix_acl_release(acl);
379 392
380 return error; 393 return error;
@@ -397,7 +410,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
397 return -EPERM; 410 return -EPERM;
398 411
399 if (value) { 412 if (value) {
400 acl = posix_acl_from_xattr(value, size); 413 acl = posix_acl_from_xattr(&init_user_ns, value, size);
401 if (IS_ERR(acl)) 414 if (IS_ERR(acl))
402 return PTR_ERR(acl); 415 return PTR_ERR(acl);
403 else if (acl) { 416 else if (acl) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c3411d4ce2da..3ab2539b7b2e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -186,7 +186,6 @@ struct mpage_da_data {
186#define EXT4_IO_END_ERROR 0x0002 186#define EXT4_IO_END_ERROR 0x0002
187#define EXT4_IO_END_QUEUED 0x0004 187#define EXT4_IO_END_QUEUED 0x0004
188#define EXT4_IO_END_DIRECT 0x0008 188#define EXT4_IO_END_DIRECT 0x0008
189#define EXT4_IO_END_IN_FSYNC 0x0010
190 189
191struct ext4_io_page { 190struct ext4_io_page {
192 struct page *p_page; 191 struct page *p_page;
@@ -912,9 +911,7 @@ struct ext4_inode_info {
912 struct list_head i_completed_io_list; 911 struct list_head i_completed_io_list;
913 spinlock_t i_completed_io_lock; 912 spinlock_t i_completed_io_lock;
914 atomic_t i_ioend_count; /* Number of outstanding io_end structs */ 913 atomic_t i_ioend_count; /* Number of outstanding io_end structs */
915 /* current io_end structure for async DIO write*/ 914 atomic_t i_unwritten; /* Nr. of inflight conversions pending */
916 ext4_io_end_t *cur_aio_dio;
917 atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
918 915
919 spinlock_t i_block_reservation_lock; 916 spinlock_t i_block_reservation_lock;
920 917
@@ -1233,6 +1230,7 @@ struct ext4_sb_info {
1233 spinlock_t s_md_lock; 1230 spinlock_t s_md_lock;
1234 unsigned short *s_mb_offsets; 1231 unsigned short *s_mb_offsets;
1235 unsigned int *s_mb_maxs; 1232 unsigned int *s_mb_maxs;
1233 unsigned int s_group_info_size;
1236 1234
1237 /* tunables */ 1235 /* tunables */
1238 unsigned long s_stripe; 1236 unsigned long s_stripe;
@@ -1243,6 +1241,7 @@ struct ext4_sb_info {
1243 unsigned int s_mb_order2_reqs; 1241 unsigned int s_mb_order2_reqs;
1244 unsigned int s_mb_group_prealloc; 1242 unsigned int s_mb_group_prealloc;
1245 unsigned int s_max_writeback_mb_bump; 1243 unsigned int s_max_writeback_mb_bump;
1244 unsigned int s_max_dir_size_kb;
1246 /* where last allocation was done - for stream allocation */ 1245 /* where last allocation was done - for stream allocation */
1247 unsigned long s_mb_last_group; 1246 unsigned long s_mb_last_group;
1248 unsigned long s_mb_last_start; 1247 unsigned long s_mb_last_start;
@@ -1270,8 +1269,12 @@ struct ext4_sb_info {
1270 unsigned long s_sectors_written_start; 1269 unsigned long s_sectors_written_start;
1271 u64 s_kbytes_written; 1270 u64 s_kbytes_written;
1272 1271
1272 /* the size of zero-out chunk */
1273 unsigned int s_extent_max_zeroout_kb;
1274
1273 unsigned int s_log_groups_per_flex; 1275 unsigned int s_log_groups_per_flex;
1274 struct flex_groups *s_flex_groups; 1276 struct flex_groups *s_flex_groups;
1277 ext4_group_t s_flex_groups_allocated;
1275 1278
1276 /* workqueue for dio unwritten */ 1279 /* workqueue for dio unwritten */
1277 struct workqueue_struct *dio_unwritten_wq; 1280 struct workqueue_struct *dio_unwritten_wq;
@@ -1328,10 +1331,20 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
1328{ 1331{
1329 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 1332 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
1330 io_end->flag |= EXT4_IO_END_UNWRITTEN; 1333 io_end->flag |= EXT4_IO_END_UNWRITTEN;
1331 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); 1334 atomic_inc(&EXT4_I(inode)->i_unwritten);
1332 } 1335 }
1333} 1336}
1334 1337
1338static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
1339{
1340 return inode->i_private;
1341}
1342
1343static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
1344{
1345 inode->i_private = io;
1346}
1347
1335/* 1348/*
1336 * Inode dynamic state flags 1349 * Inode dynamic state flags
1337 */ 1350 */
@@ -1345,6 +1358,8 @@ enum {
1345 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ 1358 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
1346 EXT4_STATE_NEWENTRY, /* File just added to dir */ 1359 EXT4_STATE_NEWENTRY, /* File just added to dir */
1347 EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ 1360 EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */
1361 EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
1362 nolocking */
1348}; 1363};
1349 1364
1350#define EXT4_INODE_BIT_FNS(name, field, offset) \ 1365#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1932,7 +1947,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p);
1932 1947
1933/* fsync.c */ 1948/* fsync.c */
1934extern int ext4_sync_file(struct file *, loff_t, loff_t, int); 1949extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
1935extern int ext4_flush_completed_IO(struct inode *); 1950extern int ext4_flush_unwritten_io(struct inode *);
1936 1951
1937/* hash.c */ 1952/* hash.c */
1938extern int ext4fs_dirhash(const char *name, int len, struct 1953extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1966,6 +1981,8 @@ extern void ext4_exit_mballoc(void);
1966extern void ext4_free_blocks(handle_t *handle, struct inode *inode, 1981extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
1967 struct buffer_head *bh, ext4_fsblk_t block, 1982 struct buffer_head *bh, ext4_fsblk_t block,
1968 unsigned long count, int flags); 1983 unsigned long count, int flags);
1984extern int ext4_mb_alloc_groupinfo(struct super_block *sb,
1985 ext4_group_t ngroups);
1969extern int ext4_mb_add_groupinfo(struct super_block *sb, 1986extern int ext4_mb_add_groupinfo(struct super_block *sb,
1970 ext4_group_t i, struct ext4_group_desc *desc); 1987 ext4_group_t i, struct ext4_group_desc *desc);
1971extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, 1988extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
@@ -2051,6 +2068,8 @@ extern void ext4_superblock_csum_set(struct super_block *sb,
2051extern void *ext4_kvmalloc(size_t size, gfp_t flags); 2068extern void *ext4_kvmalloc(size_t size, gfp_t flags);
2052extern void *ext4_kvzalloc(size_t size, gfp_t flags); 2069extern void *ext4_kvzalloc(size_t size, gfp_t flags);
2053extern void ext4_kvfree(void *ptr); 2070extern void ext4_kvfree(void *ptr);
2071extern int ext4_alloc_flex_bg_array(struct super_block *sb,
2072 ext4_group_t ngroup);
2054extern __printf(4, 5) 2073extern __printf(4, 5)
2055void __ext4_error(struct super_block *, const char *, unsigned int, 2074void __ext4_error(struct super_block *, const char *, unsigned int,
2056 const char *, ...); 2075 const char *, ...);
@@ -2352,6 +2371,7 @@ extern const struct file_operations ext4_dir_operations;
2352extern const struct inode_operations ext4_file_inode_operations; 2371extern const struct inode_operations ext4_file_inode_operations;
2353extern const struct file_operations ext4_file_operations; 2372extern const struct file_operations ext4_file_operations;
2354extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); 2373extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
2374extern void ext4_unwritten_wait(struct inode *inode);
2355 2375
2356/* namei.c */ 2376/* namei.c */
2357extern const struct inode_operations ext4_dir_inode_operations; 2377extern const struct inode_operations ext4_dir_inode_operations;
@@ -2400,11 +2420,11 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
2400 2420
2401/* page-io.c */ 2421/* page-io.c */
2402extern int __init ext4_init_pageio(void); 2422extern int __init ext4_init_pageio(void);
2423extern void ext4_add_complete_io(ext4_io_end_t *io_end);
2403extern void ext4_exit_pageio(void); 2424extern void ext4_exit_pageio(void);
2404extern void ext4_ioend_wait(struct inode *); 2425extern void ext4_ioend_wait(struct inode *);
2405extern void ext4_free_io_end(ext4_io_end_t *io); 2426extern void ext4_free_io_end(ext4_io_end_t *io);
2406extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); 2427extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
2407extern int ext4_end_io_nolock(ext4_io_end_t *io);
2408extern void ext4_io_submit(struct ext4_io_submit *io); 2428extern void ext4_io_submit(struct ext4_io_submit *io);
2409extern int ext4_bio_write_page(struct ext4_io_submit *io, 2429extern int ext4_bio_write_page(struct ext4_io_submit *io,
2410 struct page *page, 2430 struct page *page,
@@ -2452,6 +2472,21 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
2452 set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); 2472 set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
2453} 2473}
2454 2474
2475/*
2476 * Disable DIO read nolock optimization, so new dioreaders will be forced
2477 * to grab i_mutex
2478 */
2479static inline void ext4_inode_block_unlocked_dio(struct inode *inode)
2480{
2481 ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
2482 smp_mb();
2483}
2484static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
2485{
2486 smp_mb();
2487 ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
2488}
2489
2455#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 2490#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
2456 2491
2457/* For ioend & aio unwritten conversion wait queues */ 2492/* For ioend & aio unwritten conversion wait queues */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index aabbb3f53683..1c94cca35ed1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1177,7 +1177,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1177 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), 1177 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1178 ext4_idx_pblock(EXT_FIRST_INDEX(neh))); 1178 ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1179 1179
1180 neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1); 1180 le16_add_cpu(&neh->eh_depth, 1);
1181 ext4_mark_inode_dirty(handle, inode); 1181 ext4_mark_inode_dirty(handle, inode);
1182out: 1182out:
1183 brelse(bh); 1183 brelse(bh);
@@ -1656,16 +1656,60 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
1656} 1656}
1657 1657
1658/* 1658/*
1659 * This function does a very simple check to see if we can collapse
1660 * an extent tree with a single extent tree leaf block into the inode.
1661 */
1662static void ext4_ext_try_to_merge_up(handle_t *handle,
1663 struct inode *inode,
1664 struct ext4_ext_path *path)
1665{
1666 size_t s;
1667 unsigned max_root = ext4_ext_space_root(inode, 0);
1668 ext4_fsblk_t blk;
1669
1670 if ((path[0].p_depth != 1) ||
1671 (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
1672 (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
1673 return;
1674
1675 /*
1676 * We need to modify the block allocation bitmap and the block
1677 * group descriptor to release the extent tree block. If we
1678 * can't get the journal credits, give up.
1679 */
1680 if (ext4_journal_extend(handle, 2))
1681 return;
1682
1683 /*
1684 * Copy the extent data up to the inode
1685 */
1686 blk = ext4_idx_pblock(path[0].p_idx);
1687 s = le16_to_cpu(path[1].p_hdr->eh_entries) *
1688 sizeof(struct ext4_extent_idx);
1689 s += sizeof(struct ext4_extent_header);
1690
1691 memcpy(path[0].p_hdr, path[1].p_hdr, s);
1692 path[0].p_depth = 0;
1693 path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
1694 (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
1695 path[0].p_hdr->eh_max = cpu_to_le16(max_root);
1696
1697 brelse(path[1].p_bh);
1698 ext4_free_blocks(handle, inode, NULL, blk, 1,
1699 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1700}
1701
1702/*
1659 * This function tries to merge the @ex extent to neighbours in the tree. 1703 * This function tries to merge the @ex extent to neighbours in the tree.
1660 * return 1 if merge left else 0. 1704 * return 1 if merge left else 0.
1661 */ 1705 */
1662static int ext4_ext_try_to_merge(struct inode *inode, 1706static void ext4_ext_try_to_merge(handle_t *handle,
1707 struct inode *inode,
1663 struct ext4_ext_path *path, 1708 struct ext4_ext_path *path,
1664 struct ext4_extent *ex) { 1709 struct ext4_extent *ex) {
1665 struct ext4_extent_header *eh; 1710 struct ext4_extent_header *eh;
1666 unsigned int depth; 1711 unsigned int depth;
1667 int merge_done = 0; 1712 int merge_done = 0;
1668 int ret = 0;
1669 1713
1670 depth = ext_depth(inode); 1714 depth = ext_depth(inode);
1671 BUG_ON(path[depth].p_hdr == NULL); 1715 BUG_ON(path[depth].p_hdr == NULL);
@@ -1675,9 +1719,9 @@ static int ext4_ext_try_to_merge(struct inode *inode,
1675 merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); 1719 merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1676 1720
1677 if (!merge_done) 1721 if (!merge_done)
1678 ret = ext4_ext_try_to_merge_right(inode, path, ex); 1722 (void) ext4_ext_try_to_merge_right(inode, path, ex);
1679 1723
1680 return ret; 1724 ext4_ext_try_to_merge_up(handle, inode, path);
1681} 1725}
1682 1726
1683/* 1727/*
@@ -1893,7 +1937,7 @@ has_space:
1893merge: 1937merge:
1894 /* try to merge extents */ 1938 /* try to merge extents */
1895 if (!(flag & EXT4_GET_BLOCKS_PRE_IO)) 1939 if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
1896 ext4_ext_try_to_merge(inode, path, nearex); 1940 ext4_ext_try_to_merge(handle, inode, path, nearex);
1897 1941
1898 1942
1899 /* time to correct all indexes above */ 1943 /* time to correct all indexes above */
@@ -1901,7 +1945,7 @@ merge:
1901 if (err) 1945 if (err)
1902 goto cleanup; 1946 goto cleanup;
1903 1947
1904 err = ext4_ext_dirty(handle, inode, path + depth); 1948 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
1905 1949
1906cleanup: 1950cleanup:
1907 if (npath) { 1951 if (npath) {
@@ -2092,13 +2136,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2092} 2136}
2093 2137
2094/* 2138/*
2095 * ext4_ext_check_cache() 2139 * ext4_ext_in_cache()
2096 * Checks to see if the given block is in the cache. 2140 * Checks to see if the given block is in the cache.
2097 * If it is, the cached extent is stored in the given 2141 * If it is, the cached extent is stored in the given
2098 * cache extent pointer. If the cached extent is a hole, 2142 * cache extent pointer.
2099 * this routine should be used instead of
2100 * ext4_ext_in_cache if the calling function needs to
2101 * know the size of the hole.
2102 * 2143 *
2103 * @inode: The files inode 2144 * @inode: The files inode
2104 * @block: The block to look for in the cache 2145 * @block: The block to look for in the cache
@@ -2107,8 +2148,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2107 * 2148 *
2108 * Return 0 if cache is invalid; 1 if the cache is valid 2149 * Return 0 if cache is invalid; 1 if the cache is valid
2109 */ 2150 */
2110static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block, 2151static int
2111 struct ext4_ext_cache *ex){ 2152ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2153 struct ext4_extent *ex)
2154{
2112 struct ext4_ext_cache *cex; 2155 struct ext4_ext_cache *cex;
2113 struct ext4_sb_info *sbi; 2156 struct ext4_sb_info *sbi;
2114 int ret = 0; 2157 int ret = 0;
@@ -2125,7 +2168,9 @@ static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
2125 goto errout; 2168 goto errout;
2126 2169
2127 if (in_range(block, cex->ec_block, cex->ec_len)) { 2170 if (in_range(block, cex->ec_block, cex->ec_len)) {
2128 memcpy(ex, cex, sizeof(struct ext4_ext_cache)); 2171 ex->ee_block = cpu_to_le32(cex->ec_block);
2172 ext4_ext_store_pblock(ex, cex->ec_start);
2173 ex->ee_len = cpu_to_le16(cex->ec_len);
2129 ext_debug("%u cached by %u:%u:%llu\n", 2174 ext_debug("%u cached by %u:%u:%llu\n",
2130 block, 2175 block,
2131 cex->ec_block, cex->ec_len, cex->ec_start); 2176 cex->ec_block, cex->ec_len, cex->ec_start);
@@ -2138,37 +2183,6 @@ errout:
2138} 2183}
2139 2184
2140/* 2185/*
2141 * ext4_ext_in_cache()
2142 * Checks to see if the given block is in the cache.
2143 * If it is, the cached extent is stored in the given
2144 * extent pointer.
2145 *
2146 * @inode: The files inode
2147 * @block: The block to look for in the cache
2148 * @ex: Pointer where the cached extent will be stored
2149 * if it contains block
2150 *
2151 * Return 0 if cache is invalid; 1 if the cache is valid
2152 */
2153static int
2154ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2155 struct ext4_extent *ex)
2156{
2157 struct ext4_ext_cache cex;
2158 int ret = 0;
2159
2160 if (ext4_ext_check_cache(inode, block, &cex)) {
2161 ex->ee_block = cpu_to_le32(cex.ec_block);
2162 ext4_ext_store_pblock(ex, cex.ec_start);
2163 ex->ee_len = cpu_to_le16(cex.ec_len);
2164 ret = 1;
2165 }
2166
2167 return ret;
2168}
2169
2170
2171/*
2172 * ext4_ext_rm_idx: 2186 * ext4_ext_rm_idx:
2173 * removes index from the index block. 2187 * removes index from the index block.
2174 */ 2188 */
@@ -2274,10 +2288,13 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2274 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2288 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2275 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2289 unsigned short ee_len = ext4_ext_get_actual_len(ex);
2276 ext4_fsblk_t pblk; 2290 ext4_fsblk_t pblk;
2277 int flags = EXT4_FREE_BLOCKS_FORGET; 2291 int flags = 0;
2278 2292
2279 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 2293 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2280 flags |= EXT4_FREE_BLOCKS_METADATA; 2294 flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
2295 else if (ext4_should_journal_data(inode))
2296 flags |= EXT4_FREE_BLOCKS_FORGET;
2297
2281 /* 2298 /*
2282 * For bigalloc file systems, we never free a partial cluster 2299 * For bigalloc file systems, we never free a partial cluster
2283 * at the beginning of the extent. Instead, we make a note 2300 * at the beginning of the extent. Instead, we make a note
@@ -2572,7 +2589,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2572 struct ext4_ext_path *path = NULL; 2589 struct ext4_ext_path *path = NULL;
2573 ext4_fsblk_t partial_cluster = 0; 2590 ext4_fsblk_t partial_cluster = 0;
2574 handle_t *handle; 2591 handle_t *handle;
2575 int i = 0, err; 2592 int i = 0, err = 0;
2576 2593
2577 ext_debug("truncate since %u to %u\n", start, end); 2594 ext_debug("truncate since %u to %u\n", start, end);
2578 2595
@@ -2604,12 +2621,16 @@ again:
2604 return PTR_ERR(path); 2621 return PTR_ERR(path);
2605 } 2622 }
2606 depth = ext_depth(inode); 2623 depth = ext_depth(inode);
2624 /* Leaf not may not exist only if inode has no blocks at all */
2607 ex = path[depth].p_ext; 2625 ex = path[depth].p_ext;
2608 if (!ex) { 2626 if (!ex) {
2609 ext4_ext_drop_refs(path); 2627 if (depth) {
2610 kfree(path); 2628 EXT4_ERROR_INODE(inode,
2611 path = NULL; 2629 "path[%d].p_hdr == NULL",
2612 goto cont; 2630 depth);
2631 err = -EIO;
2632 }
2633 goto out;
2613 } 2634 }
2614 2635
2615 ee_block = le32_to_cpu(ex->ee_block); 2636 ee_block = le32_to_cpu(ex->ee_block);
@@ -2641,8 +2662,6 @@ again:
2641 goto out; 2662 goto out;
2642 } 2663 }
2643 } 2664 }
2644cont:
2645
2646 /* 2665 /*
2647 * We start scanning from right side, freeing all the blocks 2666 * We start scanning from right side, freeing all the blocks
2648 * after i_size and walking into the tree depth-wise. 2667 * after i_size and walking into the tree depth-wise.
@@ -2924,9 +2943,9 @@ static int ext4_split_extent_at(handle_t *handle,
2924 ext4_ext_mark_initialized(ex); 2943 ext4_ext_mark_initialized(ex);
2925 2944
2926 if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) 2945 if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
2927 ext4_ext_try_to_merge(inode, path, ex); 2946 ext4_ext_try_to_merge(handle, inode, path, ex);
2928 2947
2929 err = ext4_ext_dirty(handle, inode, path + depth); 2948 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2930 goto out; 2949 goto out;
2931 } 2950 }
2932 2951
@@ -2958,8 +2977,8 @@ static int ext4_split_extent_at(handle_t *handle,
2958 goto fix_extent_len; 2977 goto fix_extent_len;
2959 /* update the extent length and mark as initialized */ 2978 /* update the extent length and mark as initialized */
2960 ex->ee_len = cpu_to_le16(ee_len); 2979 ex->ee_len = cpu_to_le16(ee_len);
2961 ext4_ext_try_to_merge(inode, path, ex); 2980 ext4_ext_try_to_merge(handle, inode, path, ex);
2962 err = ext4_ext_dirty(handle, inode, path + depth); 2981 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2963 goto out; 2982 goto out;
2964 } else if (err) 2983 } else if (err)
2965 goto fix_extent_len; 2984 goto fix_extent_len;
@@ -3041,7 +3060,6 @@ out:
3041 return err ? err : map->m_len; 3060 return err ? err : map->m_len;
3042} 3061}
3043 3062
3044#define EXT4_EXT_ZERO_LEN 7
3045/* 3063/*
3046 * This function is called by ext4_ext_map_blocks() if someone tries to write 3064 * This function is called by ext4_ext_map_blocks() if someone tries to write
3047 * to an uninitialized extent. It may result in splitting the uninitialized 3065 * to an uninitialized extent. It may result in splitting the uninitialized
@@ -3067,13 +3085,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
3067 struct ext4_map_blocks *map, 3085 struct ext4_map_blocks *map,
3068 struct ext4_ext_path *path) 3086 struct ext4_ext_path *path)
3069{ 3087{
3088 struct ext4_sb_info *sbi;
3070 struct ext4_extent_header *eh; 3089 struct ext4_extent_header *eh;
3071 struct ext4_map_blocks split_map; 3090 struct ext4_map_blocks split_map;
3072 struct ext4_extent zero_ex; 3091 struct ext4_extent zero_ex;
3073 struct ext4_extent *ex; 3092 struct ext4_extent *ex;
3074 ext4_lblk_t ee_block, eof_block; 3093 ext4_lblk_t ee_block, eof_block;
3075 unsigned int ee_len, depth; 3094 unsigned int ee_len, depth;
3076 int allocated; 3095 int allocated, max_zeroout = 0;
3077 int err = 0; 3096 int err = 0;
3078 int split_flag = 0; 3097 int split_flag = 0;
3079 3098
@@ -3081,6 +3100,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
3081 "block %llu, max_blocks %u\n", inode->i_ino, 3100 "block %llu, max_blocks %u\n", inode->i_ino,
3082 (unsigned long long)map->m_lblk, map->m_len); 3101 (unsigned long long)map->m_lblk, map->m_len);
3083 3102
3103 sbi = EXT4_SB(inode->i_sb);
3084 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 3104 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3085 inode->i_sb->s_blocksize_bits; 3105 inode->i_sb->s_blocksize_bits;
3086 if (eof_block < map->m_lblk + map->m_len) 3106 if (eof_block < map->m_lblk + map->m_len)
@@ -3180,9 +3200,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
3180 */ 3200 */
3181 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; 3201 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3182 3202
3183 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ 3203 if (EXT4_EXT_MAY_ZEROOUT & split_flag)
3184 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && 3204 max_zeroout = sbi->s_extent_max_zeroout_kb >>
3185 (EXT4_EXT_MAY_ZEROOUT & split_flag)) { 3205 inode->i_sb->s_blocksize_bits;
3206
3207 /* If extent is less than s_max_zeroout_kb, zeroout directly */
3208 if (max_zeroout && (ee_len <= max_zeroout)) {
3186 err = ext4_ext_zeroout(inode, ex); 3209 err = ext4_ext_zeroout(inode, ex);
3187 if (err) 3210 if (err)
3188 goto out; 3211 goto out;
@@ -3191,8 +3214,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
3191 if (err) 3214 if (err)
3192 goto out; 3215 goto out;
3193 ext4_ext_mark_initialized(ex); 3216 ext4_ext_mark_initialized(ex);
3194 ext4_ext_try_to_merge(inode, path, ex); 3217 ext4_ext_try_to_merge(handle, inode, path, ex);
3195 err = ext4_ext_dirty(handle, inode, path + depth); 3218 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3196 goto out; 3219 goto out;
3197 } 3220 }
3198 3221
@@ -3206,9 +3229,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
3206 split_map.m_lblk = map->m_lblk; 3229 split_map.m_lblk = map->m_lblk;
3207 split_map.m_len = map->m_len; 3230 split_map.m_len = map->m_len;
3208 3231
3209 if (allocated > map->m_len) { 3232 if (max_zeroout && (allocated > map->m_len)) {
3210 if (allocated <= EXT4_EXT_ZERO_LEN && 3233 if (allocated <= max_zeroout) {
3211 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3212 /* case 3 */ 3234 /* case 3 */
3213 zero_ex.ee_block = 3235 zero_ex.ee_block =
3214 cpu_to_le32(map->m_lblk); 3236 cpu_to_le32(map->m_lblk);
@@ -3220,9 +3242,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
3220 goto out; 3242 goto out;
3221 split_map.m_lblk = map->m_lblk; 3243 split_map.m_lblk = map->m_lblk;
3222 split_map.m_len = allocated; 3244 split_map.m_len = allocated;
3223 } else if ((map->m_lblk - ee_block + map->m_len < 3245 } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
3224 EXT4_EXT_ZERO_LEN) &&
3225 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3226 /* case 2 */ 3246 /* case 2 */
3227 if (map->m_lblk != ee_block) { 3247 if (map->m_lblk != ee_block) {
3228 zero_ex.ee_block = ex->ee_block; 3248 zero_ex.ee_block = ex->ee_block;
@@ -3242,7 +3262,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
3242 } 3262 }
3243 3263
3244 allocated = ext4_split_extent(handle, inode, path, 3264 allocated = ext4_split_extent(handle, inode, path,
3245 &split_map, split_flag, 0); 3265 &split_map, split_flag, 0);
3246 if (allocated < 0) 3266 if (allocated < 0)
3247 err = allocated; 3267 err = allocated;
3248 3268
@@ -3256,7 +3276,7 @@ out:
3256 * to an uninitialized extent. 3276 * to an uninitialized extent.
3257 * 3277 *
3258 * Writing to an uninitialized extent may result in splitting the uninitialized 3278 * Writing to an uninitialized extent may result in splitting the uninitialized
3259 * extent into multiple /initialized uninitialized extents (up to three) 3279 * extent into multiple initialized/uninitialized extents (up to three)
3260 * There are three possibilities: 3280 * There are three possibilities:
3261 * a> There is no split required: Entire extent should be uninitialized 3281 * a> There is no split required: Entire extent should be uninitialized
3262 * b> Splits in two extents: Write is happening at either end of the extent 3282 * b> Splits in two extents: Write is happening at either end of the extent
@@ -3333,10 +3353,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3333 /* note: ext4_ext_correct_indexes() isn't needed here because 3353 /* note: ext4_ext_correct_indexes() isn't needed here because
3334 * borders are not changed 3354 * borders are not changed
3335 */ 3355 */
3336 ext4_ext_try_to_merge(inode, path, ex); 3356 ext4_ext_try_to_merge(handle, inode, path, ex);
3337 3357
3338 /* Mark modified extent as dirty */ 3358 /* Mark modified extent as dirty */
3339 err = ext4_ext_dirty(handle, inode, path + depth); 3359 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3340out: 3360out:
3341 ext4_ext_show_leaf(inode, path); 3361 ext4_ext_show_leaf(inode, path);
3342 return err; 3362 return err;
@@ -3600,7 +3620,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3600{ 3620{
3601 int ret = 0; 3621 int ret = 0;
3602 int err = 0; 3622 int err = 0;
3603 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3623 ext4_io_end_t *io = ext4_inode_aio(inode);
3604 3624
3605 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " 3625 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
3606 "block %llu, max_blocks %u, flags %x, allocated %u\n", 3626 "block %llu, max_blocks %u, flags %x, allocated %u\n",
@@ -3615,6 +3635,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3615 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3635 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3616 ret = ext4_split_unwritten_extents(handle, inode, map, 3636 ret = ext4_split_unwritten_extents(handle, inode, map,
3617 path, flags); 3637 path, flags);
3638 if (ret <= 0)
3639 goto out;
3618 /* 3640 /*
3619 * Flag the inode(non aio case) or end_io struct (aio case) 3641 * Flag the inode(non aio case) or end_io struct (aio case)
3620 * that this IO needs to conversion to written when IO is 3642 * that this IO needs to conversion to written when IO is
@@ -3858,8 +3880,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3858 unsigned int allocated = 0, offset = 0; 3880 unsigned int allocated = 0, offset = 0;
3859 unsigned int allocated_clusters = 0; 3881 unsigned int allocated_clusters = 0;
3860 struct ext4_allocation_request ar; 3882 struct ext4_allocation_request ar;
3861 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3883 ext4_io_end_t *io = ext4_inode_aio(inode);
3862 ext4_lblk_t cluster_offset; 3884 ext4_lblk_t cluster_offset;
3885 int set_unwritten = 0;
3863 3886
3864 ext_debug("blocks %u/%u requested for inode %lu\n", 3887 ext_debug("blocks %u/%u requested for inode %lu\n",
3865 map->m_lblk, map->m_len, inode->i_ino); 3888 map->m_lblk, map->m_len, inode->i_ino);
@@ -4082,13 +4105,8 @@ got_allocated_blocks:
4082 * For non asycn direct IO case, flag the inode state 4105 * For non asycn direct IO case, flag the inode state
4083 * that we need to perform conversion when IO is done. 4106 * that we need to perform conversion when IO is done.
4084 */ 4107 */
4085 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 4108 if ((flags & EXT4_GET_BLOCKS_PRE_IO))
4086 if (io) 4109 set_unwritten = 1;
4087 ext4_set_io_unwritten_flag(inode, io);
4088 else
4089 ext4_set_inode_state(inode,
4090 EXT4_STATE_DIO_UNWRITTEN);
4091 }
4092 if (ext4_should_dioread_nolock(inode)) 4110 if (ext4_should_dioread_nolock(inode))
4093 map->m_flags |= EXT4_MAP_UNINIT; 4111 map->m_flags |= EXT4_MAP_UNINIT;
4094 } 4112 }
@@ -4100,6 +4118,15 @@ got_allocated_blocks:
4100 if (!err) 4118 if (!err)
4101 err = ext4_ext_insert_extent(handle, inode, path, 4119 err = ext4_ext_insert_extent(handle, inode, path,
4102 &newex, flags); 4120 &newex, flags);
4121
4122 if (!err && set_unwritten) {
4123 if (io)
4124 ext4_set_io_unwritten_flag(inode, io);
4125 else
4126 ext4_set_inode_state(inode,
4127 EXT4_STATE_DIO_UNWRITTEN);
4128 }
4129
4103 if (err && free_on_err) { 4130 if (err && free_on_err) {
4104 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? 4131 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
4105 EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; 4132 EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
@@ -4241,7 +4268,7 @@ void ext4_ext_truncate(struct inode *inode)
4241 * finish any pending end_io work so we won't run the risk of 4268 * finish any pending end_io work so we won't run the risk of
4242 * converting any truncated blocks to initialized later 4269 * converting any truncated blocks to initialized later
4243 */ 4270 */
4244 ext4_flush_completed_IO(inode); 4271 ext4_flush_unwritten_io(inode);
4245 4272
4246 /* 4273 /*
4247 * probably first extent we're gonna free will be last in block 4274 * probably first extent we're gonna free will be last in block
@@ -4769,9 +4796,32 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4769 loff_t first_page_offset, last_page_offset; 4796 loff_t first_page_offset, last_page_offset;
4770 int credits, err = 0; 4797 int credits, err = 0;
4771 4798
4799 /*
4800 * Write out all dirty pages to avoid race conditions
4801 * Then release them.
4802 */
4803 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4804 err = filemap_write_and_wait_range(mapping,
4805 offset, offset + length - 1);
4806
4807 if (err)
4808 return err;
4809 }
4810
4811 mutex_lock(&inode->i_mutex);
4812 /* It's not possible punch hole on append only file */
4813 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
4814 err = -EPERM;
4815 goto out_mutex;
4816 }
4817 if (IS_SWAPFILE(inode)) {
4818 err = -ETXTBSY;
4819 goto out_mutex;
4820 }
4821
4772 /* No need to punch hole beyond i_size */ 4822 /* No need to punch hole beyond i_size */
4773 if (offset >= inode->i_size) 4823 if (offset >= inode->i_size)
4774 return 0; 4824 goto out_mutex;
4775 4825
4776 /* 4826 /*
4777 * If the hole extends beyond i_size, set the hole 4827 * If the hole extends beyond i_size, set the hole
@@ -4789,35 +4839,26 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4789 first_page_offset = first_page << PAGE_CACHE_SHIFT; 4839 first_page_offset = first_page << PAGE_CACHE_SHIFT;
4790 last_page_offset = last_page << PAGE_CACHE_SHIFT; 4840 last_page_offset = last_page << PAGE_CACHE_SHIFT;
4791 4841
4792 /*
4793 * Write out all dirty pages to avoid race conditions
4794 * Then release them.
4795 */
4796 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4797 err = filemap_write_and_wait_range(mapping,
4798 offset, offset + length - 1);
4799
4800 if (err)
4801 return err;
4802 }
4803
4804 /* Now release the pages */ 4842 /* Now release the pages */
4805 if (last_page_offset > first_page_offset) { 4843 if (last_page_offset > first_page_offset) {
4806 truncate_pagecache_range(inode, first_page_offset, 4844 truncate_pagecache_range(inode, first_page_offset,
4807 last_page_offset - 1); 4845 last_page_offset - 1);
4808 } 4846 }
4809 4847
4810 /* finish any pending end_io work */ 4848 /* Wait all existing dio workers, newcomers will block on i_mutex */
4811 ext4_flush_completed_IO(inode); 4849 ext4_inode_block_unlocked_dio(inode);
4850 err = ext4_flush_unwritten_io(inode);
4851 if (err)
4852 goto out_dio;
4853 inode_dio_wait(inode);
4812 4854
4813 credits = ext4_writepage_trans_blocks(inode); 4855 credits = ext4_writepage_trans_blocks(inode);
4814 handle = ext4_journal_start(inode, credits); 4856 handle = ext4_journal_start(inode, credits);
4815 if (IS_ERR(handle)) 4857 if (IS_ERR(handle)) {
4816 return PTR_ERR(handle); 4858 err = PTR_ERR(handle);
4859 goto out_dio;
4860 }
4817 4861
4818 err = ext4_orphan_add(handle, inode);
4819 if (err)
4820 goto out;
4821 4862
4822 /* 4863 /*
4823 * Now we need to zero out the non-page-aligned data in the 4864 * Now we need to zero out the non-page-aligned data in the
@@ -4903,10 +4944,13 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4903 up_write(&EXT4_I(inode)->i_data_sem); 4944 up_write(&EXT4_I(inode)->i_data_sem);
4904 4945
4905out: 4946out:
4906 ext4_orphan_del(handle, inode);
4907 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4947 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4908 ext4_mark_inode_dirty(handle, inode); 4948 ext4_mark_inode_dirty(handle, inode);
4909 ext4_journal_stop(handle); 4949 ext4_journal_stop(handle);
4950out_dio:
4951 ext4_inode_resume_unlocked_dio(inode);
4952out_mutex:
4953 mutex_unlock(&inode->i_mutex);
4910 return err; 4954 return err;
4911} 4955}
4912int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4956int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3b0e3bdaabfc..bf3966bccd34 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -55,11 +55,11 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
55 return 0; 55 return 0;
56} 56}
57 57
58static void ext4_aiodio_wait(struct inode *inode) 58void ext4_unwritten_wait(struct inode *inode)
59{ 59{
60 wait_queue_head_t *wq = ext4_ioend_wq(inode); 60 wait_queue_head_t *wq = ext4_ioend_wq(inode);
61 61
62 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0)); 62 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
63} 63}
64 64
65/* 65/*
@@ -116,7 +116,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
116 "performance will be poor.", 116 "performance will be poor.",
117 inode->i_ino, current->comm); 117 inode->i_ino, current->comm);
118 mutex_lock(ext4_aio_mutex(inode)); 118 mutex_lock(ext4_aio_mutex(inode));
119 ext4_aiodio_wait(inode); 119 ext4_unwritten_wait(inode);
120 } 120 }
121 121
122 BUG_ON(iocb->ki_pos != pos); 122 BUG_ON(iocb->ki_pos != pos);
@@ -207,6 +207,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
207static const struct vm_operations_struct ext4_file_vm_ops = { 207static const struct vm_operations_struct ext4_file_vm_ops = {
208 .fault = filemap_fault, 208 .fault = filemap_fault,
209 .page_mkwrite = ext4_page_mkwrite, 209 .page_mkwrite = ext4_page_mkwrite,
210 .remap_pages = generic_file_remap_pages,
210}; 211};
211 212
212static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) 213static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
@@ -217,7 +218,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
217 return -ENOEXEC; 218 return -ENOEXEC;
218 file_accessed(file); 219 file_accessed(file);
219 vma->vm_ops = &ext4_file_vm_ops; 220 vma->vm_ops = &ext4_file_vm_ops;
220 vma->vm_flags |= VM_CAN_NONLINEAR;
221 return 0; 221 return 0;
222} 222}
223 223
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 2a1dcea4f12e..be1d89f385b4 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -34,87 +34,6 @@
34 34
35#include <trace/events/ext4.h> 35#include <trace/events/ext4.h>
36 36
37static void dump_completed_IO(struct inode * inode)
38{
39#ifdef EXT4FS_DEBUG
40 struct list_head *cur, *before, *after;
41 ext4_io_end_t *io, *io0, *io1;
42 unsigned long flags;
43
44 if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
45 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
46 return;
47 }
48
49 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
50 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
51 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
52 cur = &io->list;
53 before = cur->prev;
54 io0 = container_of(before, ext4_io_end_t, list);
55 after = cur->next;
56 io1 = container_of(after, ext4_io_end_t, list);
57
58 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
59 io, inode->i_ino, io0, io1);
60 }
61 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
62#endif
63}
64
65/*
66 * This function is called from ext4_sync_file().
67 *
68 * When IO is completed, the work to convert unwritten extents to
69 * written is queued on workqueue but may not get immediately
70 * scheduled. When fsync is called, we need to ensure the
71 * conversion is complete before fsync returns.
72 * The inode keeps track of a list of pending/completed IO that
73 * might needs to do the conversion. This function walks through
74 * the list and convert the related unwritten extents for completed IO
75 * to written.
76 * The function return the number of pending IOs on success.
77 */
78int ext4_flush_completed_IO(struct inode *inode)
79{
80 ext4_io_end_t *io;
81 struct ext4_inode_info *ei = EXT4_I(inode);
82 unsigned long flags;
83 int ret = 0;
84 int ret2 = 0;
85
86 dump_completed_IO(inode);
87 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
88 while (!list_empty(&ei->i_completed_io_list)){
89 io = list_entry(ei->i_completed_io_list.next,
90 ext4_io_end_t, list);
91 list_del_init(&io->list);
92 io->flag |= EXT4_IO_END_IN_FSYNC;
93 /*
94 * Calling ext4_end_io_nolock() to convert completed
95 * IO to written.
96 *
97 * When ext4_sync_file() is called, run_queue() may already
98 * about to flush the work corresponding to this io structure.
99 * It will be upset if it founds the io structure related
100 * to the work-to-be schedule is freed.
101 *
102 * Thus we need to keep the io structure still valid here after
103 * conversion finished. The io structure has a flag to
104 * avoid double converting from both fsync and background work
105 * queue work.
106 */
107 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
108 ret = ext4_end_io_nolock(io);
109 if (ret < 0)
110 ret2 = ret;
111 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
112 io->flag &= ~EXT4_IO_END_IN_FSYNC;
113 }
114 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
115 return (ret2 < 0) ? ret2 : 0;
116}
117
118/* 37/*
119 * If we're not journaling and this is a just-created file, we have to 38 * If we're not journaling and this is a just-created file, we have to
120 * sync our parent directory (if it was freshly created) since 39 * sync our parent directory (if it was freshly created) since
@@ -203,7 +122,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
203 struct inode *inode = file->f_mapping->host; 122 struct inode *inode = file->f_mapping->host;
204 struct ext4_inode_info *ei = EXT4_I(inode); 123 struct ext4_inode_info *ei = EXT4_I(inode);
205 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 124 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
206 int ret; 125 int ret, err;
207 tid_t commit_tid; 126 tid_t commit_tid;
208 bool needs_barrier = false; 127 bool needs_barrier = false;
209 128
@@ -219,7 +138,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
219 if (inode->i_sb->s_flags & MS_RDONLY) 138 if (inode->i_sb->s_flags & MS_RDONLY)
220 goto out; 139 goto out;
221 140
222 ret = ext4_flush_completed_IO(inode); 141 ret = ext4_flush_unwritten_io(inode);
223 if (ret < 0) 142 if (ret < 0)
224 goto out; 143 goto out;
225 144
@@ -255,8 +174,11 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
255 needs_barrier = true; 174 needs_barrier = true;
256 jbd2_log_start_commit(journal, commit_tid); 175 jbd2_log_start_commit(journal, commit_tid);
257 ret = jbd2_log_wait_commit(journal, commit_tid); 176 ret = jbd2_log_wait_commit(journal, commit_tid);
258 if (needs_barrier) 177 if (needs_barrier) {
259 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 178 err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
179 if (!ret)
180 ret = err;
181 }
260 out: 182 out:
261 mutex_unlock(&inode->i_mutex); 183 mutex_unlock(&inode->i_mutex);
262 trace_ext4_sync_file_exit(inode, ret); 184 trace_ext4_sync_file_exit(inode, ret);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 26154b81b836..fa36372f3fdf 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -697,6 +697,15 @@ got_group:
697 if (!gdp) 697 if (!gdp)
698 goto fail; 698 goto fail;
699 699
700 /*
701 * Check free inodes count before loading bitmap.
702 */
703 if (ext4_free_inodes_count(sb, gdp) == 0) {
704 if (++group == ngroups)
705 group = 0;
706 continue;
707 }
708
700 brelse(inode_bitmap_bh); 709 brelse(inode_bitmap_bh);
701 inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); 710 inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
702 if (!inode_bitmap_bh) 711 if (!inode_bitmap_bh)
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 830e1b2bf145..792e388e7b44 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -807,16 +807,30 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
807 807
808retry: 808retry:
809 if (rw == READ && ext4_should_dioread_nolock(inode)) { 809 if (rw == READ && ext4_should_dioread_nolock(inode)) {
810 if (unlikely(!list_empty(&ei->i_completed_io_list))) { 810 if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
811 mutex_lock(&inode->i_mutex); 811 mutex_lock(&inode->i_mutex);
812 ext4_flush_completed_IO(inode); 812 ext4_flush_unwritten_io(inode);
813 mutex_unlock(&inode->i_mutex); 813 mutex_unlock(&inode->i_mutex);
814 } 814 }
815 /*
816 * Nolock dioread optimization may be dynamically disabled
817 * via ext4_inode_block_unlocked_dio(). Check inode's state
818 * while holding extra i_dio_count ref.
819 */
820 atomic_inc(&inode->i_dio_count);
821 smp_mb();
822 if (unlikely(ext4_test_inode_state(inode,
823 EXT4_STATE_DIOREAD_LOCK))) {
824 inode_dio_done(inode);
825 goto locked;
826 }
815 ret = __blockdev_direct_IO(rw, iocb, inode, 827 ret = __blockdev_direct_IO(rw, iocb, inode,
816 inode->i_sb->s_bdev, iov, 828 inode->i_sb->s_bdev, iov,
817 offset, nr_segs, 829 offset, nr_segs,
818 ext4_get_block, NULL, NULL, 0); 830 ext4_get_block, NULL, NULL, 0);
831 inode_dio_done(inode);
819 } else { 832 } else {
833locked:
820 ret = blockdev_direct_IO(rw, iocb, inode, iov, 834 ret = blockdev_direct_IO(rw, iocb, inode, iov,
821 offset, nr_segs, ext4_get_block); 835 offset, nr_segs, ext4_get_block);
822 836
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index dff171c3a123..b3c243b9afa5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -732,11 +732,13 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
732 err = ext4_map_blocks(handle, inode, &map, 732 err = ext4_map_blocks(handle, inode, &map,
733 create ? EXT4_GET_BLOCKS_CREATE : 0); 733 create ? EXT4_GET_BLOCKS_CREATE : 0);
734 734
735 /* ensure we send some value back into *errp */
736 *errp = 0;
737
735 if (err < 0) 738 if (err < 0)
736 *errp = err; 739 *errp = err;
737 if (err <= 0) 740 if (err <= 0)
738 return NULL; 741 return NULL;
739 *errp = 0;
740 742
741 bh = sb_getblk(inode->i_sb, map.m_pblk); 743 bh = sb_getblk(inode->i_sb, map.m_pblk);
742 if (!bh) { 744 if (!bh) {
@@ -1954,9 +1956,6 @@ out:
1954 return ret; 1956 return ret;
1955} 1957}
1956 1958
1957static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
1958static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
1959
1960/* 1959/*
1961 * Note that we don't need to start a transaction unless we're journaling data 1960 * Note that we don't need to start a transaction unless we're journaling data
1962 * because we should have holes filled from ext4_page_mkwrite(). We even don't 1961 * because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -2463,6 +2462,16 @@ static int ext4_nonda_switch(struct super_block *sb)
2463 free_blocks = EXT4_C2B(sbi, 2462 free_blocks = EXT4_C2B(sbi,
2464 percpu_counter_read_positive(&sbi->s_freeclusters_counter)); 2463 percpu_counter_read_positive(&sbi->s_freeclusters_counter));
2465 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); 2464 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
2465 /*
2466 * Start pushing delalloc when 1/2 of free blocks are dirty.
2467 */
2468 if (dirty_blocks && (free_blocks < 2 * dirty_blocks) &&
2469 !writeback_in_progress(sb->s_bdi) &&
2470 down_read_trylock(&sb->s_umount)) {
2471 writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
2472 up_read(&sb->s_umount);
2473 }
2474
2466 if (2 * free_blocks < 3 * dirty_blocks || 2475 if (2 * free_blocks < 3 * dirty_blocks ||
2467 free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { 2476 free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
2468 /* 2477 /*
@@ -2471,13 +2480,6 @@ static int ext4_nonda_switch(struct super_block *sb)
2471 */ 2480 */
2472 return 1; 2481 return 1;
2473 } 2482 }
2474 /*
2475 * Even if we don't switch but are nearing capacity,
2476 * start pushing delalloc when 1/2 of free blocks are dirty.
2477 */
2478 if (free_blocks < 2 * dirty_blocks)
2479 writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
2480
2481 return 0; 2483 return 0;
2482} 2484}
2483 2485
@@ -2879,9 +2881,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
2879{ 2881{
2880 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 2882 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
2881 ext4_io_end_t *io_end = iocb->private; 2883 ext4_io_end_t *io_end = iocb->private;
2882 struct workqueue_struct *wq;
2883 unsigned long flags;
2884 struct ext4_inode_info *ei;
2885 2884
2886 /* if not async direct IO or dio with 0 bytes write, just return */ 2885 /* if not async direct IO or dio with 0 bytes write, just return */
2887 if (!io_end || !size) 2886 if (!io_end || !size)
@@ -2910,24 +2909,14 @@ out:
2910 io_end->iocb = iocb; 2909 io_end->iocb = iocb;
2911 io_end->result = ret; 2910 io_end->result = ret;
2912 } 2911 }
2913 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
2914 2912
2915 /* Add the io_end to per-inode completed aio dio list*/ 2913 ext4_add_complete_io(io_end);
2916 ei = EXT4_I(io_end->inode);
2917 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
2918 list_add_tail(&io_end->list, &ei->i_completed_io_list);
2919 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
2920
2921 /* queue the work to convert unwritten extents to written */
2922 queue_work(wq, &io_end->work);
2923} 2914}
2924 2915
2925static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) 2916static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
2926{ 2917{
2927 ext4_io_end_t *io_end = bh->b_private; 2918 ext4_io_end_t *io_end = bh->b_private;
2928 struct workqueue_struct *wq;
2929 struct inode *inode; 2919 struct inode *inode;
2930 unsigned long flags;
2931 2920
2932 if (!test_clear_buffer_uninit(bh) || !io_end) 2921 if (!test_clear_buffer_uninit(bh) || !io_end)
2933 goto out; 2922 goto out;
@@ -2946,15 +2935,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
2946 */ 2935 */
2947 inode = io_end->inode; 2936 inode = io_end->inode;
2948 ext4_set_io_unwritten_flag(inode, io_end); 2937 ext4_set_io_unwritten_flag(inode, io_end);
2949 2938 ext4_add_complete_io(io_end);
2950 /* Add the io_end to per-inode completed io list*/
2951 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
2952 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
2953 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
2954
2955 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
2956 /* queue the work to convert unwritten extents to written */
2957 queue_work(wq, &io_end->work);
2958out: 2939out:
2959 bh->b_private = NULL; 2940 bh->b_private = NULL;
2960 bh->b_end_io = NULL; 2941 bh->b_end_io = NULL;
@@ -3029,6 +3010,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3029 overwrite = *((int *)iocb->private); 3010 overwrite = *((int *)iocb->private);
3030 3011
3031 if (overwrite) { 3012 if (overwrite) {
3013 atomic_inc(&inode->i_dio_count);
3032 down_read(&EXT4_I(inode)->i_data_sem); 3014 down_read(&EXT4_I(inode)->i_data_sem);
3033 mutex_unlock(&inode->i_mutex); 3015 mutex_unlock(&inode->i_mutex);
3034 } 3016 }
@@ -3054,7 +3036,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3054 * hook to the iocb. 3036 * hook to the iocb.
3055 */ 3037 */
3056 iocb->private = NULL; 3038 iocb->private = NULL;
3057 EXT4_I(inode)->cur_aio_dio = NULL; 3039 ext4_inode_aio_set(inode, NULL);
3058 if (!is_sync_kiocb(iocb)) { 3040 if (!is_sync_kiocb(iocb)) {
3059 ext4_io_end_t *io_end = 3041 ext4_io_end_t *io_end =
3060 ext4_init_io_end(inode, GFP_NOFS); 3042 ext4_init_io_end(inode, GFP_NOFS);
@@ -3071,7 +3053,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3071 * is a unwritten extents needs to be converted 3053 * is a unwritten extents needs to be converted
3072 * when IO is completed. 3054 * when IO is completed.
3073 */ 3055 */
3074 EXT4_I(inode)->cur_aio_dio = iocb->private; 3056 ext4_inode_aio_set(inode, io_end);
3075 } 3057 }
3076 3058
3077 if (overwrite) 3059 if (overwrite)
@@ -3091,7 +3073,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3091 NULL, 3073 NULL,
3092 DIO_LOCKING); 3074 DIO_LOCKING);
3093 if (iocb->private) 3075 if (iocb->private)
3094 EXT4_I(inode)->cur_aio_dio = NULL; 3076 ext4_inode_aio_set(inode, NULL);
3095 /* 3077 /*
3096 * The io_end structure takes a reference to the inode, 3078 * The io_end structure takes a reference to the inode,
3097 * that structure needs to be destroyed and the 3079 * that structure needs to be destroyed and the
@@ -3126,6 +3108,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3126 retake_lock: 3108 retake_lock:
3127 /* take i_mutex locking again if we do a ovewrite dio */ 3109 /* take i_mutex locking again if we do a ovewrite dio */
3128 if (overwrite) { 3110 if (overwrite) {
3111 inode_dio_done(inode);
3129 up_read(&EXT4_I(inode)->i_data_sem); 3112 up_read(&EXT4_I(inode)->i_data_sem);
3130 mutex_lock(&inode->i_mutex); 3113 mutex_lock(&inode->i_mutex);
3131 } 3114 }
@@ -3313,7 +3296,7 @@ int ext4_discard_partial_page_buffers(handle_t *handle,
3313 * handle: The journal handle 3296 * handle: The journal handle
3314 * inode: The files inode 3297 * inode: The files inode
3315 * page: A locked page that contains the offset "from" 3298 * page: A locked page that contains the offset "from"
3316 * from: The starting byte offset (from the begining of the file) 3299 * from: The starting byte offset (from the beginning of the file)
3317 * to begin discarding 3300 * to begin discarding
3318 * len: The length of bytes to discard 3301 * len: The length of bytes to discard
3319 * flags: Optional flags that may be used: 3302 * flags: Optional flags that may be used:
@@ -3321,11 +3304,11 @@ int ext4_discard_partial_page_buffers(handle_t *handle,
3321 * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 3304 * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
3322 * Only zero the regions of the page whose buffer heads 3305 * Only zero the regions of the page whose buffer heads
3323 * have already been unmapped. This flag is appropriate 3306 * have already been unmapped. This flag is appropriate
3324 * for updateing the contents of a page whose blocks may 3307 * for updating the contents of a page whose blocks may
3325 * have already been released, and we only want to zero 3308 * have already been released, and we only want to zero
3326 * out the regions that correspond to those released blocks. 3309 * out the regions that correspond to those released blocks.
3327 * 3310 *
3328 * Returns zero on sucess or negative on failure. 3311 * Returns zero on success or negative on failure.
3329 */ 3312 */
3330static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, 3313static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
3331 struct inode *inode, struct page *page, loff_t from, 3314 struct inode *inode, struct page *page, loff_t from,
@@ -3486,7 +3469,7 @@ int ext4_can_truncate(struct inode *inode)
3486 * @offset: The offset where the hole will begin 3469 * @offset: The offset where the hole will begin
3487 * @len: The length of the hole 3470 * @len: The length of the hole
3488 * 3471 *
3489 * Returns: 0 on sucess or negative on failure 3472 * Returns: 0 on success or negative on failure
3490 */ 3473 */
3491 3474
3492int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) 3475int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
@@ -4008,7 +3991,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
4008 3991
4009 if (i_blocks <= ~0U) { 3992 if (i_blocks <= ~0U) {
4010 /* 3993 /*
4011 * i_blocks can be represnted in a 32 bit variable 3994 * i_blocks can be represented in a 32 bit variable
4012 * as multiple of 512 bytes 3995 * as multiple of 512 bytes
4013 */ 3996 */
4014 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 3997 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
@@ -4052,6 +4035,7 @@ static int ext4_do_update_inode(handle_t *handle,
4052 struct ext4_inode_info *ei = EXT4_I(inode); 4035 struct ext4_inode_info *ei = EXT4_I(inode);
4053 struct buffer_head *bh = iloc->bh; 4036 struct buffer_head *bh = iloc->bh;
4054 int err = 0, rc, block; 4037 int err = 0, rc, block;
4038 int need_datasync = 0;
4055 uid_t i_uid; 4039 uid_t i_uid;
4056 gid_t i_gid; 4040 gid_t i_gid;
4057 4041
@@ -4102,7 +4086,10 @@ static int ext4_do_update_inode(handle_t *handle,
4102 raw_inode->i_file_acl_high = 4086 raw_inode->i_file_acl_high =
4103 cpu_to_le16(ei->i_file_acl >> 32); 4087 cpu_to_le16(ei->i_file_acl >> 32);
4104 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); 4088 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
4105 ext4_isize_set(raw_inode, ei->i_disksize); 4089 if (ei->i_disksize != ext4_isize(raw_inode)) {
4090 ext4_isize_set(raw_inode, ei->i_disksize);
4091 need_datasync = 1;
4092 }
4106 if (ei->i_disksize > 0x7fffffffULL) { 4093 if (ei->i_disksize > 0x7fffffffULL) {
4107 struct super_block *sb = inode->i_sb; 4094 struct super_block *sb = inode->i_sb;
4108 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 4095 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -4155,7 +4142,7 @@ static int ext4_do_update_inode(handle_t *handle,
4155 err = rc; 4142 err = rc;
4156 ext4_clear_inode_state(inode, EXT4_STATE_NEW); 4143 ext4_clear_inode_state(inode, EXT4_STATE_NEW);
4157 4144
4158 ext4_update_inode_fsync_trans(handle, inode, 0); 4145 ext4_update_inode_fsync_trans(handle, inode, need_datasync);
4159out_brelse: 4146out_brelse:
4160 brelse(bh); 4147 brelse(bh);
4161 ext4_std_error(inode->i_sb, err); 4148 ext4_std_error(inode->i_sb, err);
@@ -4169,7 +4156,7 @@ out_brelse:
4169 * 4156 *
4170 * - Within generic_file_write() for O_SYNC files. 4157 * - Within generic_file_write() for O_SYNC files.
4171 * Here, there will be no transaction running. We wait for any running 4158 * Here, there will be no transaction running. We wait for any running
4172 * trasnaction to commit. 4159 * transaction to commit.
4173 * 4160 *
4174 * - Within sys_sync(), kupdate and such. 4161 * - Within sys_sync(), kupdate and such.
4175 * We wait on commit, if tol to. 4162 * We wait on commit, if tol to.
@@ -4298,7 +4285,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4298 } 4285 }
4299 4286
4300 if (attr->ia_valid & ATTR_SIZE) { 4287 if (attr->ia_valid & ATTR_SIZE) {
4301 inode_dio_wait(inode);
4302 4288
4303 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 4289 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4304 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4290 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -4347,8 +4333,17 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4347 } 4333 }
4348 4334
4349 if (attr->ia_valid & ATTR_SIZE) { 4335 if (attr->ia_valid & ATTR_SIZE) {
4350 if (attr->ia_size != i_size_read(inode)) 4336 if (attr->ia_size != i_size_read(inode)) {
4351 truncate_setsize(inode, attr->ia_size); 4337 truncate_setsize(inode, attr->ia_size);
4338 /* Inode size will be reduced, wait for dio in flight.
4339 * Temporarily disable dioread_nolock to prevent
4340 * livelock. */
4341 if (orphan) {
4342 ext4_inode_block_unlocked_dio(inode);
4343 inode_dio_wait(inode);
4344 ext4_inode_resume_unlocked_dio(inode);
4345 }
4346 }
4352 ext4_truncate(inode); 4347 ext4_truncate(inode);
4353 } 4348 }
4354 4349
@@ -4413,7 +4408,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4413 * worse case, the indexs blocks spread over different block groups 4408 * worse case, the indexs blocks spread over different block groups
4414 * 4409 *
4415 * If datablocks are discontiguous, they are possible to spread over 4410 * If datablocks are discontiguous, they are possible to spread over
4416 * different block groups too. If they are contiuguous, with flexbg, 4411 * different block groups too. If they are contiguous, with flexbg,
4417 * they could still across block group boundary. 4412 * they could still across block group boundary.
4418 * 4413 *
4419 * Also account for superblock, inode, quota and xattr blocks 4414 * Also account for superblock, inode, quota and xattr blocks
@@ -4727,6 +4722,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
4727 return err; 4722 return err;
4728 } 4723 }
4729 4724
4725 /* Wait for all existing dio workers */
4726 ext4_inode_block_unlocked_dio(inode);
4727 inode_dio_wait(inode);
4728
4730 jbd2_journal_lock_updates(journal); 4729 jbd2_journal_lock_updates(journal);
4731 4730
4732 /* 4731 /*
@@ -4746,6 +4745,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
4746 ext4_set_aops(inode); 4745 ext4_set_aops(inode);
4747 4746
4748 jbd2_journal_unlock_updates(journal); 4747 jbd2_journal_unlock_updates(journal);
4748 ext4_inode_resume_unlocked_dio(inode);
4749 4749
4750 /* Finally we can mark the inode as dirty. */ 4750 /* Finally we can mark the inode as dirty. */
4751 4751
@@ -4780,6 +4780,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4780 int retries = 0; 4780 int retries = 0;
4781 4781
4782 sb_start_pagefault(inode->i_sb); 4782 sb_start_pagefault(inode->i_sb);
4783 file_update_time(vma->vm_file);
4783 /* Delalloc case is easy... */ 4784 /* Delalloc case is easy... */
4784 if (test_opt(inode->i_sb, DELALLOC) && 4785 if (test_opt(inode->i_sb, DELALLOC) &&
4785 !ext4_should_journal_data(inode) && 4786 !ext4_should_journal_data(inode) &&
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7f7dad787603..5747f52f7c72 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -233,7 +233,7 @@ group_extend_out:
233 233
234 case EXT4_IOC_MOVE_EXT: { 234 case EXT4_IOC_MOVE_EXT: {
235 struct move_extent me; 235 struct move_extent me;
236 struct file *donor_filp; 236 struct fd donor;
237 int err; 237 int err;
238 238
239 if (!(filp->f_mode & FMODE_READ) || 239 if (!(filp->f_mode & FMODE_READ) ||
@@ -245,11 +245,11 @@ group_extend_out:
245 return -EFAULT; 245 return -EFAULT;
246 me.moved_len = 0; 246 me.moved_len = 0;
247 247
248 donor_filp = fget(me.donor_fd); 248 donor = fdget(me.donor_fd);
249 if (!donor_filp) 249 if (!donor.file)
250 return -EBADF; 250 return -EBADF;
251 251
252 if (!(donor_filp->f_mode & FMODE_WRITE)) { 252 if (!(donor.file->f_mode & FMODE_WRITE)) {
253 err = -EBADF; 253 err = -EBADF;
254 goto mext_out; 254 goto mext_out;
255 } 255 }
@@ -258,14 +258,15 @@ group_extend_out:
258 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { 258 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
259 ext4_msg(sb, KERN_ERR, 259 ext4_msg(sb, KERN_ERR,
260 "Online defrag not supported with bigalloc"); 260 "Online defrag not supported with bigalloc");
261 return -EOPNOTSUPP; 261 err = -EOPNOTSUPP;
262 goto mext_out;
262 } 263 }
263 264
264 err = mnt_want_write_file(filp); 265 err = mnt_want_write_file(filp);
265 if (err) 266 if (err)
266 goto mext_out; 267 goto mext_out;
267 268
268 err = ext4_move_extents(filp, donor_filp, me.orig_start, 269 err = ext4_move_extents(filp, donor.file, me.orig_start,
269 me.donor_start, me.len, &me.moved_len); 270 me.donor_start, me.len, &me.moved_len);
270 mnt_drop_write_file(filp); 271 mnt_drop_write_file(filp);
271 272
@@ -273,7 +274,7 @@ group_extend_out:
273 &me, sizeof(me))) 274 &me, sizeof(me)))
274 err = -EFAULT; 275 err = -EFAULT;
275mext_out: 276mext_out:
276 fput(donor_filp); 277 fdput(donor);
277 return err; 278 return err;
278 } 279 }
279 280
@@ -365,26 +366,11 @@ group_add_out:
365 return -EOPNOTSUPP; 366 return -EOPNOTSUPP;
366 } 367 }
367 368
368 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
369 EXT4_FEATURE_INCOMPAT_META_BG)) {
370 ext4_msg(sb, KERN_ERR,
371 "Online resizing not (yet) supported with meta_bg");
372 return -EOPNOTSUPP;
373 }
374
375 if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, 369 if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
376 sizeof(__u64))) { 370 sizeof(__u64))) {
377 return -EFAULT; 371 return -EFAULT;
378 } 372 }
379 373
380 if (n_blocks_count > MAX_32_NUM &&
381 !EXT4_HAS_INCOMPAT_FEATURE(sb,
382 EXT4_FEATURE_INCOMPAT_64BIT)) {
383 ext4_msg(sb, KERN_ERR,
384 "File system only supports 32-bit block numbers");
385 return -EOPNOTSUPP;
386 }
387
388 err = ext4_resize_begin(sb); 374 err = ext4_resize_begin(sb);
389 if (err) 375 if (err)
390 return err; 376 return err;
@@ -419,13 +405,6 @@ resizefs_out:
419 if (!blk_queue_discard(q)) 405 if (!blk_queue_discard(q))
420 return -EOPNOTSUPP; 406 return -EOPNOTSUPP;
421 407
422 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
423 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
424 ext4_msg(sb, KERN_ERR,
425 "FITRIM not supported with bigalloc");
426 return -EOPNOTSUPP;
427 }
428
429 if (copy_from_user(&range, (struct fstrim_range __user *)arg, 408 if (copy_from_user(&range, (struct fstrim_range __user *)arg,
430 sizeof(range))) 409 sizeof(range)))
431 return -EFAULT; 410 return -EFAULT;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 8eae94771c45..f8b27bf80aca 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -24,6 +24,7 @@
24#include "ext4_jbd2.h" 24#include "ext4_jbd2.h"
25#include "mballoc.h" 25#include "mballoc.h"
26#include <linux/debugfs.h> 26#include <linux/debugfs.h>
27#include <linux/log2.h>
27#include <linux/slab.h> 28#include <linux/slab.h>
28#include <trace/events/ext4.h> 29#include <trace/events/ext4.h>
29 30
@@ -1338,17 +1339,17 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1338 mb_check_buddy(e4b); 1339 mb_check_buddy(e4b);
1339} 1340}
1340 1341
1341static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, 1342static int mb_find_extent(struct ext4_buddy *e4b, int block,
1342 int needed, struct ext4_free_extent *ex) 1343 int needed, struct ext4_free_extent *ex)
1343{ 1344{
1344 int next = block; 1345 int next = block;
1345 int max; 1346 int max, order;
1346 void *buddy; 1347 void *buddy;
1347 1348
1348 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 1349 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1349 BUG_ON(ex == NULL); 1350 BUG_ON(ex == NULL);
1350 1351
1351 buddy = mb_find_buddy(e4b, order, &max); 1352 buddy = mb_find_buddy(e4b, 0, &max);
1352 BUG_ON(buddy == NULL); 1353 BUG_ON(buddy == NULL);
1353 BUG_ON(block >= max); 1354 BUG_ON(block >= max);
1354 if (mb_test_bit(block, buddy)) { 1355 if (mb_test_bit(block, buddy)) {
@@ -1358,12 +1359,9 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
1358 return 0; 1359 return 0;
1359 } 1360 }
1360 1361
1361 /* FIXME dorp order completely ? */ 1362 /* find actual order */
1362 if (likely(order == 0)) { 1363 order = mb_find_order_for_block(e4b, block);
1363 /* find actual order */ 1364 block = block >> order;
1364 order = mb_find_order_for_block(e4b, block);
1365 block = block >> order;
1366 }
1367 1365
1368 ex->fe_len = 1 << order; 1366 ex->fe_len = 1 << order;
1369 ex->fe_start = block << order; 1367 ex->fe_start = block << order;
@@ -1549,7 +1547,7 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1549 /* recheck chunk's availability - we don't know 1547 /* recheck chunk's availability - we don't know
1550 * when it was found (within this lock-unlock 1548 * when it was found (within this lock-unlock
1551 * period or not) */ 1549 * period or not) */
1552 max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex); 1550 max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
1553 if (max >= gex->fe_len) { 1551 if (max >= gex->fe_len) {
1554 ext4_mb_use_best_found(ac, e4b); 1552 ext4_mb_use_best_found(ac, e4b);
1555 return; 1553 return;
@@ -1641,7 +1639,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1641 return err; 1639 return err;
1642 1640
1643 ext4_lock_group(ac->ac_sb, group); 1641 ext4_lock_group(ac->ac_sb, group);
1644 max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex); 1642 max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
1645 1643
1646 if (max > 0) { 1644 if (max > 0) {
1647 ac->ac_b_ex = ex; 1645 ac->ac_b_ex = ex;
@@ -1662,17 +1660,20 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1662 int max; 1660 int max;
1663 int err; 1661 int err;
1664 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1662 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1663 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1665 struct ext4_free_extent ex; 1664 struct ext4_free_extent ex;
1666 1665
1667 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) 1666 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
1668 return 0; 1667 return 0;
1668 if (grp->bb_free == 0)
1669 return 0;
1669 1670
1670 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); 1671 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1671 if (err) 1672 if (err)
1672 return err; 1673 return err;
1673 1674
1674 ext4_lock_group(ac->ac_sb, group); 1675 ext4_lock_group(ac->ac_sb, group);
1675 max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start, 1676 max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
1676 ac->ac_g_ex.fe_len, &ex); 1677 ac->ac_g_ex.fe_len, &ex);
1677 1678
1678 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { 1679 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
@@ -1788,7 +1789,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1788 break; 1789 break;
1789 } 1790 }
1790 1791
1791 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); 1792 mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
1792 BUG_ON(ex.fe_len <= 0); 1793 BUG_ON(ex.fe_len <= 0);
1793 if (free < ex.fe_len) { 1794 if (free < ex.fe_len) {
1794 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, 1795 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
@@ -1840,7 +1841,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1840 1841
1841 while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { 1842 while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
1842 if (!mb_test_bit(i, bitmap)) { 1843 if (!mb_test_bit(i, bitmap)) {
1843 max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); 1844 max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
1844 if (max >= sbi->s_stripe) { 1845 if (max >= sbi->s_stripe) {
1845 ac->ac_found++; 1846 ac->ac_found++;
1846 ac->ac_b_ex = ex; 1847 ac->ac_b_ex = ex;
@@ -1862,6 +1863,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1862 1863
1863 BUG_ON(cr < 0 || cr >= 4); 1864 BUG_ON(cr < 0 || cr >= 4);
1864 1865
1866 free = grp->bb_free;
1867 if (free == 0)
1868 return 0;
1869 if (cr <= 2 && free < ac->ac_g_ex.fe_len)
1870 return 0;
1871
1865 /* We only do this if the grp has never been initialized */ 1872 /* We only do this if the grp has never been initialized */
1866 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 1873 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1867 int ret = ext4_mb_init_group(ac->ac_sb, group); 1874 int ret = ext4_mb_init_group(ac->ac_sb, group);
@@ -1869,10 +1876,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1869 return 0; 1876 return 0;
1870 } 1877 }
1871 1878
1872 free = grp->bb_free;
1873 fragments = grp->bb_fragments; 1879 fragments = grp->bb_fragments;
1874 if (free == 0)
1875 return 0;
1876 if (fragments == 0) 1880 if (fragments == 0)
1877 return 0; 1881 return 0;
1878 1882
@@ -2163,6 +2167,39 @@ static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
2163 return cachep; 2167 return cachep;
2164} 2168}
2165 2169
2170/*
2171 * Allocate the top-level s_group_info array for the specified number
2172 * of groups
2173 */
2174int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
2175{
2176 struct ext4_sb_info *sbi = EXT4_SB(sb);
2177 unsigned size;
2178 struct ext4_group_info ***new_groupinfo;
2179
2180 size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
2181 EXT4_DESC_PER_BLOCK_BITS(sb);
2182 if (size <= sbi->s_group_info_size)
2183 return 0;
2184
2185 size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
2186 new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL);
2187 if (!new_groupinfo) {
2188 ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
2189 return -ENOMEM;
2190 }
2191 if (sbi->s_group_info) {
2192 memcpy(new_groupinfo, sbi->s_group_info,
2193 sbi->s_group_info_size * sizeof(*sbi->s_group_info));
2194 ext4_kvfree(sbi->s_group_info);
2195 }
2196 sbi->s_group_info = new_groupinfo;
2197 sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
2198 ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
2199 sbi->s_group_info_size);
2200 return 0;
2201}
2202
2166/* Create and initialize ext4_group_info data for the given group. */ 2203/* Create and initialize ext4_group_info data for the given group. */
2167int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2204int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2168 struct ext4_group_desc *desc) 2205 struct ext4_group_desc *desc)
@@ -2195,12 +2232,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2195 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; 2232 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2196 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); 2233 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2197 2234
2198 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); 2235 meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL);
2199 if (meta_group_info[i] == NULL) { 2236 if (meta_group_info[i] == NULL) {
2200 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); 2237 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
2201 goto exit_group_info; 2238 goto exit_group_info;
2202 } 2239 }
2203 memset(meta_group_info[i], 0, kmem_cache_size(cachep));
2204 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, 2240 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2205 &(meta_group_info[i]->bb_state)); 2241 &(meta_group_info[i]->bb_state));
2206 2242
@@ -2252,49 +2288,14 @@ static int ext4_mb_init_backend(struct super_block *sb)
2252 ext4_group_t ngroups = ext4_get_groups_count(sb); 2288 ext4_group_t ngroups = ext4_get_groups_count(sb);
2253 ext4_group_t i; 2289 ext4_group_t i;
2254 struct ext4_sb_info *sbi = EXT4_SB(sb); 2290 struct ext4_sb_info *sbi = EXT4_SB(sb);
2255 struct ext4_super_block *es = sbi->s_es; 2291 int err;
2256 int num_meta_group_infos;
2257 int num_meta_group_infos_max;
2258 int array_size;
2259 struct ext4_group_desc *desc; 2292 struct ext4_group_desc *desc;
2260 struct kmem_cache *cachep; 2293 struct kmem_cache *cachep;
2261 2294
2262 /* This is the number of blocks used by GDT */ 2295 err = ext4_mb_alloc_groupinfo(sb, ngroups);
2263 num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 2296 if (err)
2264 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); 2297 return err;
2265
2266 /*
2267 * This is the total number of blocks used by GDT including
2268 * the number of reserved blocks for GDT.
2269 * The s_group_info array is allocated with this value
2270 * to allow a clean online resize without a complex
2271 * manipulation of pointer.
2272 * The drawback is the unused memory when no resize
2273 * occurs but it's very low in terms of pages
2274 * (see comments below)
2275 * Need to handle this properly when META_BG resizing is allowed
2276 */
2277 num_meta_group_infos_max = num_meta_group_infos +
2278 le16_to_cpu(es->s_reserved_gdt_blocks);
2279 2298
2280 /*
2281 * array_size is the size of s_group_info array. We round it
2282 * to the next power of two because this approximation is done
2283 * internally by kmalloc so we can have some more memory
2284 * for free here (e.g. may be used for META_BG resize).
2285 */
2286 array_size = 1;
2287 while (array_size < sizeof(*sbi->s_group_info) *
2288 num_meta_group_infos_max)
2289 array_size = array_size << 1;
2290 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
2291 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
2292 * So a two level scheme suffices for now. */
2293 sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL);
2294 if (sbi->s_group_info == NULL) {
2295 ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
2296 return -ENOMEM;
2297 }
2298 sbi->s_buddy_cache = new_inode(sb); 2299 sbi->s_buddy_cache = new_inode(sb);
2299 if (sbi->s_buddy_cache == NULL) { 2300 if (sbi->s_buddy_cache == NULL) {
2300 ext4_msg(sb, KERN_ERR, "can't get new inode"); 2301 ext4_msg(sb, KERN_ERR, "can't get new inode");
@@ -2322,7 +2323,7 @@ err_freebuddy:
2322 cachep = get_groupinfo_cache(sb->s_blocksize_bits); 2323 cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2323 while (i-- > 0) 2324 while (i-- > 0)
2324 kmem_cache_free(cachep, ext4_get_group_info(sb, i)); 2325 kmem_cache_free(cachep, ext4_get_group_info(sb, i));
2325 i = num_meta_group_infos; 2326 i = sbi->s_group_info_size;
2326 while (i-- > 0) 2327 while (i-- > 0)
2327 kfree(sbi->s_group_info[i]); 2328 kfree(sbi->s_group_info[i]);
2328 iput(sbi->s_buddy_cache); 2329 iput(sbi->s_buddy_cache);
@@ -4008,7 +4009,6 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4008 ext4_get_group_no_and_offset(sb, goal, &group, &block); 4009 ext4_get_group_no_and_offset(sb, goal, &group, &block);
4009 4010
4010 /* set up allocation goals */ 4011 /* set up allocation goals */
4011 memset(ac, 0, sizeof(struct ext4_allocation_context));
4012 ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1); 4012 ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
4013 ac->ac_status = AC_STATUS_CONTINUE; 4013 ac->ac_status = AC_STATUS_CONTINUE;
4014 ac->ac_sb = sb; 4014 ac->ac_sb = sb;
@@ -4291,7 +4291,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4291 } 4291 }
4292 } 4292 }
4293 4293
4294 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4294 ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
4295 if (!ac) { 4295 if (!ac) {
4296 ar->len = 0; 4296 ar->len = 0;
4297 *errp = -ENOMEM; 4297 *errp = -ENOMEM;
@@ -4657,6 +4657,8 @@ do_more:
4657 * with group lock held. generate_buddy look at 4657 * with group lock held. generate_buddy look at
4658 * them with group lock_held 4658 * them with group lock_held
4659 */ 4659 */
4660 if (test_opt(sb, DISCARD))
4661 ext4_issue_discard(sb, block_group, bit, count);
4660 ext4_lock_group(sb, block_group); 4662 ext4_lock_group(sb, block_group);
4661 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); 4663 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4662 mb_free_blocks(inode, &e4b, bit, count_clusters); 4664 mb_free_blocks(inode, &e4b, bit, count_clusters);
@@ -4709,7 +4711,7 @@ error_return:
4709 * ext4_group_add_blocks() -- Add given blocks to an existing group 4711 * ext4_group_add_blocks() -- Add given blocks to an existing group
4710 * @handle: handle to this transaction 4712 * @handle: handle to this transaction
4711 * @sb: super block 4713 * @sb: super block
4712 * @block: start physcial block to add to the block group 4714 * @block: start physical block to add to the block group
4713 * @count: number of blocks to free 4715 * @count: number of blocks to free
4714 * 4716 *
4715 * This marks the blocks as free in the bitmap and buddy. 4717 * This marks the blocks as free in the bitmap and buddy.
@@ -4988,7 +4990,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4988 4990
4989 start = range->start >> sb->s_blocksize_bits; 4991 start = range->start >> sb->s_blocksize_bits;
4990 end = start + (range->len >> sb->s_blocksize_bits) - 1; 4992 end = start + (range->len >> sb->s_blocksize_bits) - 1;
4991 minlen = range->minlen >> sb->s_blocksize_bits; 4993 minlen = EXT4_NUM_B2C(EXT4_SB(sb),
4994 range->minlen >> sb->s_blocksize_bits);
4992 4995
4993 if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) || 4996 if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) ||
4994 unlikely(start >= max_blks)) 4997 unlikely(start >= max_blks))
@@ -5048,6 +5051,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
5048 atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); 5051 atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
5049 5052
5050out: 5053out:
5051 range->len = trimmed * sb->s_blocksize; 5054 range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
5052 return ret; 5055 return ret;
5053} 5056}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c070618c21ce..3ccd889ba953 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -65,11 +65,6 @@ extern u8 mb_enable_debug;
65#define MB_DEFAULT_MIN_TO_SCAN 10 65#define MB_DEFAULT_MIN_TO_SCAN 10
66 66
67/* 67/*
68 * How many groups mballoc will scan looking for the best chunk
69 */
70#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5
71
72/*
73 * with 'ext4_mb_stats' allocator will collect stats that will be 68 * with 'ext4_mb_stats' allocator will collect stats that will be
74 * shown at umount. The collecting costs though! 69 * shown at umount. The collecting costs though!
75 */ 70 */
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c5826c623e7a..292daeeed455 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -141,55 +141,21 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
141} 141}
142 142
143/** 143/**
144 * mext_check_null_inode - NULL check for two inodes
145 *
146 * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
147 */
148static int
149mext_check_null_inode(struct inode *inode1, struct inode *inode2,
150 const char *function, unsigned int line)
151{
152 int ret = 0;
153
154 if (inode1 == NULL) {
155 __ext4_error(inode2->i_sb, function, line,
156 "Both inodes should not be NULL: "
157 "inode1 NULL inode2 %lu", inode2->i_ino);
158 ret = -EIO;
159 } else if (inode2 == NULL) {
160 __ext4_error(inode1->i_sb, function, line,
161 "Both inodes should not be NULL: "
162 "inode1 %lu inode2 NULL", inode1->i_ino);
163 ret = -EIO;
164 }
165 return ret;
166}
167
168/**
169 * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem 144 * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
170 * 145 *
171 * @orig_inode: original inode structure 146 * Acquire write lock of i_data_sem of the two inodes
172 * @donor_inode: donor inode structure
173 * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
174 * i_ino order.
175 */ 147 */
176static void 148static void
177double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) 149double_down_write_data_sem(struct inode *first, struct inode *second)
178{ 150{
179 struct inode *first = orig_inode, *second = donor_inode; 151 if (first < second) {
152 down_write(&EXT4_I(first)->i_data_sem);
153 down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
154 } else {
155 down_write(&EXT4_I(second)->i_data_sem);
156 down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING);
180 157
181 /*
182 * Use the inode number to provide the stable locking order instead
183 * of its address, because the C language doesn't guarantee you can
184 * compare pointers that don't come from the same array.
185 */
186 if (donor_inode->i_ino < orig_inode->i_ino) {
187 first = donor_inode;
188 second = orig_inode;
189 } 158 }
190
191 down_write(&EXT4_I(first)->i_data_sem);
192 down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
193} 159}
194 160
195/** 161/**
@@ -604,9 +570,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
604 diff = donor_off - le32_to_cpu(tmp_dext->ee_block); 570 diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
605 571
606 ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff); 572 ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
607 tmp_dext->ee_block = 573 le32_add_cpu(&tmp_dext->ee_block, diff);
608 cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); 574 le16_add_cpu(&tmp_dext->ee_len, -diff);
609 tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
610 575
611 if (max_count < ext4_ext_get_actual_len(tmp_dext)) 576 if (max_count < ext4_ext_get_actual_len(tmp_dext))
612 tmp_dext->ee_len = cpu_to_le16(max_count); 577 tmp_dext->ee_len = cpu_to_le16(max_count);
@@ -629,6 +594,43 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
629} 594}
630 595
631/** 596/**
597 * mext_check_coverage - Check that all extents in range has the same type
598 *
599 * @inode: inode in question
600 * @from: block offset of inode
601 * @count: block count to be checked
602 * @uninit: extents expected to be uninitialized
603 * @err: pointer to save error value
604 *
605 * Return 1 if all extents in range has expected type, and zero otherwise.
606 */
607static int
608mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
609 int uninit, int *err)
610{
611 struct ext4_ext_path *path = NULL;
612 struct ext4_extent *ext;
613 ext4_lblk_t last = from + count;
614 while (from < last) {
615 *err = get_ext_path(inode, from, &path);
616 if (*err)
617 return 0;
618 ext = path[ext_depth(inode)].p_ext;
619 if (!ext) {
620 ext4_ext_drop_refs(path);
621 return 0;
622 }
623 if (uninit != ext4_ext_is_uninitialized(ext)) {
624 ext4_ext_drop_refs(path);
625 return 0;
626 }
627 from += ext4_ext_get_actual_len(ext);
628 ext4_ext_drop_refs(path);
629 }
630 return 1;
631}
632
633/**
632 * mext_replace_branches - Replace original extents with new extents 634 * mext_replace_branches - Replace original extents with new extents
633 * 635 *
634 * @handle: journal handle 636 * @handle: journal handle
@@ -663,9 +665,6 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
663 int replaced_count = 0; 665 int replaced_count = 0;
664 int dext_alen; 666 int dext_alen;
665 667
666 /* Protect extent trees against block allocations via delalloc */
667 double_down_write_data_sem(orig_inode, donor_inode);
668
669 /* Get the original extent for the block "orig_off" */ 668 /* Get the original extent for the block "orig_off" */
670 *err = get_ext_path(orig_inode, orig_off, &orig_path); 669 *err = get_ext_path(orig_inode, orig_off, &orig_path);
671 if (*err) 670 if (*err)
@@ -764,12 +763,122 @@ out:
764 ext4_ext_invalidate_cache(orig_inode); 763 ext4_ext_invalidate_cache(orig_inode);
765 ext4_ext_invalidate_cache(donor_inode); 764 ext4_ext_invalidate_cache(donor_inode);
766 765
767 double_up_write_data_sem(orig_inode, donor_inode);
768
769 return replaced_count; 766 return replaced_count;
770} 767}
771 768
772/** 769/**
770 * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
771 *
772 * @inode1: the inode structure
773 * @inode2: the inode structure
774 * @index: page index
775 * @page: result page vector
776 *
777 * Grab two locked pages for inode's by inode order
778 */
779static int
780mext_page_double_lock(struct inode *inode1, struct inode *inode2,
781 pgoff_t index, struct page *page[2])
782{
783 struct address_space *mapping[2];
784 unsigned fl = AOP_FLAG_NOFS;
785
786 BUG_ON(!inode1 || !inode2);
787 if (inode1 < inode2) {
788 mapping[0] = inode1->i_mapping;
789 mapping[1] = inode2->i_mapping;
790 } else {
791 mapping[0] = inode2->i_mapping;
792 mapping[1] = inode1->i_mapping;
793 }
794
795 page[0] = grab_cache_page_write_begin(mapping[0], index, fl);
796 if (!page[0])
797 return -ENOMEM;
798
799 page[1] = grab_cache_page_write_begin(mapping[1], index, fl);
800 if (!page[1]) {
801 unlock_page(page[0]);
802 page_cache_release(page[0]);
803 return -ENOMEM;
804 }
805
806 if (inode1 > inode2) {
807 struct page *tmp;
808 tmp = page[0];
809 page[0] = page[1];
810 page[1] = tmp;
811 }
812 return 0;
813}
814
815/* Force page buffers uptodate w/o dropping page's lock */
816static int
817mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
818{
819 struct inode *inode = page->mapping->host;
820 sector_t block;
821 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
822 unsigned int blocksize, block_start, block_end;
823 int i, err, nr = 0, partial = 0;
824 BUG_ON(!PageLocked(page));
825 BUG_ON(PageWriteback(page));
826
827 if (PageUptodate(page))
828 return 0;
829
830 blocksize = 1 << inode->i_blkbits;
831 if (!page_has_buffers(page))
832 create_empty_buffers(page, blocksize, 0);
833
834 head = page_buffers(page);
835 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
836 for (bh = head, block_start = 0; bh != head || !block_start;
837 block++, block_start = block_end, bh = bh->b_this_page) {
838 block_end = block_start + blocksize;
839 if (block_end <= from || block_start >= to) {
840 if (!buffer_uptodate(bh))
841 partial = 1;
842 continue;
843 }
844 if (buffer_uptodate(bh))
845 continue;
846 if (!buffer_mapped(bh)) {
847 int err = 0;
848 err = ext4_get_block(inode, block, bh, 0);
849 if (err) {
850 SetPageError(page);
851 return err;
852 }
853 if (!buffer_mapped(bh)) {
854 zero_user(page, block_start, blocksize);
855 if (!err)
856 set_buffer_uptodate(bh);
857 continue;
858 }
859 }
860 BUG_ON(nr >= MAX_BUF_PER_PAGE);
861 arr[nr++] = bh;
862 }
863 /* No io required */
864 if (!nr)
865 goto out;
866
867 for (i = 0; i < nr; i++) {
868 bh = arr[i];
869 if (!bh_uptodate_or_lock(bh)) {
870 err = bh_submit_read(bh);
871 if (err)
872 return err;
873 }
874 }
875out:
876 if (!partial)
877 SetPageUptodate(page);
878 return 0;
879}
880
881/**
773 * move_extent_per_page - Move extent data per page 882 * move_extent_per_page - Move extent data per page
774 * 883 *
775 * @o_filp: file structure of original file 884 * @o_filp: file structure of original file
@@ -791,26 +900,24 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
791 int block_len_in_page, int uninit, int *err) 900 int block_len_in_page, int uninit, int *err)
792{ 901{
793 struct inode *orig_inode = o_filp->f_dentry->d_inode; 902 struct inode *orig_inode = o_filp->f_dentry->d_inode;
794 struct address_space *mapping = orig_inode->i_mapping; 903 struct page *pagep[2] = {NULL, NULL};
795 struct buffer_head *bh;
796 struct page *page = NULL;
797 const struct address_space_operations *a_ops = mapping->a_ops;
798 handle_t *handle; 904 handle_t *handle;
799 ext4_lblk_t orig_blk_offset; 905 ext4_lblk_t orig_blk_offset;
800 long long offs = orig_page_offset << PAGE_CACHE_SHIFT; 906 long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
801 unsigned long blocksize = orig_inode->i_sb->s_blocksize; 907 unsigned long blocksize = orig_inode->i_sb->s_blocksize;
802 unsigned int w_flags = 0; 908 unsigned int w_flags = 0;
803 unsigned int tmp_data_size, data_size, replaced_size; 909 unsigned int tmp_data_size, data_size, replaced_size;
804 void *fsdata; 910 int err2, jblocks, retries = 0;
805 int i, jblocks;
806 int err2 = 0;
807 int replaced_count = 0; 911 int replaced_count = 0;
912 int from = data_offset_in_page << orig_inode->i_blkbits;
808 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 913 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
809 914
810 /* 915 /*
811 * It needs twice the amount of ordinary journal buffers because 916 * It needs twice the amount of ordinary journal buffers because
812 * inode and donor_inode may change each different metadata blocks. 917 * inode and donor_inode may change each different metadata blocks.
813 */ 918 */
919again:
920 *err = 0;
814 jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; 921 jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
815 handle = ext4_journal_start(orig_inode, jblocks); 922 handle = ext4_journal_start(orig_inode, jblocks);
816 if (IS_ERR(handle)) { 923 if (IS_ERR(handle)) {
@@ -824,19 +931,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
824 orig_blk_offset = orig_page_offset * blocks_per_page + 931 orig_blk_offset = orig_page_offset * blocks_per_page +
825 data_offset_in_page; 932 data_offset_in_page;
826 933
827 /*
828 * If orig extent is uninitialized one,
829 * it's not necessary force the page into memory
830 * and then force it to be written out again.
831 * Just swap data blocks between orig and donor.
832 */
833 if (uninit) {
834 replaced_count = mext_replace_branches(handle, orig_inode,
835 donor_inode, orig_blk_offset,
836 block_len_in_page, err);
837 goto out2;
838 }
839
840 offs = (long long)orig_blk_offset << orig_inode->i_blkbits; 934 offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
841 935
842 /* Calculate data_size */ 936 /* Calculate data_size */
@@ -858,75 +952,120 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
858 952
859 replaced_size = data_size; 953 replaced_size = data_size;
860 954
861 *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, 955 *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset,
862 &page, &fsdata); 956 pagep);
863 if (unlikely(*err < 0)) 957 if (unlikely(*err < 0))
864 goto out; 958 goto stop_journal;
865
866 if (!PageUptodate(page)) {
867 mapping->a_ops->readpage(o_filp, page);
868 lock_page(page);
869 }
870
871 /* 959 /*
872 * try_to_release_page() doesn't call releasepage in writeback mode. 960 * If orig extent was uninitialized it can become initialized
873 * We should care about the order of writing to the same file 961 * at any time after i_data_sem was dropped, in order to
874 * by multiple move extent processes. 962 * serialize with delalloc we have recheck extent while we
875 * It needs to call wait_on_page_writeback() to wait for the 963 * hold page's lock, if it is still the case data copy is not
876 * writeback of the page. 964 * necessary, just swap data blocks between orig and donor.
877 */ 965 */
878 wait_on_page_writeback(page); 966 if (uninit) {
967 double_down_write_data_sem(orig_inode, donor_inode);
968 /* If any of extents in range became initialized we have to
969 * fallback to data copying */
970 uninit = mext_check_coverage(orig_inode, orig_blk_offset,
971 block_len_in_page, 1, err);
972 if (*err)
973 goto drop_data_sem;
879 974
880 /* Release old bh and drop refs */ 975 uninit &= mext_check_coverage(donor_inode, orig_blk_offset,
881 try_to_release_page(page, 0); 976 block_len_in_page, 1, err);
977 if (*err)
978 goto drop_data_sem;
979
980 if (!uninit) {
981 double_up_write_data_sem(orig_inode, donor_inode);
982 goto data_copy;
983 }
984 if ((page_has_private(pagep[0]) &&
985 !try_to_release_page(pagep[0], 0)) ||
986 (page_has_private(pagep[1]) &&
987 !try_to_release_page(pagep[1], 0))) {
988 *err = -EBUSY;
989 goto drop_data_sem;
990 }
991 replaced_count = mext_replace_branches(handle, orig_inode,
992 donor_inode, orig_blk_offset,
993 block_len_in_page, err);
994 drop_data_sem:
995 double_up_write_data_sem(orig_inode, donor_inode);
996 goto unlock_pages;
997 }
998data_copy:
999 *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size);
1000 if (*err)
1001 goto unlock_pages;
1002
1003 /* At this point all buffers in range are uptodate, old mapping layout
1004 * is no longer required, try to drop it now. */
1005 if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) ||
1006 (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) {
1007 *err = -EBUSY;
1008 goto unlock_pages;
1009 }
882 1010
883 replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, 1011 replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
884 orig_blk_offset, block_len_in_page, 1012 orig_blk_offset,
885 &err2); 1013 block_len_in_page, err);
886 if (err2) { 1014 if (*err) {
887 if (replaced_count) { 1015 if (replaced_count) {
888 block_len_in_page = replaced_count; 1016 block_len_in_page = replaced_count;
889 replaced_size = 1017 replaced_size =
890 block_len_in_page << orig_inode->i_blkbits; 1018 block_len_in_page << orig_inode->i_blkbits;
891 } else 1019 } else
892 goto out; 1020 goto unlock_pages;
893 } 1021 }
1022 /* Perform all necessary steps similar write_begin()/write_end()
1023 * but keeping in mind that i_size will not change */
1024 *err = __block_write_begin(pagep[0], from, from + replaced_size,
1025 ext4_get_block);
1026 if (!*err)
1027 *err = block_commit_write(pagep[0], from, from + replaced_size);
894 1028
895 if (!page_has_buffers(page)) 1029 if (unlikely(*err < 0))
896 create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); 1030 goto repair_branches;
897 1031
898 bh = page_buffers(page); 1032 /* Even in case of data=writeback it is reasonable to pin
899 for (i = 0; i < data_offset_in_page; i++) 1033 * inode to transaction, to prevent unexpected data loss */
900 bh = bh->b_this_page; 1034 *err = ext4_jbd2_file_inode(handle, orig_inode);
901 1035
902 for (i = 0; i < block_len_in_page; i++) { 1036unlock_pages:
903 *err = ext4_get_block(orig_inode, 1037 unlock_page(pagep[0]);
904 (sector_t)(orig_blk_offset + i), bh, 0); 1038 page_cache_release(pagep[0]);
905 if (*err < 0) 1039 unlock_page(pagep[1]);
906 goto out; 1040 page_cache_release(pagep[1]);
907 1041stop_journal:
908 if (bh->b_this_page != NULL)
909 bh = bh->b_this_page;
910 }
911
912 *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
913 page, fsdata);
914 page = NULL;
915
916out:
917 if (unlikely(page)) {
918 if (PageLocked(page))
919 unlock_page(page);
920 page_cache_release(page);
921 ext4_journal_stop(handle);
922 }
923out2:
924 ext4_journal_stop(handle); 1042 ext4_journal_stop(handle);
925 1043 /* Buffer was busy because probably is pinned to journal transaction,
926 if (err2) 1044 * force transaction commit may help to free it. */
927 *err = err2; 1045 if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb,
928 1046 &retries))
1047 goto again;
929 return replaced_count; 1048 return replaced_count;
1049
1050repair_branches:
1051 /*
1052 * This should never ever happen!
1053 * Extents are swapped already, but we are not able to copy data.
1054 * Try to swap extents to it's original places
1055 */
1056 double_down_write_data_sem(orig_inode, donor_inode);
1057 replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
1058 orig_blk_offset,
1059 block_len_in_page, &err2);
1060 double_up_write_data_sem(orig_inode, donor_inode);
1061 if (replaced_count != block_len_in_page) {
1062 EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
1063 "Unable to copy data block,"
1064 " data will be lost.");
1065 *err = -EIO;
1066 }
1067 replaced_count = 0;
1068 goto unlock_pages;
930} 1069}
931 1070
932/** 1071/**
@@ -969,14 +1108,6 @@ mext_check_arguments(struct inode *orig_inode,
969 return -EINVAL; 1108 return -EINVAL;
970 } 1109 }
971 1110
972 /* Files should be in the same ext4 FS */
973 if (orig_inode->i_sb != donor_inode->i_sb) {
974 ext4_debug("ext4 move extent: The argument files "
975 "should be in same FS [ino:orig %lu, donor %lu]\n",
976 orig_inode->i_ino, donor_inode->i_ino);
977 return -EINVAL;
978 }
979
980 /* Ext4 move extent supports only extent based file */ 1111 /* Ext4 move extent supports only extent based file */
981 if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { 1112 if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
982 ext4_debug("ext4 move extent: orig file is not extents " 1113 ext4_debug("ext4 move extent: orig file is not extents "
@@ -1002,7 +1133,6 @@ mext_check_arguments(struct inode *orig_inode,
1002 } 1133 }
1003 1134
1004 if ((orig_start >= EXT_MAX_BLOCKS) || 1135 if ((orig_start >= EXT_MAX_BLOCKS) ||
1005 (donor_start >= EXT_MAX_BLOCKS) ||
1006 (*len > EXT_MAX_BLOCKS) || 1136 (*len > EXT_MAX_BLOCKS) ||
1007 (orig_start + *len >= EXT_MAX_BLOCKS)) { 1137 (orig_start + *len >= EXT_MAX_BLOCKS)) {
1008 ext4_debug("ext4 move extent: Can't handle over [%u] blocks " 1138 ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
@@ -1072,35 +1202,19 @@ mext_check_arguments(struct inode *orig_inode,
1072 * @inode1: the inode structure 1202 * @inode1: the inode structure
1073 * @inode2: the inode structure 1203 * @inode2: the inode structure
1074 * 1204 *
1075 * Lock two inodes' i_mutex by i_ino order. 1205 * Lock two inodes' i_mutex
1076 * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
1077 */ 1206 */
1078static int 1207static void
1079mext_inode_double_lock(struct inode *inode1, struct inode *inode2) 1208mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1080{ 1209{
1081 int ret = 0; 1210 BUG_ON(inode1 == inode2);
1082 1211 if (inode1 < inode2) {
1083 BUG_ON(inode1 == NULL && inode2 == NULL);
1084
1085 ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
1086 if (ret < 0)
1087 goto out;
1088
1089 if (inode1 == inode2) {
1090 mutex_lock(&inode1->i_mutex);
1091 goto out;
1092 }
1093
1094 if (inode1->i_ino < inode2->i_ino) {
1095 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); 1212 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
1096 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); 1213 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
1097 } else { 1214 } else {
1098 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); 1215 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
1099 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); 1216 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
1100 } 1217 }
1101
1102out:
1103 return ret;
1104} 1218}
1105 1219
1106/** 1220/**
@@ -1109,28 +1223,13 @@ out:
1109 * @inode1: the inode that is released first 1223 * @inode1: the inode that is released first
1110 * @inode2: the inode that is released second 1224 * @inode2: the inode that is released second
1111 * 1225 *
1112 * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
1113 */ 1226 */
1114 1227
1115static int 1228static void
1116mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) 1229mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
1117{ 1230{
1118 int ret = 0; 1231 mutex_unlock(&inode1->i_mutex);
1119 1232 mutex_unlock(&inode2->i_mutex);
1120 BUG_ON(inode1 == NULL && inode2 == NULL);
1121
1122 ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
1123 if (ret < 0)
1124 goto out;
1125
1126 if (inode1)
1127 mutex_unlock(&inode1->i_mutex);
1128
1129 if (inode2 && inode2 != inode1)
1130 mutex_unlock(&inode2->i_mutex);
1131
1132out:
1133 return ret;
1134} 1233}
1135 1234
1136/** 1235/**
@@ -1187,16 +1286,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1187 ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; 1286 ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
1188 ext4_lblk_t rest_blocks; 1287 ext4_lblk_t rest_blocks;
1189 pgoff_t orig_page_offset = 0, seq_end_page; 1288 pgoff_t orig_page_offset = 0, seq_end_page;
1190 int ret1, ret2, depth, last_extent = 0; 1289 int ret, depth, last_extent = 0;
1191 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 1290 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
1192 int data_offset_in_page; 1291 int data_offset_in_page;
1193 int block_len_in_page; 1292 int block_len_in_page;
1194 int uninit; 1293 int uninit;
1195 1294
1196 /* orig and donor should be different file */ 1295 if (orig_inode->i_sb != donor_inode->i_sb) {
1197 if (orig_inode->i_ino == donor_inode->i_ino) { 1296 ext4_debug("ext4 move extent: The argument files "
1297 "should be in same FS [ino:orig %lu, donor %lu]\n",
1298 orig_inode->i_ino, donor_inode->i_ino);
1299 return -EINVAL;
1300 }
1301
1302 /* orig and donor should be different inodes */
1303 if (orig_inode == donor_inode) {
1198 ext4_debug("ext4 move extent: The argument files should not " 1304 ext4_debug("ext4 move extent: The argument files should not "
1199 "be same file [ino:orig %lu, donor %lu]\n", 1305 "be same inode [ino:orig %lu, donor %lu]\n",
1200 orig_inode->i_ino, donor_inode->i_ino); 1306 orig_inode->i_ino, donor_inode->i_ino);
1201 return -EINVAL; 1307 return -EINVAL;
1202 } 1308 }
@@ -1208,18 +1314,27 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1208 orig_inode->i_ino, donor_inode->i_ino); 1314 orig_inode->i_ino, donor_inode->i_ino);
1209 return -EINVAL; 1315 return -EINVAL;
1210 } 1316 }
1211 1317 /* TODO: This is non obvious task to swap blocks for inodes with full
1318 jornaling enabled */
1319 if (ext4_should_journal_data(orig_inode) ||
1320 ext4_should_journal_data(donor_inode)) {
1321 return -EINVAL;
1322 }
1212 /* Protect orig and donor inodes against a truncate */ 1323 /* Protect orig and donor inodes against a truncate */
1213 ret1 = mext_inode_double_lock(orig_inode, donor_inode); 1324 mext_inode_double_lock(orig_inode, donor_inode);
1214 if (ret1 < 0) 1325
1215 return ret1; 1326 /* Wait for all existing dio workers */
1327 ext4_inode_block_unlocked_dio(orig_inode);
1328 ext4_inode_block_unlocked_dio(donor_inode);
1329 inode_dio_wait(orig_inode);
1330 inode_dio_wait(donor_inode);
1216 1331
1217 /* Protect extent tree against block allocations via delalloc */ 1332 /* Protect extent tree against block allocations via delalloc */
1218 double_down_write_data_sem(orig_inode, donor_inode); 1333 double_down_write_data_sem(orig_inode, donor_inode);
1219 /* Check the filesystem environment whether move_extent can be done */ 1334 /* Check the filesystem environment whether move_extent can be done */
1220 ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, 1335 ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
1221 donor_start, &len); 1336 donor_start, &len);
1222 if (ret1) 1337 if (ret)
1223 goto out; 1338 goto out;
1224 1339
1225 file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; 1340 file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
@@ -1227,13 +1342,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1227 if (file_end < block_end) 1342 if (file_end < block_end)
1228 len -= block_end - file_end; 1343 len -= block_end - file_end;
1229 1344
1230 ret1 = get_ext_path(orig_inode, block_start, &orig_path); 1345 ret = get_ext_path(orig_inode, block_start, &orig_path);
1231 if (ret1) 1346 if (ret)
1232 goto out; 1347 goto out;
1233 1348
1234 /* Get path structure to check the hole */ 1349 /* Get path structure to check the hole */
1235 ret1 = get_ext_path(orig_inode, block_start, &holecheck_path); 1350 ret = get_ext_path(orig_inode, block_start, &holecheck_path);
1236 if (ret1) 1351 if (ret)
1237 goto out; 1352 goto out;
1238 1353
1239 depth = ext_depth(orig_inode); 1354 depth = ext_depth(orig_inode);
@@ -1252,13 +1367,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1252 last_extent = mext_next_extent(orig_inode, 1367 last_extent = mext_next_extent(orig_inode,
1253 holecheck_path, &ext_cur); 1368 holecheck_path, &ext_cur);
1254 if (last_extent < 0) { 1369 if (last_extent < 0) {
1255 ret1 = last_extent; 1370 ret = last_extent;
1256 goto out; 1371 goto out;
1257 } 1372 }
1258 last_extent = mext_next_extent(orig_inode, orig_path, 1373 last_extent = mext_next_extent(orig_inode, orig_path,
1259 &ext_dummy); 1374 &ext_dummy);
1260 if (last_extent < 0) { 1375 if (last_extent < 0) {
1261 ret1 = last_extent; 1376 ret = last_extent;
1262 goto out; 1377 goto out;
1263 } 1378 }
1264 seq_start = le32_to_cpu(ext_cur->ee_block); 1379 seq_start = le32_to_cpu(ext_cur->ee_block);
@@ -1272,7 +1387,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1272 if (le32_to_cpu(ext_cur->ee_block) > block_end) { 1387 if (le32_to_cpu(ext_cur->ee_block) > block_end) {
1273 ext4_debug("ext4 move extent: The specified range of file " 1388 ext4_debug("ext4 move extent: The specified range of file "
1274 "may be the hole\n"); 1389 "may be the hole\n");
1275 ret1 = -EINVAL; 1390 ret = -EINVAL;
1276 goto out; 1391 goto out;
1277 } 1392 }
1278 1393
@@ -1292,7 +1407,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1292 last_extent = mext_next_extent(orig_inode, holecheck_path, 1407 last_extent = mext_next_extent(orig_inode, holecheck_path,
1293 &ext_cur); 1408 &ext_cur);
1294 if (last_extent < 0) { 1409 if (last_extent < 0) {
1295 ret1 = last_extent; 1410 ret = last_extent;
1296 break; 1411 break;
1297 } 1412 }
1298 add_blocks = ext4_ext_get_actual_len(ext_cur); 1413 add_blocks = ext4_ext_get_actual_len(ext_cur);
@@ -1349,18 +1464,18 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1349 orig_page_offset, 1464 orig_page_offset,
1350 data_offset_in_page, 1465 data_offset_in_page,
1351 block_len_in_page, uninit, 1466 block_len_in_page, uninit,
1352 &ret1); 1467 &ret);
1353 1468
1354 /* Count how many blocks we have exchanged */ 1469 /* Count how many blocks we have exchanged */
1355 *moved_len += block_len_in_page; 1470 *moved_len += block_len_in_page;
1356 if (ret1 < 0) 1471 if (ret < 0)
1357 break; 1472 break;
1358 if (*moved_len > len) { 1473 if (*moved_len > len) {
1359 EXT4_ERROR_INODE(orig_inode, 1474 EXT4_ERROR_INODE(orig_inode,
1360 "We replaced blocks too much! " 1475 "We replaced blocks too much! "
1361 "sum of replaced: %llu requested: %llu", 1476 "sum of replaced: %llu requested: %llu",
1362 *moved_len, len); 1477 *moved_len, len);
1363 ret1 = -EIO; 1478 ret = -EIO;
1364 break; 1479 break;
1365 } 1480 }
1366 1481
@@ -1374,22 +1489,22 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1374 } 1489 }
1375 1490
1376 double_down_write_data_sem(orig_inode, donor_inode); 1491 double_down_write_data_sem(orig_inode, donor_inode);
1377 if (ret1 < 0) 1492 if (ret < 0)
1378 break; 1493 break;
1379 1494
1380 /* Decrease buffer counter */ 1495 /* Decrease buffer counter */
1381 if (holecheck_path) 1496 if (holecheck_path)
1382 ext4_ext_drop_refs(holecheck_path); 1497 ext4_ext_drop_refs(holecheck_path);
1383 ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path); 1498 ret = get_ext_path(orig_inode, seq_start, &holecheck_path);
1384 if (ret1) 1499 if (ret)
1385 break; 1500 break;
1386 depth = holecheck_path->p_depth; 1501 depth = holecheck_path->p_depth;
1387 1502
1388 /* Decrease buffer counter */ 1503 /* Decrease buffer counter */
1389 if (orig_path) 1504 if (orig_path)
1390 ext4_ext_drop_refs(orig_path); 1505 ext4_ext_drop_refs(orig_path);
1391 ret1 = get_ext_path(orig_inode, seq_start, &orig_path); 1506 ret = get_ext_path(orig_inode, seq_start, &orig_path);
1392 if (ret1) 1507 if (ret)
1393 break; 1508 break;
1394 1509
1395 ext_cur = holecheck_path[depth].p_ext; 1510 ext_cur = holecheck_path[depth].p_ext;
@@ -1412,12 +1527,9 @@ out:
1412 kfree(holecheck_path); 1527 kfree(holecheck_path);
1413 } 1528 }
1414 double_up_write_data_sem(orig_inode, donor_inode); 1529 double_up_write_data_sem(orig_inode, donor_inode);
1415 ret2 = mext_inode_double_unlock(orig_inode, donor_inode); 1530 ext4_inode_resume_unlocked_dio(orig_inode);
1416 1531 ext4_inode_resume_unlocked_dio(donor_inode);
1417 if (ret1) 1532 mext_inode_double_unlock(orig_inode, donor_inode);
1418 return ret1;
1419 else if (ret2)
1420 return ret2;
1421 1533
1422 return 0; 1534 return ret;
1423} 1535}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2a42cc04466f..6d600a69fc9d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -55,6 +55,13 @@ static struct buffer_head *ext4_append(handle_t *handle,
55{ 55{
56 struct buffer_head *bh; 56 struct buffer_head *bh;
57 57
58 if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
59 ((inode->i_size >> 10) >=
60 EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) {
61 *err = -ENOSPC;
62 return NULL;
63 }
64
58 *block = inode->i_size >> inode->i_sb->s_blocksize_bits; 65 *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
59 66
60 bh = ext4_bread(handle, inode, *block, 1, err); 67 bh = ext4_bread(handle, inode, *block, 1, err);
@@ -67,6 +74,12 @@ static struct buffer_head *ext4_append(handle_t *handle,
67 bh = NULL; 74 bh = NULL;
68 } 75 }
69 } 76 }
77 if (!bh && !(*err)) {
78 *err = -EIO;
79 ext4_error(inode->i_sb,
80 "Directory hole detected on inode %lu\n",
81 inode->i_ino);
82 }
70 return bh; 83 return bh;
71} 84}
72 85
@@ -594,8 +607,11 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
594 u32 hash; 607 u32 hash;
595 608
596 frame->bh = NULL; 609 frame->bh = NULL;
597 if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) 610 if (!(bh = ext4_bread(NULL, dir, 0, 0, err))) {
611 if (*err == 0)
612 *err = ERR_BAD_DX_DIR;
598 goto fail; 613 goto fail;
614 }
599 root = (struct dx_root *) bh->b_data; 615 root = (struct dx_root *) bh->b_data;
600 if (root->info.hash_version != DX_HASH_TEA && 616 if (root->info.hash_version != DX_HASH_TEA &&
601 root->info.hash_version != DX_HASH_HALF_MD4 && 617 root->info.hash_version != DX_HASH_HALF_MD4 &&
@@ -696,8 +712,11 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
696 frame->entries = entries; 712 frame->entries = entries;
697 frame->at = at; 713 frame->at = at;
698 if (!indirect--) return frame; 714 if (!indirect--) return frame;
699 if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err))) 715 if (!(bh = ext4_bread(NULL, dir, dx_get_block(at), 0, err))) {
716 if (!(*err))
717 *err = ERR_BAD_DX_DIR;
700 goto fail2; 718 goto fail2;
719 }
701 at = entries = ((struct dx_node *) bh->b_data)->entries; 720 at = entries = ((struct dx_node *) bh->b_data)->entries;
702 721
703 if (!buffer_verified(bh) && 722 if (!buffer_verified(bh) &&
@@ -807,8 +826,15 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
807 */ 826 */
808 while (num_frames--) { 827 while (num_frames--) {
809 if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), 828 if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
810 0, &err))) 829 0, &err))) {
830 if (!err) {
831 ext4_error(dir->i_sb,
832 "Directory hole detected on inode %lu\n",
833 dir->i_ino);
834 return -EIO;
835 }
811 return err; /* Failure */ 836 return err; /* Failure */
837 }
812 838
813 if (!buffer_verified(bh) && 839 if (!buffer_verified(bh) &&
814 !ext4_dx_csum_verify(dir, 840 !ext4_dx_csum_verify(dir,
@@ -839,12 +865,19 @@ static int htree_dirblock_to_tree(struct file *dir_file,
839{ 865{
840 struct buffer_head *bh; 866 struct buffer_head *bh;
841 struct ext4_dir_entry_2 *de, *top; 867 struct ext4_dir_entry_2 *de, *top;
842 int err, count = 0; 868 int err = 0, count = 0;
843 869
844 dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", 870 dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
845 (unsigned long)block)); 871 (unsigned long)block));
846 if (!(bh = ext4_bread (NULL, dir, block, 0, &err))) 872 if (!(bh = ext4_bread(NULL, dir, block, 0, &err))) {
873 if (!err) {
874 err = -EIO;
875 ext4_error(dir->i_sb,
876 "Directory hole detected on inode %lu\n",
877 dir->i_ino);
878 }
847 return err; 879 return err;
880 }
848 881
849 if (!buffer_verified(bh) && 882 if (!buffer_verified(bh) &&
850 !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) 883 !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
@@ -1267,8 +1300,15 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
1267 return NULL; 1300 return NULL;
1268 do { 1301 do {
1269 block = dx_get_block(frame->at); 1302 block = dx_get_block(frame->at);
1270 if (!(bh = ext4_bread(NULL, dir, block, 0, err))) 1303 if (!(bh = ext4_bread(NULL, dir, block, 0, err))) {
1304 if (!(*err)) {
1305 *err = -EIO;
1306 ext4_error(dir->i_sb,
1307 "Directory hole detected on inode %lu\n",
1308 dir->i_ino);
1309 }
1271 goto errout; 1310 goto errout;
1311 }
1272 1312
1273 if (!buffer_verified(bh) && 1313 if (!buffer_verified(bh) &&
1274 !ext4_dirent_csum_verify(dir, 1314 !ext4_dirent_csum_verify(dir,
@@ -1801,9 +1841,15 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1801 } 1841 }
1802 blocks = dir->i_size >> sb->s_blocksize_bits; 1842 blocks = dir->i_size >> sb->s_blocksize_bits;
1803 for (block = 0; block < blocks; block++) { 1843 for (block = 0; block < blocks; block++) {
1804 bh = ext4_bread(handle, dir, block, 0, &retval); 1844 if (!(bh = ext4_bread(handle, dir, block, 0, &retval))) {
1805 if(!bh) 1845 if (!retval) {
1846 retval = -EIO;
1847 ext4_error(inode->i_sb,
1848 "Directory hole detected on inode %lu\n",
1849 inode->i_ino);
1850 }
1806 return retval; 1851 return retval;
1852 }
1807 if (!buffer_verified(bh) && 1853 if (!buffer_verified(bh) &&
1808 !ext4_dirent_csum_verify(dir, 1854 !ext4_dirent_csum_verify(dir,
1809 (struct ext4_dir_entry *)bh->b_data)) 1855 (struct ext4_dir_entry *)bh->b_data))
@@ -1860,8 +1906,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1860 entries = frame->entries; 1906 entries = frame->entries;
1861 at = frame->at; 1907 at = frame->at;
1862 1908
1863 if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) 1909 if (!(bh = ext4_bread(handle, dir, dx_get_block(frame->at), 0, &err))) {
1910 if (!err) {
1911 err = -EIO;
1912 ext4_error(dir->i_sb,
1913 "Directory hole detected on inode %lu\n",
1914 dir->i_ino);
1915 }
1864 goto cleanup; 1916 goto cleanup;
1917 }
1865 1918
1866 if (!buffer_verified(bh) && 1919 if (!buffer_verified(bh) &&
1867 !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) 1920 !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
@@ -2149,9 +2202,7 @@ retry:
2149 err = PTR_ERR(inode); 2202 err = PTR_ERR(inode);
2150 if (!IS_ERR(inode)) { 2203 if (!IS_ERR(inode)) {
2151 init_special_inode(inode, inode->i_mode, rdev); 2204 init_special_inode(inode, inode->i_mode, rdev);
2152#ifdef CONFIG_EXT4_FS_XATTR
2153 inode->i_op = &ext4_special_inode_operations; 2205 inode->i_op = &ext4_special_inode_operations;
2154#endif
2155 err = ext4_add_nondir(handle, dentry, inode); 2206 err = ext4_add_nondir(handle, dentry, inode);
2156 } 2207 }
2157 ext4_journal_stop(handle); 2208 ext4_journal_stop(handle);
@@ -2199,9 +2250,15 @@ retry:
2199 inode->i_op = &ext4_dir_inode_operations; 2250 inode->i_op = &ext4_dir_inode_operations;
2200 inode->i_fop = &ext4_dir_operations; 2251 inode->i_fop = &ext4_dir_operations;
2201 inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; 2252 inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
2202 dir_block = ext4_bread(handle, inode, 0, 1, &err); 2253 if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) {
2203 if (!dir_block) 2254 if (!err) {
2255 err = -EIO;
2256 ext4_error(inode->i_sb,
2257 "Directory hole detected on inode %lu\n",
2258 inode->i_ino);
2259 }
2204 goto out_clear_inode; 2260 goto out_clear_inode;
2261 }
2205 BUFFER_TRACE(dir_block, "get_write_access"); 2262 BUFFER_TRACE(dir_block, "get_write_access");
2206 err = ext4_journal_get_write_access(handle, dir_block); 2263 err = ext4_journal_get_write_access(handle, dir_block);
2207 if (err) 2264 if (err)
@@ -2318,6 +2375,11 @@ static int empty_dir(struct inode *inode)
2318 EXT4_ERROR_INODE(inode, 2375 EXT4_ERROR_INODE(inode,
2319 "error %d reading directory " 2376 "error %d reading directory "
2320 "lblock %u", err, lblock); 2377 "lblock %u", err, lblock);
2378 else
2379 ext4_warning(inode->i_sb,
2380 "bad directory (dir #%lu) - no data block",
2381 inode->i_ino);
2382
2321 offset += sb->s_blocksize; 2383 offset += sb->s_blocksize;
2322 continue; 2384 continue;
2323 } 2385 }
@@ -2362,7 +2424,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
2362 struct ext4_iloc iloc; 2424 struct ext4_iloc iloc;
2363 int err = 0, rc; 2425 int err = 0, rc;
2364 2426
2365 if (!ext4_handle_valid(handle)) 2427 if (!EXT4_SB(sb)->s_journal)
2366 return 0; 2428 return 0;
2367 2429
2368 mutex_lock(&EXT4_SB(sb)->s_orphan_lock); 2430 mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
@@ -2436,8 +2498,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2436 struct ext4_iloc iloc; 2498 struct ext4_iloc iloc;
2437 int err = 0; 2499 int err = 0;
2438 2500
2439 /* ext4_handle_valid() assumes a valid handle_t pointer */ 2501 if (!EXT4_SB(inode->i_sb)->s_journal)
2440 if (handle && !ext4_handle_valid(handle))
2441 return 0; 2502 return 0;
2442 2503
2443 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); 2504 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
@@ -2456,7 +2517,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2456 * transaction handle with which to update the orphan list on 2517 * transaction handle with which to update the orphan list on
2457 * disk, but we still need to remove the inode from the linked 2518 * disk, but we still need to remove the inode from the linked
2458 * list in memory. */ 2519 * list in memory. */
2459 if (sbi->s_journal && !handle) 2520 if (!handle)
2460 goto out; 2521 goto out;
2461 2522
2462 err = ext4_reserve_inode_write(handle, inode, &iloc); 2523 err = ext4_reserve_inode_write(handle, inode, &iloc);
@@ -2826,9 +2887,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2826 goto end_rename; 2887 goto end_rename;
2827 } 2888 }
2828 retval = -EIO; 2889 retval = -EIO;
2829 dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); 2890 if (!(dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval))) {
2830 if (!dir_bh) 2891 if (!retval) {
2892 retval = -EIO;
2893 ext4_error(old_inode->i_sb,
2894 "Directory hole detected on inode %lu\n",
2895 old_inode->i_ino);
2896 }
2831 goto end_rename; 2897 goto end_rename;
2898 }
2832 if (!buffer_verified(dir_bh) && 2899 if (!buffer_verified(dir_bh) &&
2833 !ext4_dirent_csum_verify(old_inode, 2900 !ext4_dirent_csum_verify(old_inode,
2834 (struct ext4_dir_entry *)dir_bh->b_data)) 2901 (struct ext4_dir_entry *)dir_bh->b_data))
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index dcdeef169a69..68e896e12a67 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -71,6 +71,9 @@ void ext4_free_io_end(ext4_io_end_t *io)
71 int i; 71 int i;
72 72
73 BUG_ON(!io); 73 BUG_ON(!io);
74 BUG_ON(!list_empty(&io->list));
75 BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
76
74 if (io->page) 77 if (io->page)
75 put_page(io->page); 78 put_page(io->page);
76 for (i = 0; i < io->num_io_pages; i++) 79 for (i = 0; i < io->num_io_pages; i++)
@@ -81,13 +84,8 @@ void ext4_free_io_end(ext4_io_end_t *io)
81 kmem_cache_free(io_end_cachep, io); 84 kmem_cache_free(io_end_cachep, io);
82} 85}
83 86
84/* 87/* check a range of space and convert unwritten extents to written. */
85 * check a range of space and convert unwritten extents to written. 88static int ext4_end_io(ext4_io_end_t *io)
86 *
87 * Called with inode->i_mutex; we depend on this when we manipulate
88 * io->flag, since we could otherwise race with ext4_flush_completed_IO()
89 */
90int ext4_end_io_nolock(ext4_io_end_t *io)
91{ 89{
92 struct inode *inode = io->inode; 90 struct inode *inode = io->inode;
93 loff_t offset = io->offset; 91 loff_t offset = io->offset;
@@ -106,63 +104,136 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
106 "(inode %lu, offset %llu, size %zd, error %d)", 104 "(inode %lu, offset %llu, size %zd, error %d)",
107 inode->i_ino, offset, size, ret); 105 inode->i_ino, offset, size, ret);
108 } 106 }
109
110 if (io->iocb) 107 if (io->iocb)
111 aio_complete(io->iocb, io->result, 0); 108 aio_complete(io->iocb, io->result, 0);
112 109
113 if (io->flag & EXT4_IO_END_DIRECT) 110 if (io->flag & EXT4_IO_END_DIRECT)
114 inode_dio_done(inode); 111 inode_dio_done(inode);
115 /* Wake up anyone waiting on unwritten extent conversion */ 112 /* Wake up anyone waiting on unwritten extent conversion */
116 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) 113 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
117 wake_up_all(ext4_ioend_wq(io->inode)); 114 wake_up_all(ext4_ioend_wq(io->inode));
118 return ret; 115 return ret;
119} 116}
120 117
121/* 118static void dump_completed_IO(struct inode *inode)
122 * work on completed aio dio IO, to convert unwritten extents to extents 119{
123 */ 120#ifdef EXT4FS_DEBUG
124static void ext4_end_io_work(struct work_struct *work) 121 struct list_head *cur, *before, *after;
122 ext4_io_end_t *io, *io0, *io1;
123 unsigned long flags;
124
125 if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
126 ext4_debug("inode %lu completed_io list is empty\n",
127 inode->i_ino);
128 return;
129 }
130
131 ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino);
132 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) {
133 cur = &io->list;
134 before = cur->prev;
135 io0 = container_of(before, ext4_io_end_t, list);
136 after = cur->next;
137 io1 = container_of(after, ext4_io_end_t, list);
138
139 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
140 io, inode->i_ino, io0, io1);
141 }
142#endif
143}
144
145/* Add the io_end to per-inode completed end_io list. */
146void ext4_add_complete_io(ext4_io_end_t *io_end)
125{ 147{
126 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); 148 struct ext4_inode_info *ei = EXT4_I(io_end->inode);
127 struct inode *inode = io->inode; 149 struct workqueue_struct *wq;
128 struct ext4_inode_info *ei = EXT4_I(inode); 150 unsigned long flags;
129 unsigned long flags; 151
152 BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
153 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
130 154
131 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 155 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
132 if (io->flag & EXT4_IO_END_IN_FSYNC) 156 if (list_empty(&ei->i_completed_io_list)) {
133 goto requeue; 157 io_end->flag |= EXT4_IO_END_QUEUED;
134 if (list_empty(&io->list)) { 158 queue_work(wq, &io_end->work);
135 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
136 goto free;
137 } 159 }
160 list_add_tail(&io_end->list, &ei->i_completed_io_list);
161 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
162}
138 163
139 if (!mutex_trylock(&inode->i_mutex)) { 164static int ext4_do_flush_completed_IO(struct inode *inode,
140 bool was_queued; 165 ext4_io_end_t *work_io)
141requeue: 166{
142 was_queued = !!(io->flag & EXT4_IO_END_QUEUED); 167 ext4_io_end_t *io;
143 io->flag |= EXT4_IO_END_QUEUED; 168 struct list_head unwritten, complete, to_free;
144 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 169 unsigned long flags;
145 /* 170 struct ext4_inode_info *ei = EXT4_I(inode);
146 * Requeue the work instead of waiting so that the work 171 int err, ret = 0;
147 * items queued after this can be processed. 172
148 */ 173 INIT_LIST_HEAD(&complete);
149 queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work); 174 INIT_LIST_HEAD(&to_free);
150 /* 175
151 * To prevent the ext4-dio-unwritten thread from keeping 176 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
152 * requeueing end_io requests and occupying cpu for too long, 177 dump_completed_IO(inode);
153 * yield the cpu if it sees an end_io request that has already 178 list_replace_init(&ei->i_completed_io_list, &unwritten);
154 * been requeued. 179 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
155 */ 180
156 if (was_queued) 181 while (!list_empty(&unwritten)) {
157 yield(); 182 io = list_entry(unwritten.next, ext4_io_end_t, list);
158 return; 183 BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
184 list_del_init(&io->list);
185
186 err = ext4_end_io(io);
187 if (unlikely(!ret && err))
188 ret = err;
189
190 list_add_tail(&io->list, &complete);
191 }
192 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
193 while (!list_empty(&complete)) {
194 io = list_entry(complete.next, ext4_io_end_t, list);
195 io->flag &= ~EXT4_IO_END_UNWRITTEN;
196 /* end_io context can not be destroyed now because it still
197 * used by queued worker. Worker thread will destroy it later */
198 if (io->flag & EXT4_IO_END_QUEUED)
199 list_del_init(&io->list);
200 else
201 list_move(&io->list, &to_free);
202 }
203 /* If we are called from worker context, it is time to clear queued
204 * flag, and destroy it's end_io if it was converted already */
205 if (work_io) {
206 work_io->flag &= ~EXT4_IO_END_QUEUED;
207 if (!(work_io->flag & EXT4_IO_END_UNWRITTEN))
208 list_add_tail(&work_io->list, &to_free);
159 } 209 }
160 list_del_init(&io->list);
161 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 210 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
162 (void) ext4_end_io_nolock(io); 211
163 mutex_unlock(&inode->i_mutex); 212 while (!list_empty(&to_free)) {
164free: 213 io = list_entry(to_free.next, ext4_io_end_t, list);
165 ext4_free_io_end(io); 214 list_del_init(&io->list);
215 ext4_free_io_end(io);
216 }
217 return ret;
218}
219
220/*
221 * work on completed aio dio IO, to convert unwritten extents to extents
222 */
223static void ext4_end_io_work(struct work_struct *work)
224{
225 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
226 ext4_do_flush_completed_IO(io->inode, io);
227}
228
229int ext4_flush_unwritten_io(struct inode *inode)
230{
231 int ret;
232 WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) &&
233 !(inode->i_state & I_FREEING));
234 ret = ext4_do_flush_completed_IO(inode, NULL);
235 ext4_unwritten_wait(inode);
236 return ret;
166} 237}
167 238
168ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) 239ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -195,9 +266,7 @@ static void buffer_io_error(struct buffer_head *bh)
195static void ext4_end_bio(struct bio *bio, int error) 266static void ext4_end_bio(struct bio *bio, int error)
196{ 267{
197 ext4_io_end_t *io_end = bio->bi_private; 268 ext4_io_end_t *io_end = bio->bi_private;
198 struct workqueue_struct *wq;
199 struct inode *inode; 269 struct inode *inode;
200 unsigned long flags;
201 int i; 270 int i;
202 sector_t bi_sector = bio->bi_sector; 271 sector_t bi_sector = bio->bi_sector;
203 272
@@ -255,14 +324,7 @@ static void ext4_end_bio(struct bio *bio, int error)
255 return; 324 return;
256 } 325 }
257 326
258 /* Add the io_end to per-inode completed io list*/ 327 ext4_add_complete_io(io_end);
259 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
260 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
261 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
262
263 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
264 /* queue the work to convert unwritten extents to written */
265 queue_work(wq, &io_end->work);
266} 328}
267 329
268void ext4_io_submit(struct ext4_io_submit *io) 330void ext4_io_submit(struct ext4_io_submit *io)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 41f6ef68e2e1..7a75e1086961 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -45,6 +45,28 @@ void ext4_resize_end(struct super_block *sb)
45 smp_mb__after_clear_bit(); 45 smp_mb__after_clear_bit();
46} 46}
47 47
48static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb,
49 ext4_group_t group) {
50 return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) <<
51 EXT4_DESC_PER_BLOCK_BITS(sb);
52}
53
54static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb,
55 ext4_group_t group) {
56 group = ext4_meta_bg_first_group(sb, group);
57 return ext4_group_first_block_no(sb, group);
58}
59
60static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb,
61 ext4_group_t group) {
62 ext4_grpblk_t overhead;
63 overhead = ext4_bg_num_gdb(sb, group);
64 if (ext4_bg_has_super(sb, group))
65 overhead += 1 +
66 le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
67 return overhead;
68}
69
48#define outside(b, first, last) ((b) < (first) || (b) >= (last)) 70#define outside(b, first, last) ((b) < (first) || (b) >= (last))
49#define inside(b, first, last) ((b) >= (first) && (b) < (last)) 71#define inside(b, first, last) ((b) >= (first) && (b) < (last))
50 72
@@ -57,9 +79,7 @@ static int verify_group_input(struct super_block *sb,
57 ext4_fsblk_t end = start + input->blocks_count; 79 ext4_fsblk_t end = start + input->blocks_count;
58 ext4_group_t group = input->group; 80 ext4_group_t group = input->group;
59 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; 81 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
60 unsigned overhead = ext4_bg_has_super(sb, group) ? 82 unsigned overhead = ext4_group_overhead_blocks(sb, group);
61 (1 + ext4_bg_num_gdb(sb, group) +
62 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
63 ext4_fsblk_t metaend = start + overhead; 83 ext4_fsblk_t metaend = start + overhead;
64 struct buffer_head *bh = NULL; 84 struct buffer_head *bh = NULL;
65 ext4_grpblk_t free_blocks_count, offset; 85 ext4_grpblk_t free_blocks_count, offset;
@@ -200,13 +220,15 @@ static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
200 * be a partial of a flex group. 220 * be a partial of a flex group.
201 * 221 *
202 * @sb: super block of fs to which the groups belongs 222 * @sb: super block of fs to which the groups belongs
223 *
224 * Returns 0 on a successful allocation of the metadata blocks in the
225 * block group.
203 */ 226 */
204static void ext4_alloc_group_tables(struct super_block *sb, 227static int ext4_alloc_group_tables(struct super_block *sb,
205 struct ext4_new_flex_group_data *flex_gd, 228 struct ext4_new_flex_group_data *flex_gd,
206 int flexbg_size) 229 int flexbg_size)
207{ 230{
208 struct ext4_new_group_data *group_data = flex_gd->groups; 231 struct ext4_new_group_data *group_data = flex_gd->groups;
209 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
210 ext4_fsblk_t start_blk; 232 ext4_fsblk_t start_blk;
211 ext4_fsblk_t last_blk; 233 ext4_fsblk_t last_blk;
212 ext4_group_t src_group; 234 ext4_group_t src_group;
@@ -226,23 +248,24 @@ static void ext4_alloc_group_tables(struct super_block *sb,
226 (last_group & ~(flexbg_size - 1)))); 248 (last_group & ~(flexbg_size - 1))));
227next_group: 249next_group:
228 group = group_data[0].group; 250 group = group_data[0].group;
251 if (src_group >= group_data[0].group + flex_gd->count)
252 return -ENOSPC;
229 start_blk = ext4_group_first_block_no(sb, src_group); 253 start_blk = ext4_group_first_block_no(sb, src_group);
230 last_blk = start_blk + group_data[src_group - group].blocks_count; 254 last_blk = start_blk + group_data[src_group - group].blocks_count;
231 255
232 overhead = ext4_bg_has_super(sb, src_group) ? 256 overhead = ext4_group_overhead_blocks(sb, src_group);
233 (1 + ext4_bg_num_gdb(sb, src_group) +
234 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
235 257
236 start_blk += overhead; 258 start_blk += overhead;
237 259
238 BUG_ON(src_group >= group_data[0].group + flex_gd->count);
239 /* We collect contiguous blocks as much as possible. */ 260 /* We collect contiguous blocks as much as possible. */
240 src_group++; 261 src_group++;
241 for (; src_group <= last_group; src_group++) 262 for (; src_group <= last_group; src_group++) {
242 if (!ext4_bg_has_super(sb, src_group)) 263 overhead = ext4_group_overhead_blocks(sb, src_group);
264 if (overhead != 0)
243 last_blk += group_data[src_group - group].blocks_count; 265 last_blk += group_data[src_group - group].blocks_count;
244 else 266 else
245 break; 267 break;
268 }
246 269
247 /* Allocate block bitmaps */ 270 /* Allocate block bitmaps */
248 for (; bb_index < flex_gd->count; bb_index++) { 271 for (; bb_index < flex_gd->count; bb_index++) {
@@ -300,6 +323,7 @@ next_group:
300 group_data[i].free_blocks_count); 323 group_data[i].free_blocks_count);
301 } 324 }
302 } 325 }
326 return 0;
303} 327}
304 328
305static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, 329static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
@@ -433,11 +457,13 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
433 ext4_group_t group, count; 457 ext4_group_t group, count;
434 struct buffer_head *bh = NULL; 458 struct buffer_head *bh = NULL;
435 int reserved_gdb, i, j, err = 0, err2; 459 int reserved_gdb, i, j, err = 0, err2;
460 int meta_bg;
436 461
437 BUG_ON(!flex_gd->count || !group_data || 462 BUG_ON(!flex_gd->count || !group_data ||
438 group_data[0].group != sbi->s_groups_count); 463 group_data[0].group != sbi->s_groups_count);
439 464
440 reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); 465 reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
466 meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
441 467
442 /* This transaction may be extended/restarted along the way */ 468 /* This transaction may be extended/restarted along the way */
443 handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); 469 handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
@@ -447,12 +473,25 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
447 group = group_data[0].group; 473 group = group_data[0].group;
448 for (i = 0; i < flex_gd->count; i++, group++) { 474 for (i = 0; i < flex_gd->count; i++, group++) {
449 unsigned long gdblocks; 475 unsigned long gdblocks;
476 ext4_grpblk_t overhead;
450 477
451 gdblocks = ext4_bg_num_gdb(sb, group); 478 gdblocks = ext4_bg_num_gdb(sb, group);
452 start = ext4_group_first_block_no(sb, group); 479 start = ext4_group_first_block_no(sb, group);
453 480
481 if (meta_bg == 0 && !ext4_bg_has_super(sb, group))
482 goto handle_itb;
483
484 if (meta_bg == 1) {
485 ext4_group_t first_group;
486 first_group = ext4_meta_bg_first_group(sb, group);
487 if (first_group != group + 1 &&
488 first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1)
489 goto handle_itb;
490 }
491
492 block = start + ext4_bg_has_super(sb, group);
454 /* Copy all of the GDT blocks into the backup in this group */ 493 /* Copy all of the GDT blocks into the backup in this group */
455 for (j = 0, block = start + 1; j < gdblocks; j++, block++) { 494 for (j = 0; j < gdblocks; j++, block++) {
456 struct buffer_head *gdb; 495 struct buffer_head *gdb;
457 496
458 ext4_debug("update backup group %#04llx\n", block); 497 ext4_debug("update backup group %#04llx\n", block);
@@ -493,6 +532,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
493 goto out; 532 goto out;
494 } 533 }
495 534
535handle_itb:
496 /* Initialize group tables of the grop @group */ 536 /* Initialize group tables of the grop @group */
497 if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) 537 if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED))
498 goto handle_bb; 538 goto handle_bb;
@@ -521,11 +561,11 @@ handle_bb:
521 err = PTR_ERR(bh); 561 err = PTR_ERR(bh);
522 goto out; 562 goto out;
523 } 563 }
524 if (ext4_bg_has_super(sb, group)) { 564 overhead = ext4_group_overhead_blocks(sb, group);
565 if (overhead != 0) {
525 ext4_debug("mark backup superblock %#04llx (+0)\n", 566 ext4_debug("mark backup superblock %#04llx (+0)\n",
526 start); 567 start);
527 ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 568 ext4_set_bits(bh->b_data, 0, overhead);
528 1);
529 } 569 }
530 ext4_mark_bitmap_end(group_data[i].blocks_count, 570 ext4_mark_bitmap_end(group_data[i].blocks_count,
531 sb->s_blocksize * 8, bh->b_data); 571 sb->s_blocksize * 8, bh->b_data);
@@ -822,6 +862,45 @@ exit_bh:
822} 862}
823 863
824/* 864/*
865 * add_new_gdb_meta_bg is the sister of add_new_gdb.
866 */
867static int add_new_gdb_meta_bg(struct super_block *sb,
868 handle_t *handle, ext4_group_t group) {
869 ext4_fsblk_t gdblock;
870 struct buffer_head *gdb_bh;
871 struct buffer_head **o_group_desc, **n_group_desc;
872 unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
873 int err;
874
875 gdblock = ext4_meta_bg_first_block_no(sb, group) +
876 ext4_bg_has_super(sb, group);
877 gdb_bh = sb_bread(sb, gdblock);
878 if (!gdb_bh)
879 return -EIO;
880 n_group_desc = ext4_kvmalloc((gdb_num + 1) *
881 sizeof(struct buffer_head *),
882 GFP_NOFS);
883 if (!n_group_desc) {
884 err = -ENOMEM;
885 ext4_warning(sb, "not enough memory for %lu groups",
886 gdb_num + 1);
887 return err;
888 }
889
890 o_group_desc = EXT4_SB(sb)->s_group_desc;
891 memcpy(n_group_desc, o_group_desc,
892 EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
893 n_group_desc[gdb_num] = gdb_bh;
894 EXT4_SB(sb)->s_group_desc = n_group_desc;
895 EXT4_SB(sb)->s_gdb_count++;
896 ext4_kvfree(o_group_desc);
897 err = ext4_journal_get_write_access(handle, gdb_bh);
898 if (unlikely(err))
899 brelse(gdb_bh);
900 return err;
901}
902
903/*
825 * Called when we are adding a new group which has a backup copy of each of 904 * Called when we are adding a new group which has a backup copy of each of
826 * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. 905 * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
827 * We need to add these reserved backup GDT blocks to the resize inode, so 906 * We need to add these reserved backup GDT blocks to the resize inode, so
@@ -949,16 +1028,16 @@ exit_free:
949 * do not copy the full number of backups at this time. The resize 1028 * do not copy the full number of backups at this time. The resize
950 * which changed s_groups_count will backup again. 1029 * which changed s_groups_count will backup again.
951 */ 1030 */
952static void update_backups(struct super_block *sb, 1031static void update_backups(struct super_block *sb, int blk_off, char *data,
953 int blk_off, char *data, int size) 1032 int size, int meta_bg)
954{ 1033{
955 struct ext4_sb_info *sbi = EXT4_SB(sb); 1034 struct ext4_sb_info *sbi = EXT4_SB(sb);
956 const ext4_group_t last = sbi->s_groups_count; 1035 ext4_group_t last;
957 const int bpg = EXT4_BLOCKS_PER_GROUP(sb); 1036 const int bpg = EXT4_BLOCKS_PER_GROUP(sb);
958 unsigned three = 1; 1037 unsigned three = 1;
959 unsigned five = 5; 1038 unsigned five = 5;
960 unsigned seven = 7; 1039 unsigned seven = 7;
961 ext4_group_t group; 1040 ext4_group_t group = 0;
962 int rest = sb->s_blocksize - size; 1041 int rest = sb->s_blocksize - size;
963 handle_t *handle; 1042 handle_t *handle;
964 int err = 0, err2; 1043 int err = 0, err2;
@@ -970,10 +1049,17 @@ static void update_backups(struct super_block *sb,
970 goto exit_err; 1049 goto exit_err;
971 } 1050 }
972 1051
973 ext4_superblock_csum_set(sb, (struct ext4_super_block *)data); 1052 if (meta_bg == 0) {
1053 group = ext4_list_backups(sb, &three, &five, &seven);
1054 last = sbi->s_groups_count;
1055 } else {
1056 group = ext4_meta_bg_first_group(sb, group) + 1;
1057 last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2);
1058 }
974 1059
975 while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) { 1060 while (group < sbi->s_groups_count) {
976 struct buffer_head *bh; 1061 struct buffer_head *bh;
1062 ext4_fsblk_t backup_block;
977 1063
978 /* Out of journal space, and can't get more - abort - so sad */ 1064 /* Out of journal space, and can't get more - abort - so sad */
979 if (ext4_handle_valid(handle) && 1065 if (ext4_handle_valid(handle) &&
@@ -982,13 +1068,20 @@ static void update_backups(struct super_block *sb,
982 (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) 1068 (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
983 break; 1069 break;
984 1070
985 bh = sb_getblk(sb, group * bpg + blk_off); 1071 if (meta_bg == 0)
1072 backup_block = group * bpg + blk_off;
1073 else
1074 backup_block = (ext4_group_first_block_no(sb, group) +
1075 ext4_bg_has_super(sb, group));
1076
1077 bh = sb_getblk(sb, backup_block);
986 if (!bh) { 1078 if (!bh) {
987 err = -EIO; 1079 err = -EIO;
988 break; 1080 break;
989 } 1081 }
990 ext4_debug("update metadata backup %#04lx\n", 1082 ext4_debug("update metadata backup %llu(+%llu)\n",
991 (unsigned long)bh->b_blocknr); 1083 backup_block, backup_block -
1084 ext4_group_first_block_no(sb, group));
992 if ((err = ext4_journal_get_write_access(handle, bh))) 1085 if ((err = ext4_journal_get_write_access(handle, bh)))
993 break; 1086 break;
994 lock_buffer(bh); 1087 lock_buffer(bh);
@@ -1001,6 +1094,13 @@ static void update_backups(struct super_block *sb,
1001 if (unlikely(err)) 1094 if (unlikely(err))
1002 ext4_std_error(sb, err); 1095 ext4_std_error(sb, err);
1003 brelse(bh); 1096 brelse(bh);
1097
1098 if (meta_bg == 0)
1099 group = ext4_list_backups(sb, &three, &five, &seven);
1100 else if (group == last)
1101 break;
1102 else
1103 group = last;
1004 } 1104 }
1005 if ((err2 = ext4_journal_stop(handle)) && !err) 1105 if ((err2 = ext4_journal_stop(handle)) && !err)
1006 err = err2; 1106 err = err2;
@@ -1043,7 +1143,9 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
1043 struct ext4_super_block *es = sbi->s_es; 1143 struct ext4_super_block *es = sbi->s_es;
1044 struct buffer_head *gdb_bh; 1144 struct buffer_head *gdb_bh;
1045 int i, gdb_off, gdb_num, err = 0; 1145 int i, gdb_off, gdb_num, err = 0;
1146 int meta_bg;
1046 1147
1148 meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
1047 for (i = 0; i < count; i++, group++) { 1149 for (i = 0; i < count; i++, group++) {
1048 int reserved_gdb = ext4_bg_has_super(sb, group) ? 1150 int reserved_gdb = ext4_bg_has_super(sb, group) ?
1049 le16_to_cpu(es->s_reserved_gdt_blocks) : 0; 1151 le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
@@ -1063,8 +1165,11 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
1063 1165
1064 if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) 1166 if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group))
1065 err = reserve_backup_gdb(handle, resize_inode, group); 1167 err = reserve_backup_gdb(handle, resize_inode, group);
1066 } else 1168 } else if (meta_bg != 0) {
1169 err = add_new_gdb_meta_bg(sb, handle, group);
1170 } else {
1067 err = add_new_gdb(handle, resize_inode, group); 1171 err = add_new_gdb(handle, resize_inode, group);
1172 }
1068 if (err) 1173 if (err)
1069 break; 1174 break;
1070 } 1175 }
@@ -1076,17 +1181,12 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
1076 struct buffer_head *bh = sb_getblk(sb, block); 1181 struct buffer_head *bh = sb_getblk(sb, block);
1077 if (!bh) 1182 if (!bh)
1078 return NULL; 1183 return NULL;
1079 1184 if (!bh_uptodate_or_lock(bh)) {
1080 if (bitmap_uptodate(bh)) 1185 if (bh_submit_read(bh) < 0) {
1081 return bh; 1186 brelse(bh);
1082 1187 return NULL;
1083 lock_buffer(bh); 1188 }
1084 if (bh_submit_read(bh) < 0) {
1085 unlock_buffer(bh);
1086 brelse(bh);
1087 return NULL;
1088 } 1189 }
1089 unlock_buffer(bh);
1090 1190
1091 return bh; 1191 return bh;
1092} 1192}
@@ -1161,6 +1261,9 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
1161 ext4_free_group_clusters_set(sb, gdp, 1261 ext4_free_group_clusters_set(sb, gdp,
1162 EXT4_B2C(sbi, group_data->free_blocks_count)); 1262 EXT4_B2C(sbi, group_data->free_blocks_count));
1163 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); 1263 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
1264 if (ext4_has_group_desc_csum(sb))
1265 ext4_itable_unused_set(sb, gdp,
1266 EXT4_INODES_PER_GROUP(sb));
1164 gdp->bg_flags = cpu_to_le16(*bg_flags); 1267 gdp->bg_flags = cpu_to_le16(*bg_flags);
1165 ext4_group_desc_csum_set(sb, group, gdp); 1268 ext4_group_desc_csum_set(sb, group, gdp);
1166 1269
@@ -1216,7 +1319,7 @@ static void ext4_update_super(struct super_block *sb,
1216 } 1319 }
1217 1320
1218 reserved_blocks = ext4_r_blocks_count(es) * 100; 1321 reserved_blocks = ext4_r_blocks_count(es) * 100;
1219 do_div(reserved_blocks, ext4_blocks_count(es)); 1322 reserved_blocks = div64_u64(reserved_blocks, ext4_blocks_count(es));
1220 reserved_blocks *= blocks_count; 1323 reserved_blocks *= blocks_count;
1221 do_div(reserved_blocks, 100); 1324 do_div(reserved_blocks, 100);
1222 1325
@@ -1227,6 +1330,7 @@ static void ext4_update_super(struct super_block *sb,
1227 le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) * 1330 le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) *
1228 flex_gd->count); 1331 flex_gd->count);
1229 1332
1333 ext4_debug("free blocks count %llu", ext4_free_blocks_count(es));
1230 /* 1334 /*
1231 * We need to protect s_groups_count against other CPUs seeing 1335 * We need to protect s_groups_count against other CPUs seeing
1232 * inconsistent state in the superblock. 1336 * inconsistent state in the superblock.
@@ -1261,6 +1365,8 @@ static void ext4_update_super(struct super_block *sb,
1261 percpu_counter_add(&sbi->s_freeinodes_counter, 1365 percpu_counter_add(&sbi->s_freeinodes_counter,
1262 EXT4_INODES_PER_GROUP(sb) * flex_gd->count); 1366 EXT4_INODES_PER_GROUP(sb) * flex_gd->count);
1263 1367
1368 ext4_debug("free blocks count %llu",
1369 percpu_counter_read(&sbi->s_freeclusters_counter));
1264 if (EXT4_HAS_INCOMPAT_FEATURE(sb, 1370 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
1265 EXT4_FEATURE_INCOMPAT_FLEX_BG) && 1371 EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
1266 sbi->s_log_groups_per_flex) { 1372 sbi->s_log_groups_per_flex) {
@@ -1349,16 +1455,24 @@ exit_journal:
1349 err = err2; 1455 err = err2;
1350 1456
1351 if (!err) { 1457 if (!err) {
1352 int i; 1458 int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
1459 int gdb_num_end = ((group + flex_gd->count - 1) /
1460 EXT4_DESC_PER_BLOCK(sb));
1461 int meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb,
1462 EXT4_FEATURE_INCOMPAT_META_BG);
1463 sector_t old_gdb = 0;
1464
1353 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, 1465 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
1354 sizeof(struct ext4_super_block)); 1466 sizeof(struct ext4_super_block), 0);
1355 for (i = 0; i < flex_gd->count; i++, group++) { 1467 for (; gdb_num <= gdb_num_end; gdb_num++) {
1356 struct buffer_head *gdb_bh; 1468 struct buffer_head *gdb_bh;
1357 int gdb_num; 1469
1358 gdb_num = group / EXT4_BLOCKS_PER_GROUP(sb);
1359 gdb_bh = sbi->s_group_desc[gdb_num]; 1470 gdb_bh = sbi->s_group_desc[gdb_num];
1471 if (old_gdb == gdb_bh->b_blocknr)
1472 continue;
1360 update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data, 1473 update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
1361 gdb_bh->b_size); 1474 gdb_bh->b_size, meta_bg);
1475 old_gdb = gdb_bh->b_blocknr;
1362 } 1476 }
1363 } 1477 }
1364exit: 1478exit:
@@ -1402,9 +1516,7 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
1402 1516
1403 group_data[i].group = group + i; 1517 group_data[i].group = group + i;
1404 group_data[i].blocks_count = blocks_per_group; 1518 group_data[i].blocks_count = blocks_per_group;
1405 overhead = ext4_bg_has_super(sb, group + i) ? 1519 overhead = ext4_group_overhead_blocks(sb, group + i);
1406 (1 + ext4_bg_num_gdb(sb, group + i) +
1407 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
1408 group_data[i].free_blocks_count = blocks_per_group - overhead; 1520 group_data[i].free_blocks_count = blocks_per_group - overhead;
1409 if (ext4_has_group_desc_csum(sb)) 1521 if (ext4_has_group_desc_csum(sb))
1410 flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | 1522 flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
@@ -1492,6 +1604,14 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
1492 if (err) 1604 if (err)
1493 goto out; 1605 goto out;
1494 1606
1607 err = ext4_alloc_flex_bg_array(sb, input->group + 1);
1608 if (err)
1609 return err;
1610
1611 err = ext4_mb_alloc_groupinfo(sb, input->group + 1);
1612 if (err)
1613 goto out;
1614
1495 flex_gd.count = 1; 1615 flex_gd.count = 1;
1496 flex_gd.groups = input; 1616 flex_gd.groups = input;
1497 flex_gd.bg_flags = &bg_flags; 1617 flex_gd.bg_flags = &bg_flags;
@@ -1544,11 +1664,13 @@ errout:
1544 err = err2; 1664 err = err2;
1545 1665
1546 if (!err) { 1666 if (!err) {
1667 ext4_fsblk_t first_block;
1668 first_block = ext4_group_first_block_no(sb, 0);
1547 if (test_opt(sb, DEBUG)) 1669 if (test_opt(sb, DEBUG))
1548 printk(KERN_DEBUG "EXT4-fs: extended group to %llu " 1670 printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
1549 "blocks\n", ext4_blocks_count(es)); 1671 "blocks\n", ext4_blocks_count(es));
1550 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, 1672 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block,
1551 sizeof(struct ext4_super_block)); 1673 (char *)es, sizeof(struct ext4_super_block), 0);
1552 } 1674 }
1553 return err; 1675 return err;
1554} 1676}
@@ -1631,6 +1753,94 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1631 return err; 1753 return err;
1632} /* ext4_group_extend */ 1754} /* ext4_group_extend */
1633 1755
1756
1757static int num_desc_blocks(struct super_block *sb, ext4_group_t groups)
1758{
1759 return (groups + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb);
1760}
1761
1762/*
1763 * Release the resize inode and drop the resize_inode feature if there
1764 * are no more reserved gdt blocks, and then convert the file system
1765 * to enable meta_bg
1766 */
1767static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
1768{
1769 handle_t *handle;
1770 struct ext4_sb_info *sbi = EXT4_SB(sb);
1771 struct ext4_super_block *es = sbi->s_es;
1772 struct ext4_inode_info *ei = EXT4_I(inode);
1773 ext4_fsblk_t nr;
1774 int i, ret, err = 0;
1775 int credits = 1;
1776
1777 ext4_msg(sb, KERN_INFO, "Converting file system to meta_bg");
1778 if (inode) {
1779 if (es->s_reserved_gdt_blocks) {
1780 ext4_error(sb, "Unexpected non-zero "
1781 "s_reserved_gdt_blocks");
1782 return -EPERM;
1783 }
1784
1785 /* Do a quick sanity check of the resize inode */
1786 if (inode->i_blocks != 1 << (inode->i_blkbits - 9))
1787 goto invalid_resize_inode;
1788 for (i = 0; i < EXT4_N_BLOCKS; i++) {
1789 if (i == EXT4_DIND_BLOCK) {
1790 if (ei->i_data[i])
1791 continue;
1792 else
1793 goto invalid_resize_inode;
1794 }
1795 if (ei->i_data[i])
1796 goto invalid_resize_inode;
1797 }
1798 credits += 3; /* block bitmap, bg descriptor, resize inode */
1799 }
1800
1801 handle = ext4_journal_start_sb(sb, credits);
1802 if (IS_ERR(handle))
1803 return PTR_ERR(handle);
1804
1805 err = ext4_journal_get_write_access(handle, sbi->s_sbh);
1806 if (err)
1807 goto errout;
1808
1809 EXT4_CLEAR_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE);
1810 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
1811 sbi->s_es->s_first_meta_bg =
1812 cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count));
1813
1814 err = ext4_handle_dirty_super(handle, sb);
1815 if (err) {
1816 ext4_std_error(sb, err);
1817 goto errout;
1818 }
1819
1820 if (inode) {
1821 nr = le32_to_cpu(ei->i_data[EXT4_DIND_BLOCK]);
1822 ext4_free_blocks(handle, inode, NULL, nr, 1,
1823 EXT4_FREE_BLOCKS_METADATA |
1824 EXT4_FREE_BLOCKS_FORGET);
1825 ei->i_data[EXT4_DIND_BLOCK] = 0;
1826 inode->i_blocks = 0;
1827
1828 err = ext4_mark_inode_dirty(handle, inode);
1829 if (err)
1830 ext4_std_error(sb, err);
1831 }
1832
1833errout:
1834 ret = ext4_journal_stop(handle);
1835 if (!err)
1836 err = ret;
1837 return ret;
1838
1839invalid_resize_inode:
1840 ext4_error(sb, "corrupted/inconsistent resize inode");
1841 return -EINVAL;
1842}
1843
1634/* 1844/*
1635 * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count 1845 * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count
1636 * 1846 *
@@ -1643,21 +1853,31 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1643 struct ext4_sb_info *sbi = EXT4_SB(sb); 1853 struct ext4_sb_info *sbi = EXT4_SB(sb);
1644 struct ext4_super_block *es = sbi->s_es; 1854 struct ext4_super_block *es = sbi->s_es;
1645 struct buffer_head *bh; 1855 struct buffer_head *bh;
1646 struct inode *resize_inode; 1856 struct inode *resize_inode = NULL;
1647 ext4_fsblk_t o_blocks_count; 1857 ext4_grpblk_t add, offset;
1648 ext4_group_t o_group;
1649 ext4_group_t n_group;
1650 ext4_grpblk_t offset, add;
1651 unsigned long n_desc_blocks; 1858 unsigned long n_desc_blocks;
1652 unsigned long o_desc_blocks; 1859 unsigned long o_desc_blocks;
1653 unsigned long desc_blocks; 1860 ext4_group_t o_group;
1654 int err = 0, flexbg_size = 1; 1861 ext4_group_t n_group;
1862 ext4_fsblk_t o_blocks_count;
1863 ext4_fsblk_t n_blocks_count_retry = 0;
1864 unsigned long last_update_time = 0;
1865 int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex;
1866 int meta_bg;
1655 1867
1868 /* See if the device is actually as big as what was requested */
1869 bh = sb_bread(sb, n_blocks_count - 1);
1870 if (!bh) {
1871 ext4_warning(sb, "can't read last block, resize aborted");
1872 return -ENOSPC;
1873 }
1874 brelse(bh);
1875
1876retry:
1656 o_blocks_count = ext4_blocks_count(es); 1877 o_blocks_count = ext4_blocks_count(es);
1657 1878
1658 if (test_opt(sb, DEBUG)) 1879 ext4_msg(sb, KERN_INFO, "resizing filesystem from %llu "
1659 ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu " 1880 "to %llu blocks", o_blocks_count, n_blocks_count);
1660 "to %llu blocks", o_blocks_count, n_blocks_count);
1661 1881
1662 if (n_blocks_count < o_blocks_count) { 1882 if (n_blocks_count < o_blocks_count) {
1663 /* On-line shrinking not supported */ 1883 /* On-line shrinking not supported */
@@ -1672,32 +1892,49 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1672 ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); 1892 ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
1673 ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); 1893 ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
1674 1894
1675 n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / 1895 n_desc_blocks = num_desc_blocks(sb, n_group + 1);
1676 EXT4_DESC_PER_BLOCK(sb); 1896 o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count);
1677 o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
1678 EXT4_DESC_PER_BLOCK(sb);
1679 desc_blocks = n_desc_blocks - o_desc_blocks;
1680 1897
1681 if (desc_blocks && 1898 meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
1682 (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) ||
1683 le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) {
1684 ext4_warning(sb, "No reserved GDT blocks, can't resize");
1685 return -EPERM;
1686 }
1687 1899
1688 resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); 1900 if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) {
1689 if (IS_ERR(resize_inode)) { 1901 if (meta_bg) {
1690 ext4_warning(sb, "Error opening resize inode"); 1902 ext4_error(sb, "resize_inode and meta_bg enabled "
1691 return PTR_ERR(resize_inode); 1903 "simultaneously");
1904 return -EINVAL;
1905 }
1906 if (n_desc_blocks > o_desc_blocks +
1907 le16_to_cpu(es->s_reserved_gdt_blocks)) {
1908 n_blocks_count_retry = n_blocks_count;
1909 n_desc_blocks = o_desc_blocks +
1910 le16_to_cpu(es->s_reserved_gdt_blocks);
1911 n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb);
1912 n_blocks_count = n_group * EXT4_BLOCKS_PER_GROUP(sb);
1913 n_group--; /* set to last group number */
1914 }
1915
1916 if (!resize_inode)
1917 resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
1918 if (IS_ERR(resize_inode)) {
1919 ext4_warning(sb, "Error opening resize inode");
1920 return PTR_ERR(resize_inode);
1921 }
1692 } 1922 }
1693 1923
1694 /* See if the device is actually as big as what was requested */ 1924 if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) {
1695 bh = sb_bread(sb, n_blocks_count - 1); 1925 err = ext4_convert_meta_bg(sb, resize_inode);
1696 if (!bh) { 1926 if (err)
1697 ext4_warning(sb, "can't read last block, resize aborted"); 1927 goto out;
1698 return -ENOSPC; 1928 if (resize_inode) {
1929 iput(resize_inode);
1930 resize_inode = NULL;
1931 }
1932 if (n_blocks_count_retry) {
1933 n_blocks_count = n_blocks_count_retry;
1934 n_blocks_count_retry = 0;
1935 goto retry;
1936 }
1699 } 1937 }
1700 brelse(bh);
1701 1938
1702 /* extend the last group */ 1939 /* extend the last group */
1703 if (n_group == o_group) 1940 if (n_group == o_group)
@@ -1710,12 +1947,15 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1710 goto out; 1947 goto out;
1711 } 1948 }
1712 1949
1713 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && 1950 if (ext4_blocks_count(es) == n_blocks_count)
1714 es->s_log_groups_per_flex) 1951 goto out;
1715 flexbg_size = 1 << es->s_log_groups_per_flex;
1716 1952
1717 o_blocks_count = ext4_blocks_count(es); 1953 err = ext4_alloc_flex_bg_array(sb, n_group + 1);
1718 if (o_blocks_count == n_blocks_count) 1954 if (err)
1955 return err;
1956
1957 err = ext4_mb_alloc_groupinfo(sb, n_group + 1);
1958 if (err)
1719 goto out; 1959 goto out;
1720 1960
1721 flex_gd = alloc_flex_gd(flexbg_size); 1961 flex_gd = alloc_flex_gd(flexbg_size);
@@ -1729,19 +1969,33 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1729 */ 1969 */
1730 while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count, 1970 while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
1731 flexbg_size)) { 1971 flexbg_size)) {
1732 ext4_alloc_group_tables(sb, flex_gd, flexbg_size); 1972 if (jiffies - last_update_time > HZ * 10) {
1973 if (last_update_time)
1974 ext4_msg(sb, KERN_INFO,
1975 "resized to %llu blocks",
1976 ext4_blocks_count(es));
1977 last_update_time = jiffies;
1978 }
1979 if (ext4_alloc_group_tables(sb, flex_gd, flexbg_size) != 0)
1980 break;
1733 err = ext4_flex_group_add(sb, resize_inode, flex_gd); 1981 err = ext4_flex_group_add(sb, resize_inode, flex_gd);
1734 if (unlikely(err)) 1982 if (unlikely(err))
1735 break; 1983 break;
1736 } 1984 }
1737 1985
1986 if (!err && n_blocks_count_retry) {
1987 n_blocks_count = n_blocks_count_retry;
1988 n_blocks_count_retry = 0;
1989 free_flex_gd(flex_gd);
1990 flex_gd = NULL;
1991 goto retry;
1992 }
1993
1738out: 1994out:
1739 if (flex_gd) 1995 if (flex_gd)
1740 free_flex_gd(flex_gd); 1996 free_flex_gd(flex_gd);
1741 1997 if (resize_inode != NULL)
1742 iput(resize_inode); 1998 iput(resize_inode);
1743 if (test_opt(sb, DEBUG)) 1999 ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count);
1744 ext4_msg(sb, KERN_DEBUG, "resized filesystem from %llu "
1745 "upto %llu blocks", o_blocks_count, n_blocks_count);
1746 return err; 2000 return err;
1747} 2001}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c6e0cb3d1f4a..7265a0367476 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -420,7 +420,7 @@ static void __save_error_info(struct super_block *sb, const char *func,
420 */ 420 */
421 if (!es->s_error_count) 421 if (!es->s_error_count)
422 mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); 422 mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
423 es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1); 423 le32_add_cpu(&es->s_error_count, 1);
424} 424}
425 425
426static void save_error_info(struct super_block *sb, const char *func, 426static void save_error_info(struct super_block *sb, const char *func,
@@ -850,7 +850,6 @@ static void ext4_put_super(struct super_block *sb)
850 flush_workqueue(sbi->dio_unwritten_wq); 850 flush_workqueue(sbi->dio_unwritten_wq);
851 destroy_workqueue(sbi->dio_unwritten_wq); 851 destroy_workqueue(sbi->dio_unwritten_wq);
852 852
853 lock_super(sb);
854 if (sbi->s_journal) { 853 if (sbi->s_journal) {
855 err = jbd2_journal_destroy(sbi->s_journal); 854 err = jbd2_journal_destroy(sbi->s_journal);
856 sbi->s_journal = NULL; 855 sbi->s_journal = NULL;
@@ -917,7 +916,6 @@ static void ext4_put_super(struct super_block *sb)
917 * Now that we are completely done shutting down the 916 * Now that we are completely done shutting down the
918 * superblock, we need to actually destroy the kobject. 917 * superblock, we need to actually destroy the kobject.
919 */ 918 */
920 unlock_super(sb);
921 kobject_put(&sbi->s_kobj); 919 kobject_put(&sbi->s_kobj);
922 wait_for_completion(&sbi->s_kobj_unregister); 920 wait_for_completion(&sbi->s_kobj_unregister);
923 if (sbi->s_chksum_driver) 921 if (sbi->s_chksum_driver)
@@ -956,11 +954,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
956 ei->jinode = NULL; 954 ei->jinode = NULL;
957 INIT_LIST_HEAD(&ei->i_completed_io_list); 955 INIT_LIST_HEAD(&ei->i_completed_io_list);
958 spin_lock_init(&ei->i_completed_io_lock); 956 spin_lock_init(&ei->i_completed_io_lock);
959 ei->cur_aio_dio = NULL;
960 ei->i_sync_tid = 0; 957 ei->i_sync_tid = 0;
961 ei->i_datasync_tid = 0; 958 ei->i_datasync_tid = 0;
962 atomic_set(&ei->i_ioend_count, 0); 959 atomic_set(&ei->i_ioend_count, 0);
963 atomic_set(&ei->i_aiodio_unwritten, 0); 960 atomic_set(&ei->i_unwritten, 0);
964 961
965 return &ei->vfs_inode; 962 return &ei->vfs_inode;
966} 963}
@@ -1019,6 +1016,11 @@ static int init_inodecache(void)
1019 1016
1020static void destroy_inodecache(void) 1017static void destroy_inodecache(void)
1021{ 1018{
1019 /*
1020 * Make sure all delayed rcu free inodes are flushed before we
1021 * destroy cache.
1022 */
1023 rcu_barrier();
1022 kmem_cache_destroy(ext4_inode_cachep); 1024 kmem_cache_destroy(ext4_inode_cachep);
1023} 1025}
1024 1026
@@ -1219,6 +1221,7 @@ enum {
1219 Opt_inode_readahead_blks, Opt_journal_ioprio, 1221 Opt_inode_readahead_blks, Opt_journal_ioprio,
1220 Opt_dioread_nolock, Opt_dioread_lock, 1222 Opt_dioread_nolock, Opt_dioread_lock,
1221 Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, 1223 Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
1224 Opt_max_dir_size_kb,
1222}; 1225};
1223 1226
1224static const match_table_t tokens = { 1227static const match_table_t tokens = {
@@ -1292,6 +1295,7 @@ static const match_table_t tokens = {
1292 {Opt_init_itable, "init_itable=%u"}, 1295 {Opt_init_itable, "init_itable=%u"},
1293 {Opt_init_itable, "init_itable"}, 1296 {Opt_init_itable, "init_itable"},
1294 {Opt_noinit_itable, "noinit_itable"}, 1297 {Opt_noinit_itable, "noinit_itable"},
1298 {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
1295 {Opt_removed, "check=none"}, /* mount option from ext2/3 */ 1299 {Opt_removed, "check=none"}, /* mount option from ext2/3 */
1296 {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ 1300 {Opt_removed, "nocheck"}, /* mount option from ext2/3 */
1297 {Opt_removed, "reservation"}, /* mount option from ext2/3 */ 1301 {Opt_removed, "reservation"}, /* mount option from ext2/3 */
@@ -1472,6 +1476,7 @@ static const struct mount_opts {
1472 {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, 1476 {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
1473 {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, 1477 {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
1474 {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, 1478 {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
1479 {Opt_max_dir_size_kb, 0, MOPT_GTE0},
1475 {Opt_err, 0, 0} 1480 {Opt_err, 0, 0}
1476}; 1481};
1477 1482
@@ -1587,6 +1592,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1587 if (!args->from) 1592 if (!args->from)
1588 arg = EXT4_DEF_LI_WAIT_MULT; 1593 arg = EXT4_DEF_LI_WAIT_MULT;
1589 sbi->s_li_wait_mult = arg; 1594 sbi->s_li_wait_mult = arg;
1595 } else if (token == Opt_max_dir_size_kb) {
1596 sbi->s_max_dir_size_kb = arg;
1590 } else if (token == Opt_stripe) { 1597 } else if (token == Opt_stripe) {
1591 sbi->s_stripe = arg; 1598 sbi->s_stripe = arg;
1592 } else if (m->flags & MOPT_DATAJ) { 1599 } else if (m->flags & MOPT_DATAJ) {
@@ -1659,7 +1666,7 @@ static int parse_options(char *options, struct super_block *sb,
1659 * Initialize args struct so we know whether arg was 1666 * Initialize args struct so we know whether arg was
1660 * found; some options take optional arguments. 1667 * found; some options take optional arguments.
1661 */ 1668 */
1662 args[0].to = args[0].from = 0; 1669 args[0].to = args[0].from = NULL;
1663 token = match_token(p, tokens, args); 1670 token = match_token(p, tokens, args);
1664 if (handle_mount_opt(sb, p, token, args, journal_devnum, 1671 if (handle_mount_opt(sb, p, token, args, journal_devnum,
1665 journal_ioprio, is_remount) < 0) 1672 journal_ioprio, is_remount) < 0)
@@ -1735,7 +1742,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
1735 1742
1736static const char *token2str(int token) 1743static const char *token2str(int token)
1737{ 1744{
1738 static const struct match_token *t; 1745 const struct match_token *t;
1739 1746
1740 for (t = tokens; t->token != Opt_err; t++) 1747 for (t = tokens; t->token != Opt_err; t++)
1741 if (t->token == token && !strchr(t->pattern, '=')) 1748 if (t->token == token && !strchr(t->pattern, '='))
@@ -1818,6 +1825,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
1818 if (nodefs || (test_opt(sb, INIT_INODE_TABLE) && 1825 if (nodefs || (test_opt(sb, INIT_INODE_TABLE) &&
1819 (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) 1826 (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
1820 SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); 1827 SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
1828 if (nodefs || sbi->s_max_dir_size_kb)
1829 SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
1821 1830
1822 ext4_show_quota_options(seq, sb); 1831 ext4_show_quota_options(seq, sb);
1823 return 0; 1832 return 0;
@@ -1909,15 +1918,45 @@ done:
1909 return res; 1918 return res;
1910} 1919}
1911 1920
1921int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
1922{
1923 struct ext4_sb_info *sbi = EXT4_SB(sb);
1924 struct flex_groups *new_groups;
1925 int size;
1926
1927 if (!sbi->s_log_groups_per_flex)
1928 return 0;
1929
1930 size = ext4_flex_group(sbi, ngroup - 1) + 1;
1931 if (size <= sbi->s_flex_groups_allocated)
1932 return 0;
1933
1934 size = roundup_pow_of_two(size * sizeof(struct flex_groups));
1935 new_groups = ext4_kvzalloc(size, GFP_KERNEL);
1936 if (!new_groups) {
1937 ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups",
1938 size / (int) sizeof(struct flex_groups));
1939 return -ENOMEM;
1940 }
1941
1942 if (sbi->s_flex_groups) {
1943 memcpy(new_groups, sbi->s_flex_groups,
1944 (sbi->s_flex_groups_allocated *
1945 sizeof(struct flex_groups)));
1946 ext4_kvfree(sbi->s_flex_groups);
1947 }
1948 sbi->s_flex_groups = new_groups;
1949 sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups);
1950 return 0;
1951}
1952
1912static int ext4_fill_flex_info(struct super_block *sb) 1953static int ext4_fill_flex_info(struct super_block *sb)
1913{ 1954{
1914 struct ext4_sb_info *sbi = EXT4_SB(sb); 1955 struct ext4_sb_info *sbi = EXT4_SB(sb);
1915 struct ext4_group_desc *gdp = NULL; 1956 struct ext4_group_desc *gdp = NULL;
1916 ext4_group_t flex_group_count;
1917 ext4_group_t flex_group; 1957 ext4_group_t flex_group;
1918 unsigned int groups_per_flex = 0; 1958 unsigned int groups_per_flex = 0;
1919 size_t size; 1959 int i, err;
1920 int i;
1921 1960
1922 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; 1961 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
1923 if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { 1962 if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
@@ -1926,17 +1965,9 @@ static int ext4_fill_flex_info(struct super_block *sb)
1926 } 1965 }
1927 groups_per_flex = 1 << sbi->s_log_groups_per_flex; 1966 groups_per_flex = 1 << sbi->s_log_groups_per_flex;
1928 1967
1929 /* We allocate both existing and potentially added groups */ 1968 err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
1930 flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + 1969 if (err)
1931 ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
1932 EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
1933 size = flex_group_count * sizeof(struct flex_groups);
1934 sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL);
1935 if (sbi->s_flex_groups == NULL) {
1936 ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups",
1937 flex_group_count);
1938 goto failed; 1970 goto failed;
1939 }
1940 1971
1941 for (i = 0; i < sbi->s_groups_count; i++) { 1972 for (i = 0; i < sbi->s_groups_count; i++) {
1942 gdp = ext4_get_group_desc(sb, i, NULL); 1973 gdp = ext4_get_group_desc(sb, i, NULL);
@@ -2139,10 +2170,12 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2139 } 2170 }
2140 2171
2141 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { 2172 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
2142 if (es->s_last_orphan) 2173 /* don't clear list on RO mount w/ errors */
2174 if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
2143 jbd_debug(1, "Errors on filesystem, " 2175 jbd_debug(1, "Errors on filesystem, "
2144 "clearing orphan list.\n"); 2176 "clearing orphan list.\n");
2145 es->s_last_orphan = 0; 2177 es->s_last_orphan = 0;
2178 }
2146 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); 2179 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
2147 return; 2180 return;
2148 } 2181 }
@@ -2523,6 +2556,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2523EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 2556EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2524EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 2557EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2525EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); 2558EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
2559EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
2526EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); 2560EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
2527 2561
2528static struct attribute *ext4_attrs[] = { 2562static struct attribute *ext4_attrs[] = {
@@ -2538,6 +2572,7 @@ static struct attribute *ext4_attrs[] = {
2538 ATTR_LIST(mb_stream_req), 2572 ATTR_LIST(mb_stream_req),
2539 ATTR_LIST(mb_group_prealloc), 2573 ATTR_LIST(mb_group_prealloc),
2540 ATTR_LIST(max_writeback_mb_bump), 2574 ATTR_LIST(max_writeback_mb_bump),
2575 ATTR_LIST(extent_max_zeroout_kb),
2541 ATTR_LIST(trigger_fs_error), 2576 ATTR_LIST(trigger_fs_error),
2542 NULL, 2577 NULL,
2543}; 2578};
@@ -2545,10 +2580,12 @@ static struct attribute *ext4_attrs[] = {
2545/* Features this copy of ext4 supports */ 2580/* Features this copy of ext4 supports */
2546EXT4_INFO_ATTR(lazy_itable_init); 2581EXT4_INFO_ATTR(lazy_itable_init);
2547EXT4_INFO_ATTR(batched_discard); 2582EXT4_INFO_ATTR(batched_discard);
2583EXT4_INFO_ATTR(meta_bg_resize);
2548 2584
2549static struct attribute *ext4_feat_attrs[] = { 2585static struct attribute *ext4_feat_attrs[] = {
2550 ATTR_LIST(lazy_itable_init), 2586 ATTR_LIST(lazy_itable_init),
2551 ATTR_LIST(batched_discard), 2587 ATTR_LIST(batched_discard),
2588 ATTR_LIST(meta_bg_resize),
2552 NULL, 2589 NULL,
2553}; 2590};
2554 2591
@@ -3369,7 +3406,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3369 * enable delayed allocation by default 3406 * enable delayed allocation by default
3370 * Use -o nodelalloc to turn it off 3407 * Use -o nodelalloc to turn it off
3371 */ 3408 */
3372 if (!IS_EXT3_SB(sb) && 3409 if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
3373 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) 3410 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
3374 set_opt(sb, DELALLOC); 3411 set_opt(sb, DELALLOC);
3375 3412
@@ -3738,6 +3775,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3738 3775
3739 sbi->s_stripe = ext4_get_stripe_size(sbi); 3776 sbi->s_stripe = ext4_get_stripe_size(sbi);
3740 sbi->s_max_writeback_mb_bump = 128; 3777 sbi->s_max_writeback_mb_bump = 128;
3778 sbi->s_extent_max_zeroout_kb = 32;
3741 3779
3742 /* 3780 /*
3743 * set up enough so that it can read an inode 3781 * set up enough so that it can read an inode
@@ -4514,11 +4552,9 @@ static int ext4_unfreeze(struct super_block *sb)
4514 if (sb->s_flags & MS_RDONLY) 4552 if (sb->s_flags & MS_RDONLY)
4515 return 0; 4553 return 0;
4516 4554
4517 lock_super(sb);
4518 /* Reset the needs_recovery flag before the fs is unlocked. */ 4555 /* Reset the needs_recovery flag before the fs is unlocked. */
4519 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 4556 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
4520 ext4_commit_super(sb, 1); 4557 ext4_commit_super(sb, 1);
4521 unlock_super(sb);
4522 return 0; 4558 return 0;
4523} 4559}
4524 4560
@@ -4554,7 +4590,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4554 char *orig_data = kstrdup(data, GFP_KERNEL); 4590 char *orig_data = kstrdup(data, GFP_KERNEL);
4555 4591
4556 /* Store the original options */ 4592 /* Store the original options */
4557 lock_super(sb);
4558 old_sb_flags = sb->s_flags; 4593 old_sb_flags = sb->s_flags;
4559 old_opts.s_mount_opt = sbi->s_mount_opt; 4594 old_opts.s_mount_opt = sbi->s_mount_opt;
4560 old_opts.s_mount_opt2 = sbi->s_mount_opt2; 4595 old_opts.s_mount_opt2 = sbi->s_mount_opt2;
@@ -4696,7 +4731,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4696 if (sbi->s_journal == NULL) 4731 if (sbi->s_journal == NULL)
4697 ext4_commit_super(sb, 1); 4732 ext4_commit_super(sb, 1);
4698 4733
4699 unlock_super(sb);
4700#ifdef CONFIG_QUOTA 4734#ifdef CONFIG_QUOTA
4701 /* Release old quota file names */ 4735 /* Release old quota file names */
4702 for (i = 0; i < MAXQUOTAS; i++) 4736 for (i = 0; i < MAXQUOTAS; i++)
@@ -4709,10 +4743,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4709 else if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4743 else if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
4710 EXT4_FEATURE_RO_COMPAT_QUOTA)) { 4744 EXT4_FEATURE_RO_COMPAT_QUOTA)) {
4711 err = ext4_enable_quotas(sb); 4745 err = ext4_enable_quotas(sb);
4712 if (err) { 4746 if (err)
4713 lock_super(sb);
4714 goto restore_opts; 4747 goto restore_opts;
4715 }
4716 } 4748 }
4717 } 4749 }
4718#endif 4750#endif
@@ -4739,7 +4771,6 @@ restore_opts:
4739 sbi->s_qf_names[i] = old_opts.s_qf_names[i]; 4771 sbi->s_qf_names[i] = old_opts.s_qf_names[i];
4740 } 4772 }
4741#endif 4773#endif
4742 unlock_super(sb);
4743 kfree(orig_data); 4774 kfree(orig_data);
4744 return err; 4775 return err;
4745} 4776}
@@ -4791,7 +4822,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4791 4822
4792static inline struct inode *dquot_to_inode(struct dquot *dquot) 4823static inline struct inode *dquot_to_inode(struct dquot *dquot)
4793{ 4824{
4794 return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; 4825 return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
4795} 4826}
4796 4827
4797static int ext4_write_dquot(struct dquot *dquot) 4828static int ext4_write_dquot(struct dquot *dquot)
@@ -5264,8 +5295,10 @@ static int __init ext4_init_fs(void)
5264 if (err) 5295 if (err)
5265 goto out6; 5296 goto out6;
5266 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); 5297 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
5267 if (!ext4_kset) 5298 if (!ext4_kset) {
5299 err = -ENOMEM;
5268 goto out5; 5300 goto out5;
5301 }
5269 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 5302 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
5270 5303
5271 err = ext4_init_feat_adverts(); 5304 err = ext4_init_feat_adverts();
diff --git a/fs/fat/Makefile b/fs/fat/Makefile
index e06190322c1c..964b634f6667 100644
--- a/fs/fat/Makefile
+++ b/fs/fat/Makefile
@@ -6,6 +6,6 @@ obj-$(CONFIG_FAT_FS) += fat.o
6obj-$(CONFIG_VFAT_FS) += vfat.o 6obj-$(CONFIG_VFAT_FS) += vfat.o
7obj-$(CONFIG_MSDOS_FS) += msdos.o 7obj-$(CONFIG_MSDOS_FS) += msdos.o
8 8
9fat-y := cache.o dir.o fatent.o file.o inode.o misc.o 9fat-y := cache.o dir.o fatent.o file.o inode.o misc.o nfs.o
10vfat-y := namei_vfat.o 10vfat-y := namei_vfat.o
11msdos-y := namei_msdos.o 11msdos-y := namei_msdos.o
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 1cc7038e273d..91ad9e1c9441 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -190,7 +190,8 @@ static void __fat_cache_inval_inode(struct inode *inode)
190 struct fat_cache *cache; 190 struct fat_cache *cache;
191 191
192 while (!list_empty(&i->cache_lru)) { 192 while (!list_empty(&i->cache_lru)) {
193 cache = list_entry(i->cache_lru.next, struct fat_cache, cache_list); 193 cache = list_entry(i->cache_lru.next,
194 struct fat_cache, cache_list);
194 list_del_init(&cache->cache_list); 195 list_del_init(&cache->cache_list);
195 i->nr_caches--; 196 i->nr_caches--;
196 fat_cache_free(cache); 197 fat_cache_free(cache);
@@ -261,9 +262,10 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
261 if (nr < 0) 262 if (nr < 0)
262 goto out; 263 goto out;
263 else if (nr == FAT_ENT_FREE) { 264 else if (nr == FAT_ENT_FREE) {
264 fat_fs_error_ratelimit(sb, "%s: invalid cluster chain" 265 fat_fs_error_ratelimit(sb,
265 " (i_pos %lld)", __func__, 266 "%s: invalid cluster chain (i_pos %lld)",
266 MSDOS_I(inode)->i_pos); 267 __func__,
268 MSDOS_I(inode)->i_pos);
267 nr = -EIO; 269 nr = -EIO;
268 goto out; 270 goto out;
269 } else if (nr == FAT_ENT_EOF) { 271 } else if (nr == FAT_ENT_EOF) {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index dc49ed2cbffa..2a182342442e 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -18,7 +18,7 @@
18#include <linux/time.h> 18#include <linux/time.h>
19#include <linux/buffer_head.h> 19#include <linux/buffer_head.h>
20#include <linux/compat.h> 20#include <linux/compat.h>
21#include <asm/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include "fat.h" 23#include "fat.h"
24 24
@@ -123,7 +123,8 @@ static inline int fat_get_entry(struct inode *dir, loff_t *pos,
123{ 123{
124 /* Fast stuff first */ 124 /* Fast stuff first */
125 if (*bh && *de && 125 if (*bh && *de &&
126 (*de - (struct msdos_dir_entry *)(*bh)->b_data) < MSDOS_SB(dir->i_sb)->dir_per_block - 1) { 126 (*de - (struct msdos_dir_entry *)(*bh)->b_data) <
127 MSDOS_SB(dir->i_sb)->dir_per_block - 1) {
127 *pos += sizeof(struct msdos_dir_entry); 128 *pos += sizeof(struct msdos_dir_entry);
128 (*de)++; 129 (*de)++;
129 return 0; 130 return 0;
@@ -155,7 +156,8 @@ static int uni16_to_x8(struct super_block *sb, unsigned char *ascii,
155 156
156 while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) { 157 while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) {
157 ec = *ip++; 158 ec = *ip++;
158 if ((charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) { 159 charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE);
160 if (charlen > 0) {
159 op += charlen; 161 op += charlen;
160 len -= charlen; 162 len -= charlen;
161 } else { 163 } else {
@@ -172,12 +174,12 @@ static int uni16_to_x8(struct super_block *sb, unsigned char *ascii,
172 } 174 }
173 175
174 if (unlikely(*ip)) { 176 if (unlikely(*ip)) {
175 fat_msg(sb, KERN_WARNING, "filename was truncated while " 177 fat_msg(sb, KERN_WARNING,
176 "converting."); 178 "filename was truncated while converting.");
177 } 179 }
178 180
179 *op = 0; 181 *op = 0;
180 return (op - ascii); 182 return op - ascii;
181} 183}
182 184
183static inline int fat_uni_to_x8(struct super_block *sb, const wchar_t *uni, 185static inline int fat_uni_to_x8(struct super_block *sb, const wchar_t *uni,
@@ -205,7 +207,8 @@ fat_short2uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni)
205} 207}
206 208
207static inline int 209static inline int
208fat_short2lower_uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni) 210fat_short2lower_uni(struct nls_table *t, unsigned char *c,
211 int clen, wchar_t *uni)
209{ 212{
210 int charlen; 213 int charlen;
211 wchar_t wc; 214 wchar_t wc;
@@ -220,7 +223,8 @@ fat_short2lower_uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *un
220 if (!nc) 223 if (!nc)
221 nc = *c; 224 nc = *c;
222 225
223 if ( (charlen = t->char2uni(&nc, 1, uni)) < 0) { 226 charlen = t->char2uni(&nc, 1, uni);
227 if (charlen < 0) {
224 *uni = 0x003f; /* a question mark */ 228 *uni = 0x003f; /* a question mark */
225 charlen = 1; 229 charlen = 1;
226 } 230 }
@@ -537,7 +541,6 @@ end_of_dir:
537 541
538 return err; 542 return err;
539} 543}
540
541EXPORT_SYMBOL_GPL(fat_search_long); 544EXPORT_SYMBOL_GPL(fat_search_long);
542 545
543struct fat_ioctl_filldir_callback { 546struct fat_ioctl_filldir_callback {
@@ -568,13 +571,14 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
568 int short_len = 0, fill_len = 0; 571 int short_len = 0, fill_len = 0;
569 int ret = 0; 572 int ret = 0;
570 573
571 lock_super(sb); 574 mutex_lock(&sbi->s_lock);
572 575
573 cpos = filp->f_pos; 576 cpos = filp->f_pos;
574 /* Fake . and .. for the root directory. */ 577 /* Fake . and .. for the root directory. */
575 if (inode->i_ino == MSDOS_ROOT_INO) { 578 if (inode->i_ino == MSDOS_ROOT_INO) {
576 while (cpos < 2) { 579 while (cpos < 2) {
577 if (filldir(dirent, "..", cpos+1, cpos, MSDOS_ROOT_INO, DT_DIR) < 0) 580 if (filldir(dirent, "..", cpos+1, cpos,
581 MSDOS_ROOT_INO, DT_DIR) < 0)
578 goto out; 582 goto out;
579 cpos++; 583 cpos++;
580 filp->f_pos++; 584 filp->f_pos++;
@@ -689,7 +693,7 @@ fill_failed:
689 if (unicode) 693 if (unicode)
690 __putname(unicode); 694 __putname(unicode);
691out: 695out:
692 unlock_super(sb); 696 mutex_unlock(&sbi->s_lock);
693 return ret; 697 return ret;
694} 698}
695 699
@@ -872,25 +876,26 @@ static int fat_get_short_entry(struct inode *dir, loff_t *pos,
872} 876}
873 877
874/* 878/*
875 * The ".." entry can not provide the "struct fat_slot_info" informations 879 * The ".." entry can not provide the "struct fat_slot_info" information
876 * for inode. So, this function provide the some informations only. 880 * for inode, nor a usable i_pos. So, this function provides some information
881 * only.
882 *
883 * Since this function walks through the on-disk inodes within a directory,
884 * callers are responsible for taking any locks necessary to prevent the
885 * directory from changing.
877 */ 886 */
878int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh, 887int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
879 struct msdos_dir_entry **de, loff_t *i_pos) 888 struct msdos_dir_entry **de)
880{ 889{
881 loff_t offset; 890 loff_t offset = 0;
882 891
883 offset = 0; 892 *de = NULL;
884 *bh = NULL;
885 while (fat_get_short_entry(dir, &offset, bh, de) >= 0) { 893 while (fat_get_short_entry(dir, &offset, bh, de) >= 0) {
886 if (!strncmp((*de)->name, MSDOS_DOTDOT, MSDOS_NAME)) { 894 if (!strncmp((*de)->name, MSDOS_DOTDOT, MSDOS_NAME))
887 *i_pos = fat_make_i_pos(dir->i_sb, *bh, *de);
888 return 0; 895 return 0;
889 }
890 } 896 }
891 return -ENOENT; 897 return -ENOENT;
892} 898}
893
894EXPORT_SYMBOL_GPL(fat_get_dotdot_entry); 899EXPORT_SYMBOL_GPL(fat_get_dotdot_entry);
895 900
896/* See if directory is empty */ 901/* See if directory is empty */
@@ -913,7 +918,6 @@ int fat_dir_empty(struct inode *dir)
913 brelse(bh); 918 brelse(bh);
914 return result; 919 return result;
915} 920}
916
917EXPORT_SYMBOL_GPL(fat_dir_empty); 921EXPORT_SYMBOL_GPL(fat_dir_empty);
918 922
919/* 923/*
@@ -959,7 +963,6 @@ int fat_scan(struct inode *dir, const unsigned char *name,
959 } 963 }
960 return -ENOENT; 964 return -ENOENT;
961} 965}
962
963EXPORT_SYMBOL_GPL(fat_scan); 966EXPORT_SYMBOL_GPL(fat_scan);
964 967
965static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots) 968static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots)
@@ -1047,7 +1050,6 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
1047 1050
1048 return 0; 1051 return 0;
1049} 1052}
1050
1051EXPORT_SYMBOL_GPL(fat_remove_entries); 1053EXPORT_SYMBOL_GPL(fat_remove_entries);
1052 1054
1053static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used, 1055static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used,
@@ -1141,10 +1143,8 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
1141 de[0].ctime_cs = de[1].ctime_cs = 0; 1143 de[0].ctime_cs = de[1].ctime_cs = 0;
1142 de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = 0; 1144 de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = 0;
1143 } 1145 }
1144 de[0].start = cpu_to_le16(cluster); 1146 fat_set_start(&de[0], cluster);
1145 de[0].starthi = cpu_to_le16(cluster >> 16); 1147 fat_set_start(&de[1], MSDOS_I(dir)->i_logstart);
1146 de[1].start = cpu_to_le16(MSDOS_I(dir)->i_logstart);
1147 de[1].starthi = cpu_to_le16(MSDOS_I(dir)->i_logstart >> 16);
1148 de[0].size = de[1].size = 0; 1148 de[0].size = de[1].size = 0;
1149 memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); 1149 memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de));
1150 set_buffer_uptodate(bhs[0]); 1150 set_buffer_uptodate(bhs[0]);
@@ -1161,7 +1161,6 @@ error_free:
1161error: 1161error:
1162 return err; 1162 return err;
1163} 1163}
1164
1165EXPORT_SYMBOL_GPL(fat_alloc_new_dir); 1164EXPORT_SYMBOL_GPL(fat_alloc_new_dir);
1166 1165
1167static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, 1166static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots,
@@ -1377,5 +1376,4 @@ error_remove:
1377 __fat_remove_entries(dir, pos, free_slots); 1376 __fat_remove_entries(dir, pos, free_slots);
1378 return err; 1377 return err;
1379} 1378}
1380
1381EXPORT_SYMBOL_GPL(fat_add_entries); 1379EXPORT_SYMBOL_GPL(fat_add_entries);
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 2deeeb86f331..623f36f0423b 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -5,6 +5,7 @@
5#include <linux/string.h> 5#include <linux/string.h>
6#include <linux/nls.h> 6#include <linux/nls.h>
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/hash.h>
8#include <linux/mutex.h> 9#include <linux/mutex.h>
9#include <linux/ratelimit.h> 10#include <linux/ratelimit.h>
10#include <linux/msdos_fs.h> 11#include <linux/msdos_fs.h>
@@ -23,30 +24,31 @@
23#define FAT_ERRORS_RO 3 /* remount r/o on error */ 24#define FAT_ERRORS_RO 3 /* remount r/o on error */
24 25
25struct fat_mount_options { 26struct fat_mount_options {
26 uid_t fs_uid; 27 kuid_t fs_uid;
27 gid_t fs_gid; 28 kgid_t fs_gid;
28 unsigned short fs_fmask; 29 unsigned short fs_fmask;
29 unsigned short fs_dmask; 30 unsigned short fs_dmask;
30 unsigned short codepage; /* Codepage for shortname conversions */ 31 unsigned short codepage; /* Codepage for shortname conversions */
31 char *iocharset; /* Charset used for filename input/display */ 32 char *iocharset; /* Charset used for filename input/display */
32 unsigned short shortname; /* flags for shortname display/create rule */ 33 unsigned short shortname; /* flags for shortname display/create rule */
33 unsigned char name_check; /* r = relaxed, n = normal, s = strict */ 34 unsigned char name_check; /* r = relaxed, n = normal, s = strict */
34 unsigned char errors; /* On error: continue, panic, remount-ro */ 35 unsigned char errors; /* On error: continue, panic, remount-ro */
35 unsigned short allow_utime;/* permission for setting the [am]time */ 36 unsigned short allow_utime;/* permission for setting the [am]time */
36 unsigned quiet:1, /* set = fake successful chmods and chowns */ 37 unsigned quiet:1, /* set = fake successful chmods and chowns */
37 showexec:1, /* set = only set x bit for com/exe/bat */ 38 showexec:1, /* set = only set x bit for com/exe/bat */
38 sys_immutable:1, /* set = system files are immutable */ 39 sys_immutable:1, /* set = system files are immutable */
39 dotsOK:1, /* set = hidden and system files are named '.filename' */ 40 dotsOK:1, /* set = hidden and system files are named '.filename' */
40 isvfat:1, /* 0=no vfat long filename support, 1=vfat support */ 41 isvfat:1, /* 0=no vfat long filename support, 1=vfat support */
41 utf8:1, /* Use of UTF-8 character set (Default) */ 42 utf8:1, /* Use of UTF-8 character set (Default) */
42 unicode_xlate:1, /* create escape sequences for unhandled Unicode */ 43 unicode_xlate:1, /* create escape sequences for unhandled Unicode */
43 numtail:1, /* Does first alias have a numeric '~1' type tail? */ 44 numtail:1, /* Does first alias have a numeric '~1' type tail? */
44 flush:1, /* write things quickly */ 45 flush:1, /* write things quickly */
45 nocase:1, /* Does this need case conversion? 0=need case conversion*/ 46 nocase:1, /* Does this need case conversion? 0=need case conversion*/
46 usefree:1, /* Use free_clusters for FAT32 */ 47 usefree:1, /* Use free_clusters for FAT32 */
47 tz_utc:1, /* Filesystem timestamps are in UTC */ 48 tz_utc:1, /* Filesystem timestamps are in UTC */
48 rodir:1, /* allow ATTR_RO for directory */ 49 rodir:1, /* allow ATTR_RO for directory */
49 discard:1; /* Issue discard requests on deletions */ 50 discard:1, /* Issue discard requests on deletions */
51 nfs:1; /* Do extra work needed for NFS export */
50}; 52};
51 53
52#define FAT_HASH_BITS 8 54#define FAT_HASH_BITS 8
@@ -56,28 +58,29 @@ struct fat_mount_options {
56 * MS-DOS file system in-core superblock data 58 * MS-DOS file system in-core superblock data
57 */ 59 */
58struct msdos_sb_info { 60struct msdos_sb_info {
59 unsigned short sec_per_clus; /* sectors/cluster */ 61 unsigned short sec_per_clus; /* sectors/cluster */
60 unsigned short cluster_bits; /* log2(cluster_size) */ 62 unsigned short cluster_bits; /* log2(cluster_size) */
61 unsigned int cluster_size; /* cluster size */ 63 unsigned int cluster_size; /* cluster size */
62 unsigned char fats,fat_bits; /* number of FATs, FAT bits (12 or 16) */ 64 unsigned char fats, fat_bits; /* number of FATs, FAT bits (12 or 16) */
63 unsigned short fat_start; 65 unsigned short fat_start;
64 unsigned long fat_length; /* FAT start & length (sec.) */ 66 unsigned long fat_length; /* FAT start & length (sec.) */
65 unsigned long dir_start; 67 unsigned long dir_start;
66 unsigned short dir_entries; /* root dir start & entries */ 68 unsigned short dir_entries; /* root dir start & entries */
67 unsigned long data_start; /* first data sector */ 69 unsigned long data_start; /* first data sector */
68 unsigned long max_cluster; /* maximum cluster number */ 70 unsigned long max_cluster; /* maximum cluster number */
69 unsigned long root_cluster; /* first cluster of the root directory */ 71 unsigned long root_cluster; /* first cluster of the root directory */
70 unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */ 72 unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */
71 struct mutex fat_lock; 73 struct mutex fat_lock;
74 struct mutex s_lock;
72 unsigned int prev_free; /* previously allocated cluster number */ 75 unsigned int prev_free; /* previously allocated cluster number */
73 unsigned int free_clusters; /* -1 if undefined */ 76 unsigned int free_clusters; /* -1 if undefined */
74 unsigned int free_clus_valid; /* is free_clusters valid? */ 77 unsigned int free_clus_valid; /* is free_clusters valid? */
75 struct fat_mount_options options; 78 struct fat_mount_options options;
76 struct nls_table *nls_disk; /* Codepage used on disk */ 79 struct nls_table *nls_disk; /* Codepage used on disk */
77 struct nls_table *nls_io; /* Charset used for input and display */ 80 struct nls_table *nls_io; /* Charset used for input and display */
78 const void *dir_ops; /* Opaque; default directory operations */ 81 const void *dir_ops; /* Opaque; default directory operations */
79 int dir_per_block; /* dir entries per block */ 82 int dir_per_block; /* dir entries per block */
80 int dir_per_block_bits; /* log2(dir_per_block) */ 83 int dir_per_block_bits; /* log2(dir_per_block) */
81 84
82 int fatent_shift; 85 int fatent_shift;
83 struct fatent_operations *fatent_ops; 86 struct fatent_operations *fatent_ops;
@@ -88,6 +91,9 @@ struct msdos_sb_info {
88 91
89 spinlock_t inode_hash_lock; 92 spinlock_t inode_hash_lock;
90 struct hlist_head inode_hashtable[FAT_HASH_SIZE]; 93 struct hlist_head inode_hashtable[FAT_HASH_SIZE];
94
95 spinlock_t dir_hash_lock;
96 struct hlist_head dir_hashtable[FAT_HASH_SIZE];
91}; 97};
92 98
93#define FAT_CACHE_VALID 0 /* special case for valid cache */ 99#define FAT_CACHE_VALID 0 /* special case for valid cache */
@@ -110,6 +116,7 @@ struct msdos_inode_info {
110 int i_attrs; /* unused attribute bits */ 116 int i_attrs; /* unused attribute bits */
111 loff_t i_pos; /* on-disk position of directory entry or 0 */ 117 loff_t i_pos; /* on-disk position of directory entry or 0 */
112 struct hlist_node i_fat_hash; /* hash by i_location */ 118 struct hlist_node i_fat_hash; /* hash by i_location */
119 struct hlist_node i_dir_hash; /* hash by i_logstart */
113 struct rw_semaphore truncate_lock; /* protect bmap against truncate */ 120 struct rw_semaphore truncate_lock; /* protect bmap against truncate */
114 struct inode vfs_inode; 121 struct inode vfs_inode;
115}; 122};
@@ -262,7 +269,7 @@ extern int fat_subdirs(struct inode *dir);
262extern int fat_scan(struct inode *dir, const unsigned char *name, 269extern int fat_scan(struct inode *dir, const unsigned char *name,
263 struct fat_slot_info *sinfo); 270 struct fat_slot_info *sinfo);
264extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh, 271extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
265 struct msdos_dir_entry **de, loff_t *i_pos); 272 struct msdos_dir_entry **de);
266extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts); 273extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts);
267extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots, 274extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
268 struct fat_slot_info *sinfo); 275 struct fat_slot_info *sinfo);
@@ -322,7 +329,7 @@ extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
322 unsigned long arg); 329 unsigned long arg);
323extern const struct file_operations fat_file_operations; 330extern const struct file_operations fat_file_operations;
324extern const struct inode_operations fat_file_inode_operations; 331extern const struct inode_operations fat_file_inode_operations;
325extern int fat_setattr(struct dentry * dentry, struct iattr * attr); 332extern int fat_setattr(struct dentry *dentry, struct iattr *attr);
326extern void fat_truncate_blocks(struct inode *inode, loff_t offset); 333extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
327extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, 334extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
328 struct kstat *stat); 335 struct kstat *stat);
@@ -340,7 +347,12 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
340 int isvfat, void (*setup)(struct super_block *)); 347 int isvfat, void (*setup)(struct super_block *));
341 348
342extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, 349extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
343 struct inode *i2); 350 struct inode *i2);
351static inline unsigned long fat_dir_hash(int logstart)
352{
353 return hash_32(logstart, FAT_HASH_BITS);
354}
355
344/* fat/misc.c */ 356/* fat/misc.c */
345extern __printf(3, 4) __cold 357extern __printf(3, 4) __cold
346void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...); 358void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...);
@@ -366,6 +378,14 @@ extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
366int fat_cache_init(void); 378int fat_cache_init(void);
367void fat_cache_destroy(void); 379void fat_cache_destroy(void);
368 380
381/* fat/nfs.c */
382struct fid;
383extern struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid,
384 int fh_len, int fh_type);
385extern struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid,
386 int fh_len, int fh_type);
387extern struct dentry *fat_get_parent(struct dentry *child_dir);
388
369/* helper for printk */ 389/* helper for printk */
370typedef unsigned long long llu; 390typedef unsigned long long llu;
371 391
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 31f08ab62c56..260705c58062 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -186,9 +186,6 @@ static void fat16_ent_put(struct fat_entry *fatent, int new)
186 186
187static void fat32_ent_put(struct fat_entry *fatent, int new) 187static void fat32_ent_put(struct fat_entry *fatent, int new)
188{ 188{
189 if (new == FAT_ENT_EOF)
190 new = EOF_FAT32;
191
192 WARN_ON(new & 0xf0000000); 189 WARN_ON(new & 0xf0000000);
193 new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff; 190 new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff;
194 *fatent->u.ent32_p = cpu_to_le32(new); 191 *fatent->u.ent32_p = cpu_to_le32(new);
@@ -203,15 +200,18 @@ static int fat12_ent_next(struct fat_entry *fatent)
203 200
204 fatent->entry++; 201 fatent->entry++;
205 if (fatent->nr_bhs == 1) { 202 if (fatent->nr_bhs == 1) {
206 WARN_ON(ent12_p[0] > (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 2))); 203 WARN_ON(ent12_p[0] > (u8 *)(bhs[0]->b_data +
207 WARN_ON(ent12_p[1] > (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))); 204 (bhs[0]->b_size - 2)));
205 WARN_ON(ent12_p[1] > (u8 *)(bhs[0]->b_data +
206 (bhs[0]->b_size - 1)));
208 if (nextp < (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))) { 207 if (nextp < (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))) {
209 ent12_p[0] = nextp - 1; 208 ent12_p[0] = nextp - 1;
210 ent12_p[1] = nextp; 209 ent12_p[1] = nextp;
211 return 1; 210 return 1;
212 } 211 }
213 } else { 212 } else {
214 WARN_ON(ent12_p[0] != (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))); 213 WARN_ON(ent12_p[0] != (u8 *)(bhs[0]->b_data +
214 (bhs[0]->b_size - 1)));
215 WARN_ON(ent12_p[1] != (u8 *)bhs[1]->b_data); 215 WARN_ON(ent12_p[1] != (u8 *)bhs[1]->b_data);
216 ent12_p[0] = nextp - 1; 216 ent12_p[0] = nextp - 1;
217 ent12_p[1] = nextp; 217 ent12_p[1] = nextp;
@@ -631,7 +631,6 @@ error:
631 631
632 return err; 632 return err;
633} 633}
634
635EXPORT_SYMBOL_GPL(fat_free_clusters); 634EXPORT_SYMBOL_GPL(fat_free_clusters);
636 635
637/* 128kb is the whole sectors for FAT12 and FAT16 */ 636/* 128kb is the whole sectors for FAT12 and FAT16 */
diff --git a/fs/fat/file.c b/fs/fat/file.c
index e007b8bd8e5e..a62e0ecbe2db 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -352,7 +352,7 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
352{ 352{
353 umode_t allow_utime = sbi->options.allow_utime; 353 umode_t allow_utime = sbi->options.allow_utime;
354 354
355 if (current_fsuid() != inode->i_uid) { 355 if (!uid_eq(current_fsuid(), inode->i_uid)) {
356 if (in_group_p(inode->i_gid)) 356 if (in_group_p(inode->i_gid))
357 allow_utime >>= 3; 357 allow_utime >>= 3;
358 if (allow_utime & MAY_WRITE) 358 if (allow_utime & MAY_WRITE)
@@ -407,9 +407,9 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
407 } 407 }
408 408
409 if (((attr->ia_valid & ATTR_UID) && 409 if (((attr->ia_valid & ATTR_UID) &&
410 (attr->ia_uid != sbi->options.fs_uid)) || 410 (!uid_eq(attr->ia_uid, sbi->options.fs_uid))) ||
411 ((attr->ia_valid & ATTR_GID) && 411 ((attr->ia_valid & ATTR_GID) &&
412 (attr->ia_gid != sbi->options.fs_gid)) || 412 (!gid_eq(attr->ia_gid, sbi->options.fs_gid))) ||
413 ((attr->ia_valid & ATTR_MODE) && 413 ((attr->ia_valid & ATTR_MODE) &&
414 (attr->ia_mode & ~FAT_VALID_MODE))) 414 (attr->ia_mode & ~FAT_VALID_MODE)))
415 error = -EPERM; 415 error = -EPERM;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 05e897fe9866..5bafaad00530 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -281,15 +281,42 @@ static inline unsigned long fat_hash(loff_t i_pos)
281 return hash_32(i_pos, FAT_HASH_BITS); 281 return hash_32(i_pos, FAT_HASH_BITS);
282} 282}
283 283
284static void dir_hash_init(struct super_block *sb)
285{
286 struct msdos_sb_info *sbi = MSDOS_SB(sb);
287 int i;
288
289 spin_lock_init(&sbi->dir_hash_lock);
290 for (i = 0; i < FAT_HASH_SIZE; i++)
291 INIT_HLIST_HEAD(&sbi->dir_hashtable[i]);
292}
293
284void fat_attach(struct inode *inode, loff_t i_pos) 294void fat_attach(struct inode *inode, loff_t i_pos)
285{ 295{
286 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); 296 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
287 struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos);
288 297
289 spin_lock(&sbi->inode_hash_lock); 298 if (inode->i_ino != MSDOS_ROOT_INO) {
290 MSDOS_I(inode)->i_pos = i_pos; 299 struct hlist_head *head = sbi->inode_hashtable
291 hlist_add_head(&MSDOS_I(inode)->i_fat_hash, head); 300 + fat_hash(i_pos);
292 spin_unlock(&sbi->inode_hash_lock); 301
302 spin_lock(&sbi->inode_hash_lock);
303 MSDOS_I(inode)->i_pos = i_pos;
304 hlist_add_head(&MSDOS_I(inode)->i_fat_hash, head);
305 spin_unlock(&sbi->inode_hash_lock);
306 }
307
308 /* If NFS support is enabled, cache the mapping of start cluster
309 * to directory inode. This is used during reconnection of
310 * dentries to the filesystem root.
311 */
312 if (S_ISDIR(inode->i_mode) && sbi->options.nfs) {
313 struct hlist_head *d_head = sbi->dir_hashtable;
314 d_head += fat_dir_hash(MSDOS_I(inode)->i_logstart);
315
316 spin_lock(&sbi->dir_hash_lock);
317 hlist_add_head(&MSDOS_I(inode)->i_dir_hash, d_head);
318 spin_unlock(&sbi->dir_hash_lock);
319 }
293} 320}
294EXPORT_SYMBOL_GPL(fat_attach); 321EXPORT_SYMBOL_GPL(fat_attach);
295 322
@@ -300,6 +327,12 @@ void fat_detach(struct inode *inode)
300 MSDOS_I(inode)->i_pos = 0; 327 MSDOS_I(inode)->i_pos = 0;
301 hlist_del_init(&MSDOS_I(inode)->i_fat_hash); 328 hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
302 spin_unlock(&sbi->inode_hash_lock); 329 spin_unlock(&sbi->inode_hash_lock);
330
331 if (S_ISDIR(inode->i_mode) && sbi->options.nfs) {
332 spin_lock(&sbi->dir_hash_lock);
333 hlist_del_init(&MSDOS_I(inode)->i_dir_hash);
334 spin_unlock(&sbi->dir_hash_lock);
335 }
303} 336}
304EXPORT_SYMBOL_GPL(fat_detach); 337EXPORT_SYMBOL_GPL(fat_detach);
305 338
@@ -504,6 +537,7 @@ static void init_once(void *foo)
504 ei->cache_valid_id = FAT_CACHE_VALID + 1; 537 ei->cache_valid_id = FAT_CACHE_VALID + 1;
505 INIT_LIST_HEAD(&ei->cache_lru); 538 INIT_LIST_HEAD(&ei->cache_lru);
506 INIT_HLIST_NODE(&ei->i_fat_hash); 539 INIT_HLIST_NODE(&ei->i_fat_hash);
540 INIT_HLIST_NODE(&ei->i_dir_hash);
507 inode_init_once(&ei->vfs_inode); 541 inode_init_once(&ei->vfs_inode);
508} 542}
509 543
@@ -521,6 +555,11 @@ static int __init fat_init_inodecache(void)
521 555
522static void __exit fat_destroy_inodecache(void) 556static void __exit fat_destroy_inodecache(void)
523{ 557{
558 /*
559 * Make sure all delayed rcu free inodes are flushed before we
560 * destroy cache.
561 */
562 rcu_barrier();
524 kmem_cache_destroy(fat_inode_cachep); 563 kmem_cache_destroy(fat_inode_cachep);
525} 564}
526 565
@@ -634,9 +673,9 @@ static int fat_write_inode(struct inode *inode, struct writeback_control *wbc)
634 if (inode->i_ino == MSDOS_FSINFO_INO) { 673 if (inode->i_ino == MSDOS_FSINFO_INO) {
635 struct super_block *sb = inode->i_sb; 674 struct super_block *sb = inode->i_sb;
636 675
637 lock_super(sb); 676 mutex_lock(&MSDOS_SB(sb)->s_lock);
638 err = fat_clusters_flush(sb); 677 err = fat_clusters_flush(sb);
639 unlock_super(sb); 678 mutex_unlock(&MSDOS_SB(sb)->s_lock);
640 } else 679 } else
641 err = __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 680 err = __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
642 681
@@ -663,125 +702,9 @@ static const struct super_operations fat_sops = {
663 .show_options = fat_show_options, 702 .show_options = fat_show_options,
664}; 703};
665 704
666/*
667 * a FAT file handle with fhtype 3 is
668 * 0/ i_ino - for fast, reliable lookup if still in the cache
669 * 1/ i_generation - to see if i_ino is still valid
670 * bit 0 == 0 iff directory
671 * 2/ i_pos(8-39) - if ino has changed, but still in cache
672 * 3/ i_pos(4-7)|i_logstart - to semi-verify inode found at i_pos
673 * 4/ i_pos(0-3)|parent->i_logstart - maybe used to hunt for the file on disc
674 *
675 * Hack for NFSv2: Maximum FAT entry number is 28bits and maximum
676 * i_pos is 40bits (blocknr(32) + dir offset(8)), so two 4bits
677 * of i_logstart is used to store the directory entry offset.
678 */
679
680static struct dentry *fat_fh_to_dentry(struct super_block *sb,
681 struct fid *fid, int fh_len, int fh_type)
682{
683 struct inode *inode = NULL;
684 u32 *fh = fid->raw;
685
686 if (fh_len < 5 || fh_type != 3)
687 return NULL;
688
689 inode = ilookup(sb, fh[0]);
690 if (!inode || inode->i_generation != fh[1]) {
691 if (inode)
692 iput(inode);
693 inode = NULL;
694 }
695 if (!inode) {
696 loff_t i_pos;
697 int i_logstart = fh[3] & 0x0fffffff;
698
699 i_pos = (loff_t)fh[2] << 8;
700 i_pos |= ((fh[3] >> 24) & 0xf0) | (fh[4] >> 28);
701
702 /* try 2 - see if i_pos is in F-d-c
703 * require i_logstart to be the same
704 * Will fail if you truncate and then re-write
705 */
706
707 inode = fat_iget(sb, i_pos);
708 if (inode && MSDOS_I(inode)->i_logstart != i_logstart) {
709 iput(inode);
710 inode = NULL;
711 }
712 }
713
714 /*
715 * For now, do nothing if the inode is not found.
716 *
717 * What we could do is:
718 *
719 * - follow the file starting at fh[4], and record the ".." entry,
720 * and the name of the fh[2] entry.
721 * - then follow the ".." file finding the next step up.
722 *
723 * This way we build a path to the root of the tree. If this works, we
724 * lookup the path and so get this inode into the cache. Finally try
725 * the fat_iget lookup again. If that fails, then we are totally out
726 * of luck. But all that is for another day
727 */
728 return d_obtain_alias(inode);
729}
730
731static int
732fat_encode_fh(struct inode *inode, __u32 *fh, int *lenp, struct inode *parent)
733{
734 int len = *lenp;
735 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
736 loff_t i_pos;
737
738 if (len < 5) {
739 *lenp = 5;
740 return 255; /* no room */
741 }
742
743 i_pos = fat_i_pos_read(sbi, inode);
744 *lenp = 5;
745 fh[0] = inode->i_ino;
746 fh[1] = inode->i_generation;
747 fh[2] = i_pos >> 8;
748 fh[3] = ((i_pos & 0xf0) << 24) | MSDOS_I(inode)->i_logstart;
749 fh[4] = (i_pos & 0x0f) << 28;
750 if (parent)
751 fh[4] |= MSDOS_I(parent)->i_logstart;
752 return 3;
753}
754
755static struct dentry *fat_get_parent(struct dentry *child)
756{
757 struct super_block *sb = child->d_sb;
758 struct buffer_head *bh;
759 struct msdos_dir_entry *de;
760 loff_t i_pos;
761 struct dentry *parent;
762 struct inode *inode;
763 int err;
764
765 lock_super(sb);
766
767 err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos);
768 if (err) {
769 parent = ERR_PTR(err);
770 goto out;
771 }
772 inode = fat_build_inode(sb, de, i_pos);
773 brelse(bh);
774
775 parent = d_obtain_alias(inode);
776out:
777 unlock_super(sb);
778
779 return parent;
780}
781
782static const struct export_operations fat_export_ops = { 705static const struct export_operations fat_export_ops = {
783 .encode_fh = fat_encode_fh,
784 .fh_to_dentry = fat_fh_to_dentry, 706 .fh_to_dentry = fat_fh_to_dentry,
707 .fh_to_parent = fat_fh_to_parent,
785 .get_parent = fat_get_parent, 708 .get_parent = fat_get_parent,
786}; 709};
787 710
@@ -791,10 +714,12 @@ static int fat_show_options(struct seq_file *m, struct dentry *root)
791 struct fat_mount_options *opts = &sbi->options; 714 struct fat_mount_options *opts = &sbi->options;
792 int isvfat = opts->isvfat; 715 int isvfat = opts->isvfat;
793 716
794 if (opts->fs_uid != 0) 717 if (!uid_eq(opts->fs_uid, GLOBAL_ROOT_UID))
795 seq_printf(m, ",uid=%u", opts->fs_uid); 718 seq_printf(m, ",uid=%u",
796 if (opts->fs_gid != 0) 719 from_kuid_munged(&init_user_ns, opts->fs_uid));
797 seq_printf(m, ",gid=%u", opts->fs_gid); 720 if (!gid_eq(opts->fs_gid, GLOBAL_ROOT_GID))
721 seq_printf(m, ",gid=%u",
722 from_kgid_munged(&init_user_ns, opts->fs_gid));
798 seq_printf(m, ",fmask=%04o", opts->fs_fmask); 723 seq_printf(m, ",fmask=%04o", opts->fs_fmask);
799 seq_printf(m, ",dmask=%04o", opts->fs_dmask); 724 seq_printf(m, ",dmask=%04o", opts->fs_dmask);
800 if (opts->allow_utime) 725 if (opts->allow_utime)
@@ -829,6 +754,8 @@ static int fat_show_options(struct seq_file *m, struct dentry *root)
829 seq_puts(m, ",usefree"); 754 seq_puts(m, ",usefree");
830 if (opts->quiet) 755 if (opts->quiet)
831 seq_puts(m, ",quiet"); 756 seq_puts(m, ",quiet");
757 if (opts->nfs)
758 seq_puts(m, ",nfs");
832 if (opts->showexec) 759 if (opts->showexec)
833 seq_puts(m, ",showexec"); 760 seq_puts(m, ",showexec");
834 if (opts->sys_immutable) 761 if (opts->sys_immutable)
@@ -873,7 +800,7 @@ enum {
873 Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, 800 Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
874 Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, 801 Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
875 Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, 802 Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
876 Opt_err_panic, Opt_err_ro, Opt_discard, Opt_err, 803 Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_err,
877}; 804};
878 805
879static const match_table_t fat_tokens = { 806static const match_table_t fat_tokens = {
@@ -902,6 +829,7 @@ static const match_table_t fat_tokens = {
902 {Opt_err_panic, "errors=panic"}, 829 {Opt_err_panic, "errors=panic"},
903 {Opt_err_ro, "errors=remount-ro"}, 830 {Opt_err_ro, "errors=remount-ro"},
904 {Opt_discard, "discard"}, 831 {Opt_discard, "discard"},
832 {Opt_nfs, "nfs"},
905 {Opt_obsolete, "conv=binary"}, 833 {Opt_obsolete, "conv=binary"},
906 {Opt_obsolete, "conv=text"}, 834 {Opt_obsolete, "conv=text"},
907 {Opt_obsolete, "conv=auto"}, 835 {Opt_obsolete, "conv=auto"},
@@ -982,6 +910,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
982 opts->numtail = 1; 910 opts->numtail = 1;
983 opts->usefree = opts->nocase = 0; 911 opts->usefree = opts->nocase = 0;
984 opts->tz_utc = 0; 912 opts->tz_utc = 0;
913 opts->nfs = 0;
985 opts->errors = FAT_ERRORS_RO; 914 opts->errors = FAT_ERRORS_RO;
986 *debug = 0; 915 *debug = 0;
987 916
@@ -1037,12 +966,16 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
1037 case Opt_uid: 966 case Opt_uid:
1038 if (match_int(&args[0], &option)) 967 if (match_int(&args[0], &option))
1039 return 0; 968 return 0;
1040 opts->fs_uid = option; 969 opts->fs_uid = make_kuid(current_user_ns(), option);
970 if (!uid_valid(opts->fs_uid))
971 return 0;
1041 break; 972 break;
1042 case Opt_gid: 973 case Opt_gid:
1043 if (match_int(&args[0], &option)) 974 if (match_int(&args[0], &option))
1044 return 0; 975 return 0;
1045 opts->fs_gid = option; 976 opts->fs_gid = make_kgid(current_user_ns(), option);
977 if (!gid_valid(opts->fs_gid))
978 return 0;
1046 break; 979 break;
1047 case Opt_umask: 980 case Opt_umask:
1048 if (match_octal(&args[0], &option)) 981 if (match_octal(&args[0], &option))
@@ -1142,6 +1075,9 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
1142 case Opt_discard: 1075 case Opt_discard:
1143 opts->discard = 1; 1076 opts->discard = 1;
1144 break; 1077 break;
1078 case Opt_nfs:
1079 opts->nfs = 1;
1080 break;
1145 1081
1146 /* obsolete mount options */ 1082 /* obsolete mount options */
1147 case Opt_obsolete: 1083 case Opt_obsolete:
@@ -1332,6 +1268,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
1332 b = (struct fat_boot_sector *) bh->b_data; 1268 b = (struct fat_boot_sector *) bh->b_data;
1333 } 1269 }
1334 1270
1271 mutex_init(&sbi->s_lock);
1335 sbi->cluster_size = sb->s_blocksize * sbi->sec_per_clus; 1272 sbi->cluster_size = sb->s_blocksize * sbi->sec_per_clus;
1336 sbi->cluster_bits = ffs(sbi->cluster_size) - 1; 1273 sbi->cluster_bits = ffs(sbi->cluster_size) - 1;
1337 sbi->fats = b->fats; 1274 sbi->fats = b->fats;
@@ -1432,6 +1369,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
1432 1369
1433 /* set up enough so that it can read an inode */ 1370 /* set up enough so that it can read an inode */
1434 fat_hash_init(sb); 1371 fat_hash_init(sb);
1372 dir_hash_init(sb);
1435 fat_ent_access_init(sb); 1373 fat_ent_access_init(sb);
1436 1374
1437 /* 1375 /*
@@ -1486,6 +1424,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
1486 } 1424 }
1487 error = -ENOMEM; 1425 error = -ENOMEM;
1488 insert_inode_hash(root_inode); 1426 insert_inode_hash(root_inode);
1427 fat_attach(root_inode, 0);
1489 sb->s_root = d_make_root(root_inode); 1428 sb->s_root = d_make_root(root_inode);
1490 if (!sb->s_root) { 1429 if (!sb->s_root) {
1491 fat_msg(sb, KERN_ERR, "get root inode failed"); 1430 fat_msg(sb, KERN_ERR, "get root inode failed");
@@ -1525,18 +1464,14 @@ static int writeback_inode(struct inode *inode)
1525{ 1464{
1526 1465
1527 int ret; 1466 int ret;
1528 struct address_space *mapping = inode->i_mapping; 1467
1529 struct writeback_control wbc = { 1468 /* if we used wait=1, sync_inode_metadata waits for the io for the
1530 .sync_mode = WB_SYNC_NONE, 1469 * inode to finish. So wait=0 is sent down to sync_inode_metadata
1531 .nr_to_write = 0,
1532 };
1533 /* if we used WB_SYNC_ALL, sync_inode waits for the io for the
1534 * inode to finish. So WB_SYNC_NONE is sent down to sync_inode
1535 * and filemap_fdatawrite is used for the data blocks 1470 * and filemap_fdatawrite is used for the data blocks
1536 */ 1471 */
1537 ret = sync_inode(inode, &wbc); 1472 ret = sync_inode_metadata(inode, 0);
1538 if (!ret) 1473 if (!ret)
1539 ret = filemap_fdatawrite(mapping); 1474 ret = filemap_fdatawrite(inode->i_mapping);
1540 return ret; 1475 return ret;
1541} 1476}
1542 1477
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index b0e12bf9f4a1..e2cfda94a28d 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -208,7 +208,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
208 struct inode *inode; 208 struct inode *inode;
209 int err; 209 int err;
210 210
211 lock_super(sb); 211 mutex_lock(&MSDOS_SB(sb)->s_lock);
212 err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); 212 err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
213 switch (err) { 213 switch (err) {
214 case -ENOENT: 214 case -ENOENT:
@@ -221,7 +221,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
221 default: 221 default:
222 inode = ERR_PTR(err); 222 inode = ERR_PTR(err);
223 } 223 }
224 unlock_super(sb); 224 mutex_unlock(&MSDOS_SB(sb)->s_lock);
225 return d_splice_alias(inode, dentry); 225 return d_splice_alias(inode, dentry);
226} 226}
227 227
@@ -273,7 +273,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode,
273 unsigned char msdos_name[MSDOS_NAME]; 273 unsigned char msdos_name[MSDOS_NAME];
274 int err, is_hid; 274 int err, is_hid;
275 275
276 lock_super(sb); 276 mutex_lock(&MSDOS_SB(sb)->s_lock);
277 277
278 err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, 278 err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
279 msdos_name, &MSDOS_SB(sb)->options); 279 msdos_name, &MSDOS_SB(sb)->options);
@@ -302,7 +302,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode,
302 302
303 d_instantiate(dentry, inode); 303 d_instantiate(dentry, inode);
304out: 304out:
305 unlock_super(sb); 305 mutex_unlock(&MSDOS_SB(sb)->s_lock);
306 if (!err) 306 if (!err)
307 err = fat_flush_inodes(sb, dir, inode); 307 err = fat_flush_inodes(sb, dir, inode);
308 return err; 308 return err;
@@ -316,7 +316,7 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
316 struct fat_slot_info sinfo; 316 struct fat_slot_info sinfo;
317 int err; 317 int err;
318 318
319 lock_super(sb); 319 mutex_lock(&MSDOS_SB(sb)->s_lock);
320 /* 320 /*
321 * Check whether the directory is not in use, then check 321 * Check whether the directory is not in use, then check
322 * whether it is empty. 322 * whether it is empty.
@@ -337,7 +337,7 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
337 inode->i_ctime = CURRENT_TIME_SEC; 337 inode->i_ctime = CURRENT_TIME_SEC;
338 fat_detach(inode); 338 fat_detach(inode);
339out: 339out:
340 unlock_super(sb); 340 mutex_unlock(&MSDOS_SB(sb)->s_lock);
341 if (!err) 341 if (!err)
342 err = fat_flush_inodes(sb, dir, inode); 342 err = fat_flush_inodes(sb, dir, inode);
343 343
@@ -354,7 +354,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
354 struct timespec ts; 354 struct timespec ts;
355 int err, is_hid, cluster; 355 int err, is_hid, cluster;
356 356
357 lock_super(sb); 357 mutex_lock(&MSDOS_SB(sb)->s_lock);
358 358
359 err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, 359 err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
360 msdos_name, &MSDOS_SB(sb)->options); 360 msdos_name, &MSDOS_SB(sb)->options);
@@ -392,14 +392,14 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
392 392
393 d_instantiate(dentry, inode); 393 d_instantiate(dentry, inode);
394 394
395 unlock_super(sb); 395 mutex_unlock(&MSDOS_SB(sb)->s_lock);
396 fat_flush_inodes(sb, dir, inode); 396 fat_flush_inodes(sb, dir, inode);
397 return 0; 397 return 0;
398 398
399out_free: 399out_free:
400 fat_free_clusters(dir, cluster); 400 fat_free_clusters(dir, cluster);
401out: 401out:
402 unlock_super(sb); 402 mutex_unlock(&MSDOS_SB(sb)->s_lock);
403 return err; 403 return err;
404} 404}
405 405
@@ -407,11 +407,11 @@ out:
407static int msdos_unlink(struct inode *dir, struct dentry *dentry) 407static int msdos_unlink(struct inode *dir, struct dentry *dentry)
408{ 408{
409 struct inode *inode = dentry->d_inode; 409 struct inode *inode = dentry->d_inode;
410 struct super_block *sb= inode->i_sb; 410 struct super_block *sb = inode->i_sb;
411 struct fat_slot_info sinfo; 411 struct fat_slot_info sinfo;
412 int err; 412 int err;
413 413
414 lock_super(sb); 414 mutex_lock(&MSDOS_SB(sb)->s_lock);
415 err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); 415 err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
416 if (err) 416 if (err)
417 goto out; 417 goto out;
@@ -423,7 +423,7 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry)
423 inode->i_ctime = CURRENT_TIME_SEC; 423 inode->i_ctime = CURRENT_TIME_SEC;
424 fat_detach(inode); 424 fat_detach(inode);
425out: 425out:
426 unlock_super(sb); 426 mutex_unlock(&MSDOS_SB(sb)->s_lock);
427 if (!err) 427 if (!err)
428 err = fat_flush_inodes(sb, dir, inode); 428 err = fat_flush_inodes(sb, dir, inode);
429 429
@@ -440,7 +440,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
440 struct inode *old_inode, *new_inode; 440 struct inode *old_inode, *new_inode;
441 struct fat_slot_info old_sinfo, sinfo; 441 struct fat_slot_info old_sinfo, sinfo;
442 struct timespec ts; 442 struct timespec ts;
443 loff_t dotdot_i_pos, new_i_pos; 443 loff_t new_i_pos;
444 int err, old_attrs, is_dir, update_dotdot, corrupt = 0; 444 int err, old_attrs, is_dir, update_dotdot, corrupt = 0;
445 445
446 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; 446 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
@@ -456,8 +456,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
456 is_dir = S_ISDIR(old_inode->i_mode); 456 is_dir = S_ISDIR(old_inode->i_mode);
457 update_dotdot = (is_dir && old_dir != new_dir); 457 update_dotdot = (is_dir && old_dir != new_dir);
458 if (update_dotdot) { 458 if (update_dotdot) {
459 if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de, 459 if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de)) {
460 &dotdot_i_pos) < 0) {
461 err = -EIO; 460 err = -EIO;
462 goto out; 461 goto out;
463 } 462 }
@@ -607,7 +606,7 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
607 unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME]; 606 unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME];
608 int err, is_hid; 607 int err, is_hid;
609 608
610 lock_super(sb); 609 mutex_lock(&MSDOS_SB(sb)->s_lock);
611 610
612 err = msdos_format_name(old_dentry->d_name.name, 611 err = msdos_format_name(old_dentry->d_name.name,
613 old_dentry->d_name.len, old_msdos_name, 612 old_dentry->d_name.len, old_msdos_name,
@@ -626,7 +625,7 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
626 err = do_msdos_rename(old_dir, old_msdos_name, old_dentry, 625 err = do_msdos_rename(old_dir, old_msdos_name, old_dentry,
627 new_dir, new_msdos_name, new_dentry, is_hid); 626 new_dir, new_msdos_name, new_dentry, is_hid);
628out: 627out:
629 unlock_super(sb); 628 mutex_unlock(&MSDOS_SB(sb)->s_lock);
630 if (!err) 629 if (!err)
631 err = fat_flush_inodes(sb, old_dir, new_dir); 630 err = fat_flush_inodes(sb, old_dir, new_dir);
632 return err; 631 return err;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 6a6d8c0715a1..ac959d655e7d 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -721,7 +721,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
721 struct dentry *alias; 721 struct dentry *alias;
722 int err; 722 int err;
723 723
724 lock_super(sb); 724 mutex_lock(&MSDOS_SB(sb)->s_lock);
725 725
726 err = vfat_find(dir, &dentry->d_name, &sinfo); 726 err = vfat_find(dir, &dentry->d_name, &sinfo);
727 if (err) { 727 if (err) {
@@ -752,13 +752,13 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
752 if (!S_ISDIR(inode->i_mode)) 752 if (!S_ISDIR(inode->i_mode))
753 d_move(alias, dentry); 753 d_move(alias, dentry);
754 iput(inode); 754 iput(inode);
755 unlock_super(sb); 755 mutex_unlock(&MSDOS_SB(sb)->s_lock);
756 return alias; 756 return alias;
757 } else 757 } else
758 dput(alias); 758 dput(alias);
759 759
760out: 760out:
761 unlock_super(sb); 761 mutex_unlock(&MSDOS_SB(sb)->s_lock);
762 dentry->d_time = dentry->d_parent->d_inode->i_version; 762 dentry->d_time = dentry->d_parent->d_inode->i_version;
763 dentry = d_splice_alias(inode, dentry); 763 dentry = d_splice_alias(inode, dentry);
764 if (dentry) 764 if (dentry)
@@ -766,7 +766,7 @@ out:
766 return dentry; 766 return dentry;
767 767
768error: 768error:
769 unlock_super(sb); 769 mutex_unlock(&MSDOS_SB(sb)->s_lock);
770 return ERR_PTR(err); 770 return ERR_PTR(err);
771} 771}
772 772
@@ -779,7 +779,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
779 struct timespec ts; 779 struct timespec ts;
780 int err; 780 int err;
781 781
782 lock_super(sb); 782 mutex_lock(&MSDOS_SB(sb)->s_lock);
783 783
784 ts = CURRENT_TIME_SEC; 784 ts = CURRENT_TIME_SEC;
785 err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo); 785 err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
@@ -800,7 +800,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
800 dentry->d_time = dentry->d_parent->d_inode->i_version; 800 dentry->d_time = dentry->d_parent->d_inode->i_version;
801 d_instantiate(dentry, inode); 801 d_instantiate(dentry, inode);
802out: 802out:
803 unlock_super(sb); 803 mutex_unlock(&MSDOS_SB(sb)->s_lock);
804 return err; 804 return err;
805} 805}
806 806
@@ -811,7 +811,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
811 struct fat_slot_info sinfo; 811 struct fat_slot_info sinfo;
812 int err; 812 int err;
813 813
814 lock_super(sb); 814 mutex_lock(&MSDOS_SB(sb)->s_lock);
815 815
816 err = fat_dir_empty(inode); 816 err = fat_dir_empty(inode);
817 if (err) 817 if (err)
@@ -829,7 +829,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
829 inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; 829 inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
830 fat_detach(inode); 830 fat_detach(inode);
831out: 831out:
832 unlock_super(sb); 832 mutex_unlock(&MSDOS_SB(sb)->s_lock);
833 833
834 return err; 834 return err;
835} 835}
@@ -841,7 +841,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
841 struct fat_slot_info sinfo; 841 struct fat_slot_info sinfo;
842 int err; 842 int err;
843 843
844 lock_super(sb); 844 mutex_lock(&MSDOS_SB(sb)->s_lock);
845 845
846 err = vfat_find(dir, &dentry->d_name, &sinfo); 846 err = vfat_find(dir, &dentry->d_name, &sinfo);
847 if (err) 847 if (err)
@@ -854,7 +854,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
854 inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; 854 inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
855 fat_detach(inode); 855 fat_detach(inode);
856out: 856out:
857 unlock_super(sb); 857 mutex_unlock(&MSDOS_SB(sb)->s_lock);
858 858
859 return err; 859 return err;
860} 860}
@@ -867,7 +867,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
867 struct timespec ts; 867 struct timespec ts;
868 int err, cluster; 868 int err, cluster;
869 869
870 lock_super(sb); 870 mutex_lock(&MSDOS_SB(sb)->s_lock);
871 871
872 ts = CURRENT_TIME_SEC; 872 ts = CURRENT_TIME_SEC;
873 cluster = fat_alloc_new_dir(dir, &ts); 873 cluster = fat_alloc_new_dir(dir, &ts);
@@ -896,13 +896,13 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
896 dentry->d_time = dentry->d_parent->d_inode->i_version; 896 dentry->d_time = dentry->d_parent->d_inode->i_version;
897 d_instantiate(dentry, inode); 897 d_instantiate(dentry, inode);
898 898
899 unlock_super(sb); 899 mutex_unlock(&MSDOS_SB(sb)->s_lock);
900 return 0; 900 return 0;
901 901
902out_free: 902out_free:
903 fat_free_clusters(dir, cluster); 903 fat_free_clusters(dir, cluster);
904out: 904out:
905 unlock_super(sb); 905 mutex_unlock(&MSDOS_SB(sb)->s_lock);
906 return err; 906 return err;
907} 907}
908 908
@@ -914,14 +914,14 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
914 struct inode *old_inode, *new_inode; 914 struct inode *old_inode, *new_inode;
915 struct fat_slot_info old_sinfo, sinfo; 915 struct fat_slot_info old_sinfo, sinfo;
916 struct timespec ts; 916 struct timespec ts;
917 loff_t dotdot_i_pos, new_i_pos; 917 loff_t new_i_pos;
918 int err, is_dir, update_dotdot, corrupt = 0; 918 int err, is_dir, update_dotdot, corrupt = 0;
919 struct super_block *sb = old_dir->i_sb; 919 struct super_block *sb = old_dir->i_sb;
920 920
921 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; 921 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
922 old_inode = old_dentry->d_inode; 922 old_inode = old_dentry->d_inode;
923 new_inode = new_dentry->d_inode; 923 new_inode = new_dentry->d_inode;
924 lock_super(sb); 924 mutex_lock(&MSDOS_SB(sb)->s_lock);
925 err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo); 925 err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
926 if (err) 926 if (err)
927 goto out; 927 goto out;
@@ -929,8 +929,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
929 is_dir = S_ISDIR(old_inode->i_mode); 929 is_dir = S_ISDIR(old_inode->i_mode);
930 update_dotdot = (is_dir && old_dir != new_dir); 930 update_dotdot = (is_dir && old_dir != new_dir);
931 if (update_dotdot) { 931 if (update_dotdot) {
932 if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de, 932 if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de)) {
933 &dotdot_i_pos) < 0) {
934 err = -EIO; 933 err = -EIO;
935 goto out; 934 goto out;
936 } 935 }
@@ -997,7 +996,7 @@ out:
997 brelse(sinfo.bh); 996 brelse(sinfo.bh);
998 brelse(dotdot_bh); 997 brelse(dotdot_bh);
999 brelse(old_sinfo.bh); 998 brelse(old_sinfo.bh);
1000 unlock_super(sb); 999 mutex_unlock(&MSDOS_SB(sb)->s_lock);
1001 1000
1002 return err; 1001 return err;
1003 1002
diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c
new file mode 100644
index 000000000000..ef4b5faba87b
--- /dev/null
+++ b/fs/fat/nfs.c
@@ -0,0 +1,101 @@
1/* fs/fat/nfs.c
2 *
3 * This software is licensed under the terms of the GNU General Public
4 * License version 2, as published by the Free Software Foundation, and
5 * may be copied, distributed, and modified under those terms.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 */
13
14#include <linux/exportfs.h>
15#include "fat.h"
16
17/**
18 * Look up a directory inode given its starting cluster.
19 */
20static struct inode *fat_dget(struct super_block *sb, int i_logstart)
21{
22 struct msdos_sb_info *sbi = MSDOS_SB(sb);
23 struct hlist_head *head;
24 struct hlist_node *_p;
25 struct msdos_inode_info *i;
26 struct inode *inode = NULL;
27
28 head = sbi->dir_hashtable + fat_dir_hash(i_logstart);
29 spin_lock(&sbi->dir_hash_lock);
30 hlist_for_each_entry(i, _p, head, i_dir_hash) {
31 BUG_ON(i->vfs_inode.i_sb != sb);
32 if (i->i_logstart != i_logstart)
33 continue;
34 inode = igrab(&i->vfs_inode);
35 if (inode)
36 break;
37 }
38 spin_unlock(&sbi->dir_hash_lock);
39 return inode;
40}
41
42static struct inode *fat_nfs_get_inode(struct super_block *sb,
43 u64 ino, u32 generation)
44{
45 struct inode *inode;
46
47 if ((ino < MSDOS_ROOT_INO) || (ino == MSDOS_FSINFO_INO))
48 return NULL;
49
50 inode = ilookup(sb, ino);
51 if (inode && generation && (inode->i_generation != generation)) {
52 iput(inode);
53 inode = NULL;
54 }
55
56 return inode;
57}
58
59/**
60 * Map a NFS file handle to a corresponding dentry.
61 * The dentry may or may not be connected to the filesystem root.
62 */
63struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid,
64 int fh_len, int fh_type)
65{
66 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
67 fat_nfs_get_inode);
68}
69
70/*
71 * Find the parent for a file specified by NFS handle.
72 * This requires that the handle contain the i_ino of the parent.
73 */
74struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid,
75 int fh_len, int fh_type)
76{
77 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
78 fat_nfs_get_inode);
79}
80
81/*
82 * Find the parent for a directory that is not currently connected to
83 * the filesystem root.
84 *
85 * On entry, the caller holds child_dir->d_inode->i_mutex.
86 */
87struct dentry *fat_get_parent(struct dentry *child_dir)
88{
89 struct super_block *sb = child_dir->d_sb;
90 struct buffer_head *bh = NULL;
91 struct msdos_dir_entry *de;
92 struct inode *parent_inode = NULL;
93
94 if (!fat_get_dotdot_entry(child_dir->d_inode, &bh, &de)) {
95 int parent_logstart = fat_get_start(MSDOS_SB(sb), de);
96 parent_inode = fat_dget(sb, parent_logstart);
97 }
98 brelse(bh);
99
100 return d_obtain_alias(parent_inode);
101}
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 887b5ba8c9b5..71a600a19f06 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -26,124 +26,6 @@
26#include <asm/siginfo.h> 26#include <asm/siginfo.h>
27#include <asm/uaccess.h> 27#include <asm/uaccess.h>
28 28
29void set_close_on_exec(unsigned int fd, int flag)
30{
31 struct files_struct *files = current->files;
32 struct fdtable *fdt;
33 spin_lock(&files->file_lock);
34 fdt = files_fdtable(files);
35 if (flag)
36 __set_close_on_exec(fd, fdt);
37 else
38 __clear_close_on_exec(fd, fdt);
39 spin_unlock(&files->file_lock);
40}
41
42static bool get_close_on_exec(unsigned int fd)
43{
44 struct files_struct *files = current->files;
45 struct fdtable *fdt;
46 bool res;
47 rcu_read_lock();
48 fdt = files_fdtable(files);
49 res = close_on_exec(fd, fdt);
50 rcu_read_unlock();
51 return res;
52}
53
54SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
55{
56 int err = -EBADF;
57 struct file * file, *tofree;
58 struct files_struct * files = current->files;
59 struct fdtable *fdt;
60
61 if ((flags & ~O_CLOEXEC) != 0)
62 return -EINVAL;
63
64 if (unlikely(oldfd == newfd))
65 return -EINVAL;
66
67 spin_lock(&files->file_lock);
68 err = expand_files(files, newfd);
69 file = fcheck(oldfd);
70 if (unlikely(!file))
71 goto Ebadf;
72 if (unlikely(err < 0)) {
73 if (err == -EMFILE)
74 goto Ebadf;
75 goto out_unlock;
76 }
77 /*
78 * We need to detect attempts to do dup2() over allocated but still
79 * not finished descriptor. NB: OpenBSD avoids that at the price of
80 * extra work in their equivalent of fget() - they insert struct
81 * file immediately after grabbing descriptor, mark it larval if
82 * more work (e.g. actual opening) is needed and make sure that
83 * fget() treats larval files as absent. Potentially interesting,
84 * but while extra work in fget() is trivial, locking implications
85 * and amount of surgery on open()-related paths in VFS are not.
86 * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
87 * deadlocks in rather amusing ways, AFAICS. All of that is out of
88 * scope of POSIX or SUS, since neither considers shared descriptor
89 * tables and this condition does not arise without those.
90 */
91 err = -EBUSY;
92 fdt = files_fdtable(files);
93 tofree = fdt->fd[newfd];
94 if (!tofree && fd_is_open(newfd, fdt))
95 goto out_unlock;
96 get_file(file);
97 rcu_assign_pointer(fdt->fd[newfd], file);
98 __set_open_fd(newfd, fdt);
99 if (flags & O_CLOEXEC)
100 __set_close_on_exec(newfd, fdt);
101 else
102 __clear_close_on_exec(newfd, fdt);
103 spin_unlock(&files->file_lock);
104
105 if (tofree)
106 filp_close(tofree, files);
107
108 return newfd;
109
110Ebadf:
111 err = -EBADF;
112out_unlock:
113 spin_unlock(&files->file_lock);
114 return err;
115}
116
117SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
118{
119 if (unlikely(newfd == oldfd)) { /* corner case */
120 struct files_struct *files = current->files;
121 int retval = oldfd;
122
123 rcu_read_lock();
124 if (!fcheck_files(files, oldfd))
125 retval = -EBADF;
126 rcu_read_unlock();
127 return retval;
128 }
129 return sys_dup3(oldfd, newfd, 0);
130}
131
132SYSCALL_DEFINE1(dup, unsigned int, fildes)
133{
134 int ret = -EBADF;
135 struct file *file = fget_raw(fildes);
136
137 if (file) {
138 ret = get_unused_fd();
139 if (ret >= 0)
140 fd_install(ret, file);
141 else
142 fput(file);
143 }
144 return ret;
145}
146
147#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) 29#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
148 30
149static int setfl(int fd, struct file * filp, unsigned long arg) 31static int setfl(int fd, struct file * filp, unsigned long arg)
@@ -267,7 +149,7 @@ pid_t f_getown(struct file *filp)
267 149
268static int f_setown_ex(struct file *filp, unsigned long arg) 150static int f_setown_ex(struct file *filp, unsigned long arg)
269{ 151{
270 struct f_owner_ex * __user owner_p = (void * __user)arg; 152 struct f_owner_ex __user *owner_p = (void __user *)arg;
271 struct f_owner_ex owner; 153 struct f_owner_ex owner;
272 struct pid *pid; 154 struct pid *pid;
273 int type; 155 int type;
@@ -307,7 +189,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg)
307 189
308static int f_getown_ex(struct file *filp, unsigned long arg) 190static int f_getown_ex(struct file *filp, unsigned long arg)
309{ 191{
310 struct f_owner_ex * __user owner_p = (void * __user)arg; 192 struct f_owner_ex __user *owner_p = (void __user *)arg;
311 struct f_owner_ex owner; 193 struct f_owner_ex owner;
312 int ret = 0; 194 int ret = 0;
313 195
@@ -345,7 +227,7 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
345static int f_getowner_uids(struct file *filp, unsigned long arg) 227static int f_getowner_uids(struct file *filp, unsigned long arg)
346{ 228{
347 struct user_namespace *user_ns = current_user_ns(); 229 struct user_namespace *user_ns = current_user_ns();
348 uid_t * __user dst = (void * __user)arg; 230 uid_t __user *dst = (void __user *)arg;
349 uid_t src[2]; 231 uid_t src[2];
350 int err; 232 int err;
351 233
@@ -373,14 +255,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
373 255
374 switch (cmd) { 256 switch (cmd) {
375 case F_DUPFD: 257 case F_DUPFD:
258 err = f_dupfd(arg, filp, 0);
259 break;
376 case F_DUPFD_CLOEXEC: 260 case F_DUPFD_CLOEXEC:
377 if (arg >= rlimit(RLIMIT_NOFILE)) 261 err = f_dupfd(arg, filp, O_CLOEXEC);
378 break;
379 err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0);
380 if (err >= 0) {
381 get_file(filp);
382 fd_install(err, filp);
383 }
384 break; 262 break;
385 case F_GETFD: 263 case F_GETFD:
386 err = get_close_on_exec(fd) ? FD_CLOEXEC : 0; 264 err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
@@ -470,25 +348,23 @@ static int check_fcntl_cmd(unsigned cmd)
470 348
471SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) 349SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
472{ 350{
473 struct file *filp; 351 struct fd f = fdget_raw(fd);
474 int fput_needed;
475 long err = -EBADF; 352 long err = -EBADF;
476 353
477 filp = fget_raw_light(fd, &fput_needed); 354 if (!f.file)
478 if (!filp)
479 goto out; 355 goto out;
480 356
481 if (unlikely(filp->f_mode & FMODE_PATH)) { 357 if (unlikely(f.file->f_mode & FMODE_PATH)) {
482 if (!check_fcntl_cmd(cmd)) 358 if (!check_fcntl_cmd(cmd))
483 goto out1; 359 goto out1;
484 } 360 }
485 361
486 err = security_file_fcntl(filp, cmd, arg); 362 err = security_file_fcntl(f.file, cmd, arg);
487 if (!err) 363 if (!err)
488 err = do_fcntl(fd, cmd, arg, filp); 364 err = do_fcntl(fd, cmd, arg, f.file);
489 365
490out1: 366out1:
491 fput_light(filp, fput_needed); 367 fdput(f);
492out: 368out:
493 return err; 369 return err;
494} 370}
@@ -497,38 +373,36 @@ out:
497SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, 373SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
498 unsigned long, arg) 374 unsigned long, arg)
499{ 375{
500 struct file * filp; 376 struct fd f = fdget_raw(fd);
501 long err = -EBADF; 377 long err = -EBADF;
502 int fput_needed;
503 378
504 filp = fget_raw_light(fd, &fput_needed); 379 if (!f.file)
505 if (!filp)
506 goto out; 380 goto out;
507 381
508 if (unlikely(filp->f_mode & FMODE_PATH)) { 382 if (unlikely(f.file->f_mode & FMODE_PATH)) {
509 if (!check_fcntl_cmd(cmd)) 383 if (!check_fcntl_cmd(cmd))
510 goto out1; 384 goto out1;
511 } 385 }
512 386
513 err = security_file_fcntl(filp, cmd, arg); 387 err = security_file_fcntl(f.file, cmd, arg);
514 if (err) 388 if (err)
515 goto out1; 389 goto out1;
516 390
517 switch (cmd) { 391 switch (cmd) {
518 case F_GETLK64: 392 case F_GETLK64:
519 err = fcntl_getlk64(filp, (struct flock64 __user *) arg); 393 err = fcntl_getlk64(f.file, (struct flock64 __user *) arg);
520 break; 394 break;
521 case F_SETLK64: 395 case F_SETLK64:
522 case F_SETLKW64: 396 case F_SETLKW64:
523 err = fcntl_setlk64(fd, filp, cmd, 397 err = fcntl_setlk64(fd, f.file, cmd,
524 (struct flock64 __user *) arg); 398 (struct flock64 __user *) arg);
525 break; 399 break;
526 default: 400 default:
527 err = do_fcntl(fd, cmd, arg, filp); 401 err = do_fcntl(fd, cmd, arg, f.file);
528 break; 402 break;
529 } 403 }
530out1: 404out1:
531 fput_light(filp, fput_needed); 405 fdput(f);
532out: 406out:
533 return err; 407 return err;
534} 408}
diff --git a/fs/fhandle.c b/fs/fhandle.c
index a48e4a139be1..f775bfdd6e4a 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -113,24 +113,21 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
113 113
114static struct vfsmount *get_vfsmount_from_fd(int fd) 114static struct vfsmount *get_vfsmount_from_fd(int fd)
115{ 115{
116 struct path path; 116 struct vfsmount *mnt;
117 117
118 if (fd == AT_FDCWD) { 118 if (fd == AT_FDCWD) {
119 struct fs_struct *fs = current->fs; 119 struct fs_struct *fs = current->fs;
120 spin_lock(&fs->lock); 120 spin_lock(&fs->lock);
121 path = fs->pwd; 121 mnt = mntget(fs->pwd.mnt);
122 mntget(path.mnt);
123 spin_unlock(&fs->lock); 122 spin_unlock(&fs->lock);
124 } else { 123 } else {
125 int fput_needed; 124 struct fd f = fdget(fd);
126 struct file *file = fget_light(fd, &fput_needed); 125 if (!f.file)
127 if (!file)
128 return ERR_PTR(-EBADF); 126 return ERR_PTR(-EBADF);
129 path = file->f_path; 127 mnt = mntget(f.file->f_path.mnt);
130 mntget(path.mnt); 128 fdput(f);
131 fput_light(file, fput_needed);
132 } 129 }
133 return path.mnt; 130 return mnt;
134} 131}
135 132
136static int vfs_dentry_acceptable(void *context, struct dentry *dentry) 133static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
diff --git a/fs/file.c b/fs/file.c
index ba3f6053025c..d3b5fa80b71b 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -6,6 +6,7 @@
6 * Manage the dynamic fd arrays in the process files_struct. 6 * Manage the dynamic fd arrays in the process files_struct.
7 */ 7 */
8 8
9#include <linux/syscalls.h>
9#include <linux/export.h> 10#include <linux/export.h>
10#include <linux/fs.h> 11#include <linux/fs.h>
11#include <linux/mm.h> 12#include <linux/mm.h>
@@ -84,22 +85,14 @@ static void free_fdtable_work(struct work_struct *work)
84 } 85 }
85} 86}
86 87
87void free_fdtable_rcu(struct rcu_head *rcu) 88static void free_fdtable_rcu(struct rcu_head *rcu)
88{ 89{
89 struct fdtable *fdt = container_of(rcu, struct fdtable, rcu); 90 struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
90 struct fdtable_defer *fddef; 91 struct fdtable_defer *fddef;
91 92
92 BUG_ON(!fdt); 93 BUG_ON(!fdt);
94 BUG_ON(fdt->max_fds <= NR_OPEN_DEFAULT);
93 95
94 if (fdt->max_fds <= NR_OPEN_DEFAULT) {
95 /*
96 * This fdtable is embedded in the files structure and that
97 * structure itself is getting destroyed.
98 */
99 kmem_cache_free(files_cachep,
100 container_of(fdt, struct files_struct, fdtab));
101 return;
102 }
103 if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) { 96 if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) {
104 kfree(fdt->fd); 97 kfree(fdt->fd);
105 kfree(fdt->open_fds); 98 kfree(fdt->open_fds);
@@ -229,7 +222,7 @@ static int expand_fdtable(struct files_struct *files, int nr)
229 copy_fdtable(new_fdt, cur_fdt); 222 copy_fdtable(new_fdt, cur_fdt);
230 rcu_assign_pointer(files->fdt, new_fdt); 223 rcu_assign_pointer(files->fdt, new_fdt);
231 if (cur_fdt->max_fds > NR_OPEN_DEFAULT) 224 if (cur_fdt->max_fds > NR_OPEN_DEFAULT)
232 free_fdtable(cur_fdt); 225 call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
233 } else { 226 } else {
234 /* Somebody else expanded, so undo our attempt */ 227 /* Somebody else expanded, so undo our attempt */
235 __free_fdtable(new_fdt); 228 __free_fdtable(new_fdt);
@@ -245,19 +238,12 @@ static int expand_fdtable(struct files_struct *files, int nr)
245 * expanded and execution may have blocked. 238 * expanded and execution may have blocked.
246 * The files->file_lock should be held on entry, and will be held on exit. 239 * The files->file_lock should be held on entry, and will be held on exit.
247 */ 240 */
248int expand_files(struct files_struct *files, int nr) 241static int expand_files(struct files_struct *files, int nr)
249{ 242{
250 struct fdtable *fdt; 243 struct fdtable *fdt;
251 244
252 fdt = files_fdtable(files); 245 fdt = files_fdtable(files);
253 246
254 /*
255 * N.B. For clone tasks sharing a files structure, this test
256 * will limit the total number of files that can be opened.
257 */
258 if (nr >= rlimit(RLIMIT_NOFILE))
259 return -EMFILE;
260
261 /* Do we need to expand? */ 247 /* Do we need to expand? */
262 if (nr < fdt->max_fds) 248 if (nr < fdt->max_fds)
263 return 0; 249 return 0;
@@ -270,6 +256,26 @@ int expand_files(struct files_struct *files, int nr)
270 return expand_fdtable(files, nr); 256 return expand_fdtable(files, nr);
271} 257}
272 258
259static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
260{
261 __set_bit(fd, fdt->close_on_exec);
262}
263
264static inline void __clear_close_on_exec(int fd, struct fdtable *fdt)
265{
266 __clear_bit(fd, fdt->close_on_exec);
267}
268
269static inline void __set_open_fd(int fd, struct fdtable *fdt)
270{
271 __set_bit(fd, fdt->open_fds);
272}
273
274static inline void __clear_open_fd(int fd, struct fdtable *fdt)
275{
276 __clear_bit(fd, fdt->open_fds);
277}
278
273static int count_open_files(struct fdtable *fdt) 279static int count_open_files(struct fdtable *fdt)
274{ 280{
275 int size = fdt->max_fds; 281 int size = fdt->max_fds;
@@ -395,6 +401,95 @@ out:
395 return NULL; 401 return NULL;
396} 402}
397 403
404static void close_files(struct files_struct * files)
405{
406 int i, j;
407 struct fdtable *fdt;
408
409 j = 0;
410
411 /*
412 * It is safe to dereference the fd table without RCU or
413 * ->file_lock because this is the last reference to the
414 * files structure. But use RCU to shut RCU-lockdep up.
415 */
416 rcu_read_lock();
417 fdt = files_fdtable(files);
418 rcu_read_unlock();
419 for (;;) {
420 unsigned long set;
421 i = j * BITS_PER_LONG;
422 if (i >= fdt->max_fds)
423 break;
424 set = fdt->open_fds[j++];
425 while (set) {
426 if (set & 1) {
427 struct file * file = xchg(&fdt->fd[i], NULL);
428 if (file) {
429 filp_close(file, files);
430 cond_resched();
431 }
432 }
433 i++;
434 set >>= 1;
435 }
436 }
437}
438
439struct files_struct *get_files_struct(struct task_struct *task)
440{
441 struct files_struct *files;
442
443 task_lock(task);
444 files = task->files;
445 if (files)
446 atomic_inc(&files->count);
447 task_unlock(task);
448
449 return files;
450}
451
452void put_files_struct(struct files_struct *files)
453{
454 struct fdtable *fdt;
455
456 if (atomic_dec_and_test(&files->count)) {
457 close_files(files);
458 /* not really needed, since nobody can see us */
459 rcu_read_lock();
460 fdt = files_fdtable(files);
461 rcu_read_unlock();
462 /* free the arrays if they are not embedded */
463 if (fdt != &files->fdtab)
464 __free_fdtable(fdt);
465 kmem_cache_free(files_cachep, files);
466 }
467}
468
469void reset_files_struct(struct files_struct *files)
470{
471 struct task_struct *tsk = current;
472 struct files_struct *old;
473
474 old = tsk->files;
475 task_lock(tsk);
476 tsk->files = files;
477 task_unlock(tsk);
478 put_files_struct(old);
479}
480
481void exit_files(struct task_struct *tsk)
482{
483 struct files_struct * files = tsk->files;
484
485 if (files) {
486 task_lock(tsk);
487 tsk->files = NULL;
488 task_unlock(tsk);
489 put_files_struct(files);
490 }
491}
492
398static void __devinit fdtable_defer_list_init(int cpu) 493static void __devinit fdtable_defer_list_init(int cpu)
399{ 494{
400 struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); 495 struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
@@ -424,12 +519,18 @@ struct files_struct init_files = {
424 .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), 519 .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
425}; 520};
426 521
522void daemonize_descriptors(void)
523{
524 atomic_inc(&init_files.count);
525 reset_files_struct(&init_files);
526}
527
427/* 528/*
428 * allocate a file descriptor, mark it busy. 529 * allocate a file descriptor, mark it busy.
429 */ 530 */
430int alloc_fd(unsigned start, unsigned flags) 531int __alloc_fd(struct files_struct *files,
532 unsigned start, unsigned end, unsigned flags)
431{ 533{
432 struct files_struct *files = current->files;
433 unsigned int fd; 534 unsigned int fd;
434 int error; 535 int error;
435 struct fdtable *fdt; 536 struct fdtable *fdt;
@@ -444,6 +545,14 @@ repeat:
444 if (fd < fdt->max_fds) 545 if (fd < fdt->max_fds)
445 fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd); 546 fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd);
446 547
548 /*
549 * N.B. For clone tasks sharing a files structure, this test
550 * will limit the total number of files that can be opened.
551 */
552 error = -EMFILE;
553 if (fd >= end)
554 goto out;
555
447 error = expand_files(files, fd); 556 error = expand_files(files, fd);
448 if (error < 0) 557 if (error < 0)
449 goto out; 558 goto out;
@@ -477,8 +586,427 @@ out:
477 return error; 586 return error;
478} 587}
479 588
480int get_unused_fd(void) 589static int alloc_fd(unsigned start, unsigned flags)
590{
591 return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags);
592}
593
594int get_unused_fd_flags(unsigned flags)
595{
596 return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags);
597}
598EXPORT_SYMBOL(get_unused_fd_flags);
599
600static void __put_unused_fd(struct files_struct *files, unsigned int fd)
601{
602 struct fdtable *fdt = files_fdtable(files);
603 __clear_open_fd(fd, fdt);
604 if (fd < files->next_fd)
605 files->next_fd = fd;
606}
607
608void put_unused_fd(unsigned int fd)
609{
610 struct files_struct *files = current->files;
611 spin_lock(&files->file_lock);
612 __put_unused_fd(files, fd);
613 spin_unlock(&files->file_lock);
614}
615
616EXPORT_SYMBOL(put_unused_fd);
617
618/*
619 * Install a file pointer in the fd array.
620 *
621 * The VFS is full of places where we drop the files lock between
622 * setting the open_fds bitmap and installing the file in the file
623 * array. At any such point, we are vulnerable to a dup2() race
624 * installing a file in the array before us. We need to detect this and
625 * fput() the struct file we are about to overwrite in this case.
626 *
627 * It should never happen - if we allow dup2() do it, _really_ bad things
628 * will follow.
629 *
630 * NOTE: __fd_install() variant is really, really low-level; don't
631 * use it unless you are forced to by truly lousy API shoved down
632 * your throat. 'files' *MUST* be either current->files or obtained
633 * by get_files_struct(current) done by whoever had given it to you,
634 * or really bad things will happen. Normally you want to use
635 * fd_install() instead.
636 */
637
638void __fd_install(struct files_struct *files, unsigned int fd,
639 struct file *file)
640{
641 struct fdtable *fdt;
642 spin_lock(&files->file_lock);
643 fdt = files_fdtable(files);
644 BUG_ON(fdt->fd[fd] != NULL);
645 rcu_assign_pointer(fdt->fd[fd], file);
646 spin_unlock(&files->file_lock);
647}
648
649void fd_install(unsigned int fd, struct file *file)
650{
651 __fd_install(current->files, fd, file);
652}
653
654EXPORT_SYMBOL(fd_install);
655
656/*
657 * The same warnings as for __alloc_fd()/__fd_install() apply here...
658 */
659int __close_fd(struct files_struct *files, unsigned fd)
660{
661 struct file *file;
662 struct fdtable *fdt;
663
664 spin_lock(&files->file_lock);
665 fdt = files_fdtable(files);
666 if (fd >= fdt->max_fds)
667 goto out_unlock;
668 file = fdt->fd[fd];
669 if (!file)
670 goto out_unlock;
671 rcu_assign_pointer(fdt->fd[fd], NULL);
672 __clear_close_on_exec(fd, fdt);
673 __put_unused_fd(files, fd);
674 spin_unlock(&files->file_lock);
675 return filp_close(file, files);
676
677out_unlock:
678 spin_unlock(&files->file_lock);
679 return -EBADF;
680}
681
682void do_close_on_exec(struct files_struct *files)
683{
684 unsigned i;
685 struct fdtable *fdt;
686
687 /* exec unshares first */
688 BUG_ON(atomic_read(&files->count) != 1);
689 spin_lock(&files->file_lock);
690 for (i = 0; ; i++) {
691 unsigned long set;
692 unsigned fd = i * BITS_PER_LONG;
693 fdt = files_fdtable(files);
694 if (fd >= fdt->max_fds)
695 break;
696 set = fdt->close_on_exec[i];
697 if (!set)
698 continue;
699 fdt->close_on_exec[i] = 0;
700 for ( ; set ; fd++, set >>= 1) {
701 struct file *file;
702 if (!(set & 1))
703 continue;
704 file = fdt->fd[fd];
705 if (!file)
706 continue;
707 rcu_assign_pointer(fdt->fd[fd], NULL);
708 __put_unused_fd(files, fd);
709 spin_unlock(&files->file_lock);
710 filp_close(file, files);
711 cond_resched();
712 spin_lock(&files->file_lock);
713 }
714
715 }
716 spin_unlock(&files->file_lock);
717}
718
719struct file *fget(unsigned int fd)
720{
721 struct file *file;
722 struct files_struct *files = current->files;
723
724 rcu_read_lock();
725 file = fcheck_files(files, fd);
726 if (file) {
727 /* File object ref couldn't be taken */
728 if (file->f_mode & FMODE_PATH ||
729 !atomic_long_inc_not_zero(&file->f_count))
730 file = NULL;
731 }
732 rcu_read_unlock();
733
734 return file;
735}
736
737EXPORT_SYMBOL(fget);
738
739struct file *fget_raw(unsigned int fd)
740{
741 struct file *file;
742 struct files_struct *files = current->files;
743
744 rcu_read_lock();
745 file = fcheck_files(files, fd);
746 if (file) {
747 /* File object ref couldn't be taken */
748 if (!atomic_long_inc_not_zero(&file->f_count))
749 file = NULL;
750 }
751 rcu_read_unlock();
752
753 return file;
754}
755
756EXPORT_SYMBOL(fget_raw);
757
758/*
759 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
760 *
761 * You can use this instead of fget if you satisfy all of the following
762 * conditions:
763 * 1) You must call fput_light before exiting the syscall and returning control
764 * to userspace (i.e. you cannot remember the returned struct file * after
765 * returning to userspace).
766 * 2) You must not call filp_close on the returned struct file * in between
767 * calls to fget_light and fput_light.
768 * 3) You must not clone the current task in between the calls to fget_light
769 * and fput_light.
770 *
771 * The fput_needed flag returned by fget_light should be passed to the
772 * corresponding fput_light.
773 */
774struct file *fget_light(unsigned int fd, int *fput_needed)
481{ 775{
482 return alloc_fd(0, 0); 776 struct file *file;
777 struct files_struct *files = current->files;
778
779 *fput_needed = 0;
780 if (atomic_read(&files->count) == 1) {
781 file = fcheck_files(files, fd);
782 if (file && (file->f_mode & FMODE_PATH))
783 file = NULL;
784 } else {
785 rcu_read_lock();
786 file = fcheck_files(files, fd);
787 if (file) {
788 if (!(file->f_mode & FMODE_PATH) &&
789 atomic_long_inc_not_zero(&file->f_count))
790 *fput_needed = 1;
791 else
792 /* Didn't get the reference, someone's freed */
793 file = NULL;
794 }
795 rcu_read_unlock();
796 }
797
798 return file;
799}
800EXPORT_SYMBOL(fget_light);
801
802struct file *fget_raw_light(unsigned int fd, int *fput_needed)
803{
804 struct file *file;
805 struct files_struct *files = current->files;
806
807 *fput_needed = 0;
808 if (atomic_read(&files->count) == 1) {
809 file = fcheck_files(files, fd);
810 } else {
811 rcu_read_lock();
812 file = fcheck_files(files, fd);
813 if (file) {
814 if (atomic_long_inc_not_zero(&file->f_count))
815 *fput_needed = 1;
816 else
817 /* Didn't get the reference, someone's freed */
818 file = NULL;
819 }
820 rcu_read_unlock();
821 }
822
823 return file;
824}
825
826void set_close_on_exec(unsigned int fd, int flag)
827{
828 struct files_struct *files = current->files;
829 struct fdtable *fdt;
830 spin_lock(&files->file_lock);
831 fdt = files_fdtable(files);
832 if (flag)
833 __set_close_on_exec(fd, fdt);
834 else
835 __clear_close_on_exec(fd, fdt);
836 spin_unlock(&files->file_lock);
837}
838
839bool get_close_on_exec(unsigned int fd)
840{
841 struct files_struct *files = current->files;
842 struct fdtable *fdt;
843 bool res;
844 rcu_read_lock();
845 fdt = files_fdtable(files);
846 res = close_on_exec(fd, fdt);
847 rcu_read_unlock();
848 return res;
849}
850
851static int do_dup2(struct files_struct *files,
852 struct file *file, unsigned fd, unsigned flags)
853{
854 struct file *tofree;
855 struct fdtable *fdt;
856
857 /*
858 * We need to detect attempts to do dup2() over allocated but still
859 * not finished descriptor. NB: OpenBSD avoids that at the price of
860 * extra work in their equivalent of fget() - they insert struct
861 * file immediately after grabbing descriptor, mark it larval if
862 * more work (e.g. actual opening) is needed and make sure that
863 * fget() treats larval files as absent. Potentially interesting,
864 * but while extra work in fget() is trivial, locking implications
865 * and amount of surgery on open()-related paths in VFS are not.
866 * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
867 * deadlocks in rather amusing ways, AFAICS. All of that is out of
868 * scope of POSIX or SUS, since neither considers shared descriptor
869 * tables and this condition does not arise without those.
870 */
871 fdt = files_fdtable(files);
872 tofree = fdt->fd[fd];
873 if (!tofree && fd_is_open(fd, fdt))
874 goto Ebusy;
875 get_file(file);
876 rcu_assign_pointer(fdt->fd[fd], file);
877 __set_open_fd(fd, fdt);
878 if (flags & O_CLOEXEC)
879 __set_close_on_exec(fd, fdt);
880 else
881 __clear_close_on_exec(fd, fdt);
882 spin_unlock(&files->file_lock);
883
884 if (tofree)
885 filp_close(tofree, files);
886
887 return fd;
888
889Ebusy:
890 spin_unlock(&files->file_lock);
891 return -EBUSY;
892}
893
894int replace_fd(unsigned fd, struct file *file, unsigned flags)
895{
896 int err;
897 struct files_struct *files = current->files;
898
899 if (!file)
900 return __close_fd(files, fd);
901
902 if (fd >= rlimit(RLIMIT_NOFILE))
903 return -EMFILE;
904
905 spin_lock(&files->file_lock);
906 err = expand_files(files, fd);
907 if (unlikely(err < 0))
908 goto out_unlock;
909 return do_dup2(files, file, fd, flags);
910
911out_unlock:
912 spin_unlock(&files->file_lock);
913 return err;
914}
915
916SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
917{
918 int err = -EBADF;
919 struct file *file;
920 struct files_struct *files = current->files;
921
922 if ((flags & ~O_CLOEXEC) != 0)
923 return -EINVAL;
924
925 if (unlikely(oldfd == newfd))
926 return -EINVAL;
927
928 if (newfd >= rlimit(RLIMIT_NOFILE))
929 return -EMFILE;
930
931 spin_lock(&files->file_lock);
932 err = expand_files(files, newfd);
933 file = fcheck(oldfd);
934 if (unlikely(!file))
935 goto Ebadf;
936 if (unlikely(err < 0)) {
937 if (err == -EMFILE)
938 goto Ebadf;
939 goto out_unlock;
940 }
941 return do_dup2(files, file, newfd, flags);
942
943Ebadf:
944 err = -EBADF;
945out_unlock:
946 spin_unlock(&files->file_lock);
947 return err;
948}
949
950SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
951{
952 if (unlikely(newfd == oldfd)) { /* corner case */
953 struct files_struct *files = current->files;
954 int retval = oldfd;
955
956 rcu_read_lock();
957 if (!fcheck_files(files, oldfd))
958 retval = -EBADF;
959 rcu_read_unlock();
960 return retval;
961 }
962 return sys_dup3(oldfd, newfd, 0);
963}
964
965SYSCALL_DEFINE1(dup, unsigned int, fildes)
966{
967 int ret = -EBADF;
968 struct file *file = fget_raw(fildes);
969
970 if (file) {
971 ret = get_unused_fd();
972 if (ret >= 0)
973 fd_install(ret, file);
974 else
975 fput(file);
976 }
977 return ret;
978}
979
980int f_dupfd(unsigned int from, struct file *file, unsigned flags)
981{
982 int err;
983 if (from >= rlimit(RLIMIT_NOFILE))
984 return -EINVAL;
985 err = alloc_fd(from, flags);
986 if (err >= 0) {
987 get_file(file);
988 fd_install(err, file);
989 }
990 return err;
991}
992
993int iterate_fd(struct files_struct *files, unsigned n,
994 int (*f)(const void *, struct file *, unsigned),
995 const void *p)
996{
997 struct fdtable *fdt;
998 struct file *file;
999 int res = 0;
1000 if (!files)
1001 return 0;
1002 spin_lock(&files->file_lock);
1003 fdt = files_fdtable(files);
1004 while (!res && n < fdt->max_fds) {
1005 file = rcu_dereference_check_fdtable(files, fdt->fd[n++]);
1006 if (file)
1007 res = f(p, file, n);
1008 }
1009 spin_unlock(&files->file_lock);
1010 return res;
483} 1011}
484EXPORT_SYMBOL(get_unused_fd); 1012EXPORT_SYMBOL(iterate_fd);
diff --git a/fs/file_table.c b/fs/file_table.c
index 701985e4ccda..a72bf9ddd0d2 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -36,7 +36,7 @@ struct files_stat_struct files_stat = {
36 .max_files = NR_FILE 36 .max_files = NR_FILE
37}; 37};
38 38
39DEFINE_LGLOCK(files_lglock); 39DEFINE_STATIC_LGLOCK(files_lglock);
40 40
41/* SLAB cache for file structures */ 41/* SLAB cache for file structures */
42static struct kmem_cache *filp_cachep __read_mostly; 42static struct kmem_cache *filp_cachep __read_mostly;
@@ -243,10 +243,10 @@ static void __fput(struct file *file)
243 if (file->f_op && file->f_op->fasync) 243 if (file->f_op && file->f_op->fasync)
244 file->f_op->fasync(-1, file, 0); 244 file->f_op->fasync(-1, file, 0);
245 } 245 }
246 ima_file_free(file);
246 if (file->f_op && file->f_op->release) 247 if (file->f_op && file->f_op->release)
247 file->f_op->release(inode, file); 248 file->f_op->release(inode, file);
248 security_file_free(file); 249 security_file_free(file);
249 ima_file_free(file);
250 if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && 250 if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
251 !(file->f_mode & FMODE_PATH))) { 251 !(file->f_mode & FMODE_PATH))) {
252 cdev_put(inode->i_cdev); 252 cdev_put(inode->i_cdev);
@@ -339,112 +339,6 @@ void __fput_sync(struct file *file)
339 339
340EXPORT_SYMBOL(fput); 340EXPORT_SYMBOL(fput);
341 341
342struct file *fget(unsigned int fd)
343{
344 struct file *file;
345 struct files_struct *files = current->files;
346
347 rcu_read_lock();
348 file = fcheck_files(files, fd);
349 if (file) {
350 /* File object ref couldn't be taken */
351 if (file->f_mode & FMODE_PATH ||
352 !atomic_long_inc_not_zero(&file->f_count))
353 file = NULL;
354 }
355 rcu_read_unlock();
356
357 return file;
358}
359
360EXPORT_SYMBOL(fget);
361
362struct file *fget_raw(unsigned int fd)
363{
364 struct file *file;
365 struct files_struct *files = current->files;
366
367 rcu_read_lock();
368 file = fcheck_files(files, fd);
369 if (file) {
370 /* File object ref couldn't be taken */
371 if (!atomic_long_inc_not_zero(&file->f_count))
372 file = NULL;
373 }
374 rcu_read_unlock();
375
376 return file;
377}
378
379EXPORT_SYMBOL(fget_raw);
380
381/*
382 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
383 *
384 * You can use this instead of fget if you satisfy all of the following
385 * conditions:
386 * 1) You must call fput_light before exiting the syscall and returning control
387 * to userspace (i.e. you cannot remember the returned struct file * after
388 * returning to userspace).
389 * 2) You must not call filp_close on the returned struct file * in between
390 * calls to fget_light and fput_light.
391 * 3) You must not clone the current task in between the calls to fget_light
392 * and fput_light.
393 *
394 * The fput_needed flag returned by fget_light should be passed to the
395 * corresponding fput_light.
396 */
397struct file *fget_light(unsigned int fd, int *fput_needed)
398{
399 struct file *file;
400 struct files_struct *files = current->files;
401
402 *fput_needed = 0;
403 if (atomic_read(&files->count) == 1) {
404 file = fcheck_files(files, fd);
405 if (file && (file->f_mode & FMODE_PATH))
406 file = NULL;
407 } else {
408 rcu_read_lock();
409 file = fcheck_files(files, fd);
410 if (file) {
411 if (!(file->f_mode & FMODE_PATH) &&
412 atomic_long_inc_not_zero(&file->f_count))
413 *fput_needed = 1;
414 else
415 /* Didn't get the reference, someone's freed */
416 file = NULL;
417 }
418 rcu_read_unlock();
419 }
420
421 return file;
422}
423
424struct file *fget_raw_light(unsigned int fd, int *fput_needed)
425{
426 struct file *file;
427 struct files_struct *files = current->files;
428
429 *fput_needed = 0;
430 if (atomic_read(&files->count) == 1) {
431 file = fcheck_files(files, fd);
432 } else {
433 rcu_read_lock();
434 file = fcheck_files(files, fd);
435 if (file) {
436 if (atomic_long_inc_not_zero(&file->f_count))
437 *fput_needed = 1;
438 else
439 /* Didn't get the reference, someone's freed */
440 file = NULL;
441 }
442 rcu_read_unlock();
443 }
444
445 return file;
446}
447
448void put_filp(struct file *file) 342void put_filp(struct file *file)
449{ 343{
450 if (atomic_long_dec_and_test(&file->f_count)) { 344 if (atomic_long_dec_and_test(&file->f_count)) {
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 96f24286667a..da165f6adcbf 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(unregister_filesystem);
124static int fs_index(const char __user * __name) 124static int fs_index(const char __user * __name)
125{ 125{
126 struct file_system_type * tmp; 126 struct file_system_type * tmp;
127 char * name; 127 struct filename *name;
128 int err, index; 128 int err, index;
129 129
130 name = getname(__name); 130 name = getname(__name);
@@ -135,7 +135,7 @@ static int fs_index(const char __user * __name)
135 err = -EINVAL; 135 err = -EINVAL;
136 read_lock(&file_systems_lock); 136 read_lock(&file_systems_lock);
137 for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { 137 for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) {
138 if (strcmp(tmp->name,name) == 0) { 138 if (strcmp(tmp->name, name->name) == 0) {
139 err = index; 139 err = index;
140 break; 140 break;
141 } 141 }
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index ef67c95f12d4..f47df72cef17 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -224,8 +224,8 @@ vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip)
224{ 224{
225 225
226 ip->i_mode = vxfs_transmod(vip); 226 ip->i_mode = vxfs_transmod(vip);
227 ip->i_uid = (uid_t)vip->vii_uid; 227 i_uid_write(ip, (uid_t)vip->vii_uid);
228 ip->i_gid = (gid_t)vip->vii_gid; 228 i_gid_write(ip, (gid_t)vip->vii_gid);
229 229
230 set_nlink(ip, vip->vii_nlink); 230 set_nlink(ip, vip->vii_nlink);
231 ip->i_size = vip->vii_size; 231 ip->i_size = vip->vii_size;
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index d4fabd26084e..fed2c8afb3a9 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -279,6 +279,11 @@ static void __exit
279vxfs_cleanup(void) 279vxfs_cleanup(void)
280{ 280{
281 unregister_filesystem(&vxfs_fs_type); 281 unregister_filesystem(&vxfs_fs_type);
282 /*
283 * Make sure all delayed rcu free inodes are flushed before we
284 * destroy cache.
285 */
286 rcu_barrier();
282 kmem_cache_destroy(vxfs_inode_cachep); 287 kmem_cache_destroy(vxfs_inode_cachep);
283} 288}
284 289
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index be3efc4f64f4..51ea267d444c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -63,6 +63,7 @@ int writeback_in_progress(struct backing_dev_info *bdi)
63{ 63{
64 return test_bit(BDI_writeback_running, &bdi->state); 64 return test_bit(BDI_writeback_running, &bdi->state);
65} 65}
66EXPORT_SYMBOL(writeback_in_progress);
66 67
67static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) 68static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
68{ 69{
@@ -248,7 +249,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
248} 249}
249 250
250/* 251/*
251 * Move expired (dirtied after work->older_than_this) dirty inodes from 252 * Move expired (dirtied before work->older_than_this) dirty inodes from
252 * @delaying_queue to @dispatch_queue. 253 * @delaying_queue to @dispatch_queue.
253 */ 254 */
254static int move_expired_inodes(struct list_head *delaying_queue, 255static int move_expired_inodes(struct list_head *delaying_queue,
@@ -438,8 +439,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
438 * setting I_SYNC flag and calling inode_sync_complete() to clear it. 439 * setting I_SYNC flag and calling inode_sync_complete() to clear it.
439 */ 440 */
440static int 441static int
441__writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, 442__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
442 struct writeback_control *wbc)
443{ 443{
444 struct address_space *mapping = inode->i_mapping; 444 struct address_space *mapping = inode->i_mapping;
445 long nr_to_write = wbc->nr_to_write; 445 long nr_to_write = wbc->nr_to_write;
@@ -526,7 +526,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
526 inode->i_state |= I_SYNC; 526 inode->i_state |= I_SYNC;
527 spin_unlock(&inode->i_lock); 527 spin_unlock(&inode->i_lock);
528 528
529 ret = __writeback_single_inode(inode, wb, wbc); 529 ret = __writeback_single_inode(inode, wbc);
530 530
531 spin_lock(&wb->list_lock); 531 spin_lock(&wb->list_lock);
532 spin_lock(&inode->i_lock); 532 spin_lock(&inode->i_lock);
@@ -577,10 +577,6 @@ static long writeback_chunk_size(struct backing_dev_info *bdi,
577/* 577/*
578 * Write a portion of b_io inodes which belong to @sb. 578 * Write a portion of b_io inodes which belong to @sb.
579 * 579 *
580 * If @only_this_sb is true, then find and write all such
581 * inodes. Otherwise write only ones which go sequentially
582 * in reverse order.
583 *
584 * Return the number of pages and/or inodes written. 580 * Return the number of pages and/or inodes written.
585 */ 581 */
586static long writeback_sb_inodes(struct super_block *sb, 582static long writeback_sb_inodes(struct super_block *sb,
@@ -673,7 +669,7 @@ static long writeback_sb_inodes(struct super_block *sb,
673 * We use I_SYNC to pin the inode in memory. While it is set 669 * We use I_SYNC to pin the inode in memory. While it is set
674 * evict_inode() will wait so the inode cannot be freed. 670 * evict_inode() will wait so the inode cannot be freed.
675 */ 671 */
676 __writeback_single_inode(inode, wb, &wbc); 672 __writeback_single_inode(inode, &wbc);
677 673
678 work->nr_pages -= write_chunk - wbc.nr_to_write; 674 work->nr_pages -= write_chunk - wbc.nr_to_write;
679 wrote += write_chunk - wbc.nr_to_write; 675 wrote += write_chunk - wbc.nr_to_write;
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 03ff5b1eba93..75a20c092dd4 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -117,7 +117,7 @@ static ssize_t fuse_conn_max_background_write(struct file *file,
117 const char __user *buf, 117 const char __user *buf,
118 size_t count, loff_t *ppos) 118 size_t count, loff_t *ppos)
119{ 119{
120 unsigned val; 120 unsigned uninitialized_var(val);
121 ssize_t ret; 121 ssize_t ret;
122 122
123 ret = fuse_conn_limit_write(file, buf, count, ppos, &val, 123 ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
@@ -154,7 +154,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
154 const char __user *buf, 154 const char __user *buf,
155 size_t count, loff_t *ppos) 155 size_t count, loff_t *ppos)
156{ 156{
157 unsigned val; 157 unsigned uninitialized_var(val);
158 ssize_t ret; 158 ssize_t ret;
159 159
160 ret = fuse_conn_limit_write(file, buf, count, ppos, &val, 160 ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 3426521f3205..ee8d55042298 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -396,7 +396,7 @@ err_device:
396err_region: 396err_region:
397 unregister_chrdev_region(devt, 1); 397 unregister_chrdev_region(devt, 1);
398err: 398err:
399 fc->conn_error = 1; 399 fuse_conn_kill(fc);
400 goto out; 400 goto out;
401} 401}
402 402
@@ -532,8 +532,6 @@ static int cuse_channel_release(struct inode *inode, struct file *file)
532 cdev_del(cc->cdev); 532 cdev_del(cc->cdev);
533 } 533 }
534 534
535 /* kill connection and shutdown channel */
536 fuse_conn_kill(&cc->fc);
537 rc = fuse_dev_release(inode, file); /* puts the base reference */ 535 rc = fuse_dev_release(inode, file); /* puts the base reference */
538 536
539 return rc; 537 return rc;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 7df2b5e8fbe1..8c23fa7a91e6 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -148,8 +148,7 @@ static struct fuse_req *get_reserved_req(struct fuse_conn *fc,
148 if (ff->reserved_req) { 148 if (ff->reserved_req) {
149 req = ff->reserved_req; 149 req = ff->reserved_req;
150 ff->reserved_req = NULL; 150 ff->reserved_req = NULL;
151 get_file(file); 151 req->stolen_file = get_file(file);
152 req->stolen_file = file;
153 } 152 }
154 spin_unlock(&fc->lock); 153 spin_unlock(&fc->lock);
155 } while (!req); 154 } while (!req);
@@ -1576,6 +1575,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
1576 req->pages[req->num_pages] = page; 1575 req->pages[req->num_pages] = page;
1577 req->num_pages++; 1576 req->num_pages++;
1578 1577
1578 offset = 0;
1579 num -= this_num; 1579 num -= this_num;
1580 total_len += this_num; 1580 total_len += this_num;
1581 index++; 1581 index++;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index aba15f1b7ad2..78d2837bc940 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1379,6 +1379,7 @@ static const struct vm_operations_struct fuse_file_vm_ops = {
1379 .close = fuse_vma_close, 1379 .close = fuse_vma_close,
1380 .fault = filemap_fault, 1380 .fault = filemap_fault,
1381 .page_mkwrite = fuse_page_mkwrite, 1381 .page_mkwrite = fuse_page_mkwrite,
1382 .remap_pages = generic_file_remap_pages,
1382}; 1383};
1383 1384
1384static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) 1385static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index ce0a2838ccd0..f0eda124cffb 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -367,11 +367,6 @@ void fuse_conn_kill(struct fuse_conn *fc)
367 wake_up_all(&fc->waitq); 367 wake_up_all(&fc->waitq);
368 wake_up_all(&fc->blocked_waitq); 368 wake_up_all(&fc->blocked_waitq);
369 wake_up_all(&fc->reserved_req_waitq); 369 wake_up_all(&fc->reserved_req_waitq);
370 mutex_lock(&fuse_mutex);
371 list_del(&fc->entry);
372 fuse_ctl_remove_conn(fc);
373 mutex_unlock(&fuse_mutex);
374 fuse_bdi_destroy(fc);
375} 370}
376EXPORT_SYMBOL_GPL(fuse_conn_kill); 371EXPORT_SYMBOL_GPL(fuse_conn_kill);
377 372
@@ -380,7 +375,14 @@ static void fuse_put_super(struct super_block *sb)
380 struct fuse_conn *fc = get_fuse_conn_super(sb); 375 struct fuse_conn *fc = get_fuse_conn_super(sb);
381 376
382 fuse_send_destroy(fc); 377 fuse_send_destroy(fc);
378
383 fuse_conn_kill(fc); 379 fuse_conn_kill(fc);
380 mutex_lock(&fuse_mutex);
381 list_del(&fc->entry);
382 fuse_ctl_remove_conn(fc);
383 mutex_unlock(&fuse_mutex);
384 fuse_bdi_destroy(fc);
385
384 fuse_conn_put(fc); 386 fuse_conn_put(fc);
385} 387}
386 388
@@ -1195,6 +1197,12 @@ static void fuse_fs_cleanup(void)
1195{ 1197{
1196 unregister_filesystem(&fuse_fs_type); 1198 unregister_filesystem(&fuse_fs_type);
1197 unregister_fuseblk(); 1199 unregister_fuseblk();
1200
1201 /*
1202 * Make sure all delayed rcu free inodes are flushed before we
1203 * destroy cache.
1204 */
1205 rcu_barrier();
1198 kmem_cache_destroy(fuse_inode_cachep); 1206 kmem_cache_destroy(fuse_inode_cachep);
1199} 1207}
1200 1208
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index d0dddaceac59..b3f3676796d3 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -56,7 +56,7 @@ generic_acl_get(struct dentry *dentry, const char *name, void *buffer,
56 acl = get_cached_acl(dentry->d_inode, type); 56 acl = get_cached_acl(dentry->d_inode, type);
57 if (!acl) 57 if (!acl)
58 return -ENODATA; 58 return -ENODATA;
59 error = posix_acl_to_xattr(acl, buffer, size); 59 error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
60 posix_acl_release(acl); 60 posix_acl_release(acl);
61 61
62 return error; 62 return error;
@@ -77,7 +77,7 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value,
77 if (!inode_owner_or_capable(inode)) 77 if (!inode_owner_or_capable(inode))
78 return -EPERM; 78 return -EPERM;
79 if (value) { 79 if (value) {
80 acl = posix_acl_from_xattr(value, size); 80 acl = posix_acl_from_xattr(&init_user_ns, value, size);
81 if (IS_ERR(acl)) 81 if (IS_ERR(acl))
82 return PTR_ERR(acl); 82 return PTR_ERR(acl);
83 } 83 }
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index bd4a5892c93c..f850020ad906 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -63,7 +63,7 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type)
63 if (len == 0) 63 if (len == 0)
64 return NULL; 64 return NULL;
65 65
66 acl = posix_acl_from_xattr(data, len); 66 acl = posix_acl_from_xattr(&init_user_ns, data, len);
67 kfree(data); 67 kfree(data);
68 return acl; 68 return acl;
69} 69}
@@ -88,13 +88,13 @@ static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl)
88 const char *name = gfs2_acl_name(type); 88 const char *name = gfs2_acl_name(type);
89 89
90 BUG_ON(name == NULL); 90 BUG_ON(name == NULL);
91 len = posix_acl_to_xattr(acl, NULL, 0); 91 len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0);
92 if (len == 0) 92 if (len == 0)
93 return 0; 93 return 0;
94 data = kmalloc(len, GFP_NOFS); 94 data = kmalloc(len, GFP_NOFS);
95 if (data == NULL) 95 if (data == NULL)
96 return -ENOMEM; 96 return -ENOMEM;
97 error = posix_acl_to_xattr(acl, data, len); 97 error = posix_acl_to_xattr(&init_user_ns, acl, data, len);
98 if (error < 0) 98 if (error < 0)
99 goto out; 99 goto out;
100 error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); 100 error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS);
@@ -166,12 +166,12 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
166 if (error) 166 if (error)
167 return error; 167 return error;
168 168
169 len = posix_acl_to_xattr(acl, NULL, 0); 169 len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0);
170 data = kmalloc(len, GFP_NOFS); 170 data = kmalloc(len, GFP_NOFS);
171 error = -ENOMEM; 171 error = -ENOMEM;
172 if (data == NULL) 172 if (data == NULL)
173 goto out; 173 goto out;
174 posix_acl_to_xattr(acl, data, len); 174 posix_acl_to_xattr(&init_user_ns, acl, data, len);
175 error = gfs2_xattr_acl_chmod(ip, attr, data); 175 error = gfs2_xattr_acl_chmod(ip, attr, data);
176 kfree(data); 176 kfree(data);
177 set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl); 177 set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl);
@@ -212,7 +212,7 @@ static int gfs2_xattr_system_get(struct dentry *dentry, const char *name,
212 if (acl == NULL) 212 if (acl == NULL)
213 return -ENODATA; 213 return -ENODATA;
214 214
215 error = posix_acl_to_xattr(acl, buffer, size); 215 error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
216 posix_acl_release(acl); 216 posix_acl_release(acl);
217 217
218 return error; 218 return error;
@@ -245,7 +245,7 @@ static int gfs2_xattr_system_set(struct dentry *dentry, const char *name,
245 if (!value) 245 if (!value)
246 goto set_acl; 246 goto set_acl;
247 247
248 acl = posix_acl_from_xattr(value, size); 248 acl = posix_acl_from_xattr(&init_user_ns, value, size);
249 if (!acl) { 249 if (!acl) {
250 /* 250 /*
251 * acl_set_file(3) may request that we set default ACLs with 251 * acl_set_file(3) may request that we set default ACLs with
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index d6526347d386..01c4975da4bc 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -612,6 +612,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
612 struct gfs2_sbd *sdp = GFS2_SB(mapping->host); 612 struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
613 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); 613 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
614 unsigned int data_blocks = 0, ind_blocks = 0, rblocks; 614 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
615 unsigned requested = 0;
615 int alloc_required; 616 int alloc_required;
616 int error = 0; 617 int error = 0;
617 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 618 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
@@ -641,7 +642,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
641 if (error) 642 if (error)
642 goto out_unlock; 643 goto out_unlock;
643 644
644 error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks); 645 requested = data_blocks + ind_blocks;
646 error = gfs2_inplace_reserve(ip, requested);
645 if (error) 647 if (error)
646 goto out_qunlock; 648 goto out_qunlock;
647 } 649 }
@@ -654,7 +656,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
654 if (&ip->i_inode == sdp->sd_rindex) 656 if (&ip->i_inode == sdp->sd_rindex)
655 rblocks += 2 * RES_STATFS; 657 rblocks += 2 * RES_STATFS;
656 if (alloc_required) 658 if (alloc_required)
657 rblocks += gfs2_rg_blocks(ip); 659 rblocks += gfs2_rg_blocks(ip, requested);
658 660
659 error = gfs2_trans_begin(sdp, rblocks, 661 error = gfs2_trans_begin(sdp, rblocks,
660 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); 662 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -868,8 +870,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
868 brelse(dibh); 870 brelse(dibh);
869failed: 871failed:
870 gfs2_trans_end(sdp); 872 gfs2_trans_end(sdp);
871 if (gfs2_mb_reserved(ip)) 873 gfs2_inplace_release(ip);
872 gfs2_inplace_release(ip);
873 if (ip->i_res->rs_qa_qd_num) 874 if (ip->i_res->rs_qa_qd_num)
874 gfs2_quota_unlock(ip); 875 gfs2_quota_unlock(ip);
875 if (inode == sdp->sd_rindex) { 876 if (inode == sdp->sd_rindex) {
@@ -1023,7 +1024,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
1023 offset, nr_segs, gfs2_get_block_direct, 1024 offset, nr_segs, gfs2_get_block_direct,
1024 NULL, NULL, 0); 1025 NULL, NULL, 0);
1025out: 1026out:
1026 gfs2_glock_dq_m(1, &gh); 1027 gfs2_glock_dq(&gh);
1027 gfs2_holder_uninit(&gh); 1028 gfs2_holder_uninit(&gh);
1028 return rv; 1029 return rv;
1029} 1030}
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 49cd7dd4a9fa..1fd3ae237bdd 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -786,7 +786,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
786 goto out_rlist; 786 goto out_rlist;
787 787
788 if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */ 788 if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */
789 gfs2_rs_deltree(ip->i_res); 789 gfs2_rs_deltree(ip, ip->i_res);
790 790
791 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + 791 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
792 RES_INDIRECT + RES_STATFS + RES_QUOTA, 792 RES_INDIRECT + RES_STATFS + RES_QUOTA,
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index e8ed6d4a6181..4767774a5f3e 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -161,6 +161,8 @@ static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
161 case GFS2_SMALL_FH_SIZE: 161 case GFS2_SMALL_FH_SIZE:
162 case GFS2_LARGE_FH_SIZE: 162 case GFS2_LARGE_FH_SIZE:
163 case GFS2_OLD_FH_SIZE: 163 case GFS2_OLD_FH_SIZE:
164 if (fh_len < GFS2_SMALL_FH_SIZE)
165 return NULL;
164 this.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32; 166 this.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32;
165 this.no_formal_ino |= be32_to_cpu(fh[1]); 167 this.no_formal_ino |= be32_to_cpu(fh[1]);
166 this.no_addr = ((u64)be32_to_cpu(fh[2])) << 32; 168 this.no_addr = ((u64)be32_to_cpu(fh[2])) << 32;
@@ -180,6 +182,8 @@ static struct dentry *gfs2_fh_to_parent(struct super_block *sb, struct fid *fid,
180 switch (fh_type) { 182 switch (fh_type) {
181 case GFS2_LARGE_FH_SIZE: 183 case GFS2_LARGE_FH_SIZE:
182 case GFS2_OLD_FH_SIZE: 184 case GFS2_OLD_FH_SIZE:
185 if (fh_len < GFS2_LARGE_FH_SIZE)
186 return NULL;
183 parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32; 187 parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32;
184 parent.no_formal_ino |= be32_to_cpu(fh[5]); 188 parent.no_formal_ino |= be32_to_cpu(fh[5]);
185 parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32; 189 parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index d1d791ef38de..0def0504afc1 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -323,6 +323,29 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
323} 323}
324 324
325/** 325/**
326 * gfs2_size_hint - Give a hint to the size of a write request
327 * @file: The struct file
328 * @offset: The file offset of the write
329 * @size: The length of the write
330 *
331 * When we are about to do a write, this function records the total
332 * write size in order to provide a suitable hint to the lower layers
333 * about how many blocks will be required.
334 *
335 */
336
337static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size)
338{
339 struct inode *inode = filep->f_dentry->d_inode;
340 struct gfs2_sbd *sdp = GFS2_SB(inode);
341 struct gfs2_inode *ip = GFS2_I(inode);
342 size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift;
343 int hint = min_t(size_t, INT_MAX, blks);
344
345 atomic_set(&ip->i_res->rs_sizehint, hint);
346}
347
348/**
326 * gfs2_allocate_page_backing - Use bmap to allocate blocks 349 * gfs2_allocate_page_backing - Use bmap to allocate blocks
327 * @page: The (locked) page to allocate backing for 350 * @page: The (locked) page to allocate backing for
328 * 351 *
@@ -382,8 +405,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
382 if (ret) 405 if (ret)
383 return ret; 406 return ret;
384 407
385 atomic_set(&ip->i_res->rs_sizehint, 408 gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE);
386 PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift);
387 409
388 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 410 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
389 ret = gfs2_glock_nq(&gh); 411 ret = gfs2_glock_nq(&gh);
@@ -419,7 +441,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
419 rblocks += data_blocks ? data_blocks : 1; 441 rblocks += data_blocks ? data_blocks : 1;
420 if (ind_blocks || data_blocks) { 442 if (ind_blocks || data_blocks) {
421 rblocks += RES_STATFS + RES_QUOTA; 443 rblocks += RES_STATFS + RES_QUOTA;
422 rblocks += gfs2_rg_blocks(ip); 444 rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);
423 } 445 }
424 ret = gfs2_trans_begin(sdp, rblocks, 0); 446 ret = gfs2_trans_begin(sdp, rblocks, 0);
425 if (ret) 447 if (ret)
@@ -470,6 +492,7 @@ out:
470static const struct vm_operations_struct gfs2_vm_ops = { 492static const struct vm_operations_struct gfs2_vm_ops = {
471 .fault = filemap_fault, 493 .fault = filemap_fault,
472 .page_mkwrite = gfs2_page_mkwrite, 494 .page_mkwrite = gfs2_page_mkwrite,
495 .remap_pages = generic_file_remap_pages,
473}; 496};
474 497
475/** 498/**
@@ -504,7 +527,6 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
504 return error; 527 return error;
505 } 528 }
506 vma->vm_ops = &gfs2_vm_ops; 529 vma->vm_ops = &gfs2_vm_ops;
507 vma->vm_flags |= VM_CAN_NONLINEAR;
508 530
509 return 0; 531 return 0;
510} 532}
@@ -663,7 +685,8 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
663 if (ret) 685 if (ret)
664 return ret; 686 return ret;
665 687
666 atomic_set(&ip->i_res->rs_sizehint, writesize >> sdp->sd_sb.sb_bsize_shift); 688 gfs2_size_hint(file, pos, writesize);
689
667 if (file->f_flags & O_APPEND) { 690 if (file->f_flags & O_APPEND) {
668 struct gfs2_holder gh; 691 struct gfs2_holder gh;
669 692
@@ -789,7 +812,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
789 if (unlikely(error)) 812 if (unlikely(error))
790 goto out_uninit; 813 goto out_uninit;
791 814
792 atomic_set(&ip->i_res->rs_sizehint, len >> sdp->sd_sb.sb_bsize_shift); 815 gfs2_size_hint(file, offset, len);
793 816
794 while (len > 0) { 817 while (len > 0) {
795 if (len < bytes) 818 if (len < bytes)
@@ -822,7 +845,7 @@ retry:
822 &max_bytes, &data_blocks, &ind_blocks); 845 &max_bytes, &data_blocks, &ind_blocks);
823 846
824 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + 847 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
825 RES_RG_HDR + gfs2_rg_blocks(ip); 848 RES_RG_HDR + gfs2_rg_blocks(ip, data_blocks + ind_blocks);
826 if (gfs2_is_jdata(ip)) 849 if (gfs2_is_jdata(ip))
827 rblocks += data_blocks ? data_blocks : 1; 850 rblocks += data_blocks ? data_blocks : 1;
828 851
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 1ed81f40da0d..e6c2fd53cab2 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -186,20 +186,6 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
186} 186}
187 187
188/** 188/**
189 * __gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
190 * @gl: the glock
191 *
192 * If the glock is demotable, then we add it (or move it) to the end
193 * of the glock LRU list.
194 */
195
196static void __gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
197{
198 if (demote_ok(gl))
199 gfs2_glock_add_to_lru(gl);
200}
201
202/**
203 * gfs2_glock_put_nolock() - Decrement reference count on glock 189 * gfs2_glock_put_nolock() - Decrement reference count on glock
204 * @gl: The glock to put 190 * @gl: The glock to put
205 * 191 *
@@ -883,7 +869,14 @@ static int gfs2_glock_demote_wait(void *word)
883 return 0; 869 return 0;
884} 870}
885 871
886static void wait_on_holder(struct gfs2_holder *gh) 872/**
873 * gfs2_glock_wait - wait on a glock acquisition
874 * @gh: the glock holder
875 *
876 * Returns: 0 on success
877 */
878
879int gfs2_glock_wait(struct gfs2_holder *gh)
887{ 880{
888 unsigned long time1 = jiffies; 881 unsigned long time1 = jiffies;
889 882
@@ -894,12 +887,7 @@ static void wait_on_holder(struct gfs2_holder *gh)
894 gh->gh_gl->gl_hold_time = min(gh->gh_gl->gl_hold_time + 887 gh->gh_gl->gl_hold_time = min(gh->gh_gl->gl_hold_time +
895 GL_GLOCK_HOLD_INCR, 888 GL_GLOCK_HOLD_INCR,
896 GL_GLOCK_MAX_HOLD); 889 GL_GLOCK_MAX_HOLD);
897} 890 return gh->gh_error;
898
899static void wait_on_demote(struct gfs2_glock *gl)
900{
901 might_sleep();
902 wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE);
903} 891}
904 892
905/** 893/**
@@ -929,19 +917,6 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
929 trace_gfs2_demote_rq(gl); 917 trace_gfs2_demote_rq(gl);
930} 918}
931 919
932/**
933 * gfs2_glock_wait - wait on a glock acquisition
934 * @gh: the glock holder
935 *
936 * Returns: 0 on success
937 */
938
939int gfs2_glock_wait(struct gfs2_holder *gh)
940{
941 wait_on_holder(gh);
942 return gh->gh_error;
943}
944
945void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) 920void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
946{ 921{
947 struct va_format vaf; 922 struct va_format vaf;
@@ -979,7 +954,7 @@ __acquires(&gl->gl_spin)
979 struct gfs2_sbd *sdp = gl->gl_sbd; 954 struct gfs2_sbd *sdp = gl->gl_sbd;
980 struct list_head *insert_pt = NULL; 955 struct list_head *insert_pt = NULL;
981 struct gfs2_holder *gh2; 956 struct gfs2_holder *gh2;
982 int try_lock = 0; 957 int try_futile = 0;
983 958
984 BUG_ON(gh->gh_owner_pid == NULL); 959 BUG_ON(gh->gh_owner_pid == NULL);
985 if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) 960 if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
@@ -987,7 +962,7 @@ __acquires(&gl->gl_spin)
987 962
988 if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { 963 if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
989 if (test_bit(GLF_LOCK, &gl->gl_flags)) 964 if (test_bit(GLF_LOCK, &gl->gl_flags))
990 try_lock = 1; 965 try_futile = !may_grant(gl, gh);
991 if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) 966 if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
992 goto fail; 967 goto fail;
993 } 968 }
@@ -996,9 +971,8 @@ __acquires(&gl->gl_spin)
996 if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid && 971 if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
997 (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK))) 972 (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
998 goto trap_recursive; 973 goto trap_recursive;
999 if (try_lock && 974 if (try_futile &&
1000 !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) && 975 !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
1001 !may_grant(gl, gh)) {
1002fail: 976fail:
1003 gh->gh_error = GLR_TRYFAILED; 977 gh->gh_error = GLR_TRYFAILED;
1004 gfs2_holder_wake(gh); 978 gfs2_holder_wake(gh);
@@ -1121,8 +1095,9 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1121 !test_bit(GLF_DEMOTE, &gl->gl_flags)) 1095 !test_bit(GLF_DEMOTE, &gl->gl_flags))
1122 fast_path = 1; 1096 fast_path = 1;
1123 } 1097 }
1124 if (!test_bit(GLF_LFLUSH, &gl->gl_flags)) 1098 if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
1125 __gfs2_glock_schedule_for_reclaim(gl); 1099 gfs2_glock_add_to_lru(gl);
1100
1126 trace_gfs2_glock_queue(gh, 0); 1101 trace_gfs2_glock_queue(gh, 0);
1127 spin_unlock(&gl->gl_spin); 1102 spin_unlock(&gl->gl_spin);
1128 if (likely(fast_path)) 1103 if (likely(fast_path))
@@ -1141,7 +1116,8 @@ void gfs2_glock_dq_wait(struct gfs2_holder *gh)
1141{ 1116{
1142 struct gfs2_glock *gl = gh->gh_gl; 1117 struct gfs2_glock *gl = gh->gh_gl;
1143 gfs2_glock_dq(gh); 1118 gfs2_glock_dq(gh);
1144 wait_on_demote(gl); 1119 might_sleep();
1120 wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE);
1145} 1121}
1146 1122
1147/** 1123/**
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 4bdcf3784187..32cc4fde975c 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -94,6 +94,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
94 /* A shortened, inline version of gfs2_trans_begin() */ 94 /* A shortened, inline version of gfs2_trans_begin() */
95 tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64)); 95 tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
96 tr.tr_ip = (unsigned long)__builtin_return_address(0); 96 tr.tr_ip = (unsigned long)__builtin_return_address(0);
97 sb_start_intwrite(sdp->sd_vfs);
97 gfs2_log_reserve(sdp, tr.tr_reserved); 98 gfs2_log_reserve(sdp, tr.tr_reserved);
98 BUG_ON(current->journal_info); 99 BUG_ON(current->journal_info);
99 current->journal_info = &tr; 100 current->journal_info = &tr;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index aaecc8085fc5..3d469d37345e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -99,9 +99,26 @@ struct gfs2_rgrpd {
99#define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ 99#define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */
100 spinlock_t rd_rsspin; /* protects reservation related vars */ 100 spinlock_t rd_rsspin; /* protects reservation related vars */
101 struct rb_root rd_rstree; /* multi-block reservation tree */ 101 struct rb_root rd_rstree; /* multi-block reservation tree */
102 u32 rd_rs_cnt; /* count of current reservations */
103}; 102};
104 103
104struct gfs2_rbm {
105 struct gfs2_rgrpd *rgd;
106 struct gfs2_bitmap *bi; /* Bitmap must belong to the rgd */
107 u32 offset; /* The offset is bitmap relative */
108};
109
110static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm)
111{
112 return rbm->rgd->rd_data0 + (rbm->bi->bi_start * GFS2_NBBY) + rbm->offset;
113}
114
115static inline bool gfs2_rbm_eq(const struct gfs2_rbm *rbm1,
116 const struct gfs2_rbm *rbm2)
117{
118 return (rbm1->rgd == rbm2->rgd) && (rbm1->bi == rbm2->bi) &&
119 (rbm1->offset == rbm2->offset);
120}
121
105enum gfs2_state_bits { 122enum gfs2_state_bits {
106 BH_Pinned = BH_PrivateStart, 123 BH_Pinned = BH_PrivateStart,
107 BH_Escaped = BH_PrivateStart + 1, 124 BH_Escaped = BH_PrivateStart + 1,
@@ -250,18 +267,11 @@ struct gfs2_blkreserv {
250 /* components used during write (step 1): */ 267 /* components used during write (step 1): */
251 atomic_t rs_sizehint; /* hint of the write size */ 268 atomic_t rs_sizehint; /* hint of the write size */
252 269
253 /* components used during inplace_reserve (step 2): */
254 u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
255
256 /* components used during get_local_rgrp (step 3): */
257 struct gfs2_rgrpd *rs_rgd; /* pointer to the gfs2_rgrpd */
258 struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */ 270 struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */
259 struct rb_node rs_node; /* link to other block reservations */ 271 struct rb_node rs_node; /* link to other block reservations */
260 272 struct gfs2_rbm rs_rbm; /* Start of reservation */
261 /* components used during block searches and assignments (step 4): */
262 struct gfs2_bitmap *rs_bi; /* bitmap for the current allocation */
263 u32 rs_biblk; /* start block relative to the bi */
264 u32 rs_free; /* how many blocks are still free */ 273 u32 rs_free; /* how many blocks are still free */
274 u64 rs_inum; /* Inode number for reservation */
265 275
266 /* ancillary quota stuff */ 276 /* ancillary quota stuff */
267 struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS]; 277 struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 4ce22e547308..381893ceefa4 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -712,14 +712,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
712 if (error) 712 if (error)
713 goto fail_gunlock2; 713 goto fail_gunlock2;
714 714
715 /* The newly created inode needs a reservation so it can allocate 715 error = gfs2_rs_alloc(ip);
716 xattrs. At the same time, we want new blocks allocated to the new 716 if (error)
717 dinode to be as contiguous as possible. Since we allocated the 717 goto fail_gunlock2;
718 dinode block under the directory's reservation, we transfer
719 ownership of that reservation to the new inode. The directory
720 doesn't need a reservation unless it needs a new allocation. */
721 ip->i_res = dip->i_res;
722 dip->i_res = NULL;
723 718
724 error = gfs2_acl_create(dip, inode); 719 error = gfs2_acl_create(dip, inode);
725 if (error) 720 if (error)
@@ -737,10 +732,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
737 brelse(bh); 732 brelse(bh);
738 733
739 gfs2_trans_end(sdp); 734 gfs2_trans_end(sdp);
740 /* Check if we reserved space in the rgrp. Function link_dinode may 735 gfs2_inplace_release(dip);
741 not, depending on whether alloc is required. */
742 if (gfs2_mb_reserved(dip))
743 gfs2_inplace_release(dip);
744 gfs2_quota_unlock(dip); 736 gfs2_quota_unlock(dip);
745 mark_inode_dirty(inode); 737 mark_inode_dirty(inode);
746 gfs2_glock_dq_uninit_m(2, ghs); 738 gfs2_glock_dq_uninit_m(2, ghs);
@@ -897,7 +889,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
897 goto out_gunlock_q; 889 goto out_gunlock_q;
898 890
899 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 891 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
900 gfs2_rg_blocks(dip) + 892 gfs2_rg_blocks(dip, sdp->sd_max_dirres) +
901 2 * RES_DINODE + RES_STATFS + 893 2 * RES_DINODE + RES_STATFS +
902 RES_QUOTA, 0); 894 RES_QUOTA, 0);
903 if (error) 895 if (error)
@@ -1378,7 +1370,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1378 goto out_gunlock_q; 1370 goto out_gunlock_q;
1379 1371
1380 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 1372 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
1381 gfs2_rg_blocks(ndip) + 1373 gfs2_rg_blocks(ndip, sdp->sd_max_dirres) +
1382 4 * RES_DINODE + 4 * RES_LEAF + 1374 4 * RES_DINODE + 4 * RES_LEAF +
1383 RES_STATFS + RES_QUOTA + 4, 0); 1375 RES_STATFS + RES_QUOTA + 4, 0);
1384 if (error) 1376 if (error)
@@ -1722,7 +1714,9 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name,
1722 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 1714 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
1723 ret = gfs2_glock_nq(&gh); 1715 ret = gfs2_glock_nq(&gh);
1724 if (ret == 0) { 1716 if (ret == 0) {
1725 ret = generic_setxattr(dentry, name, data, size, flags); 1717 ret = gfs2_rs_alloc(ip);
1718 if (ret == 0)
1719 ret = generic_setxattr(dentry, name, data, size, flags);
1726 gfs2_glock_dq(&gh); 1720 gfs2_glock_dq(&gh);
1727 } 1721 }
1728 gfs2_holder_uninit(&gh); 1722 gfs2_holder_uninit(&gh);
@@ -1757,7 +1751,9 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
1757 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 1751 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
1758 ret = gfs2_glock_nq(&gh); 1752 ret = gfs2_glock_nq(&gh);
1759 if (ret == 0) { 1753 if (ret == 0) {
1760 ret = generic_removexattr(dentry, name); 1754 ret = gfs2_rs_alloc(ip);
1755 if (ret == 0)
1756 ret = generic_removexattr(dentry, name);
1761 gfs2_glock_dq(&gh); 1757 gfs2_glock_dq(&gh);
1762 } 1758 }
1763 gfs2_holder_uninit(&gh); 1759 gfs2_holder_uninit(&gh);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 4a38db739ca0..0fb6539b0c8c 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1289,7 +1289,7 @@ static void gdlm_unmount(struct gfs2_sbd *sdp)
1289 spin_lock(&ls->ls_recover_spin); 1289 spin_lock(&ls->ls_recover_spin);
1290 set_bit(DFL_UNMOUNT, &ls->ls_recover_flags); 1290 set_bit(DFL_UNMOUNT, &ls->ls_recover_flags);
1291 spin_unlock(&ls->ls_recover_spin); 1291 spin_unlock(&ls->ls_recover_spin);
1292 flush_delayed_work_sync(&sdp->sd_control_work); 1292 flush_delayed_work(&sdp->sd_control_work);
1293 1293
1294 /* mounted_lock and control_lock will be purged in dlm recovery */ 1294 /* mounted_lock and control_lock will be purged in dlm recovery */
1295release: 1295release:
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index e5af9dc420ef..e443966c8106 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -19,6 +19,7 @@
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
21#include <linux/quotaops.h> 21#include <linux/quotaops.h>
22#include <linux/lockdep.h>
22 23
23#include "gfs2.h" 24#include "gfs2.h"
24#include "incore.h" 25#include "incore.h"
@@ -766,6 +767,7 @@ fail:
766 return error; 767 return error;
767} 768}
768 769
770static struct lock_class_key gfs2_quota_imutex_key;
769 771
770static int init_inodes(struct gfs2_sbd *sdp, int undo) 772static int init_inodes(struct gfs2_sbd *sdp, int undo)
771{ 773{
@@ -803,6 +805,12 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
803 fs_err(sdp, "can't get quota file inode: %d\n", error); 805 fs_err(sdp, "can't get quota file inode: %d\n", error);
804 goto fail_rindex; 806 goto fail_rindex;
805 } 807 }
808 /*
809 * i_mutex on quota files is special. Since this inode is hidden system
810 * file, we are safe to define locking ourselves.
811 */
812 lockdep_set_class(&sdp->sd_quota_inode->i_mutex,
813 &gfs2_quota_imutex_key);
806 814
807 error = gfs2_rindex_update(sdp); 815 error = gfs2_rindex_update(sdp);
808 if (error) 816 if (error)
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index a3bde91645c2..40c4b0d42fa8 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -765,6 +765,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
765 struct gfs2_holder *ghs, i_gh; 765 struct gfs2_holder *ghs, i_gh;
766 unsigned int qx, x; 766 unsigned int qx, x;
767 struct gfs2_quota_data *qd; 767 struct gfs2_quota_data *qd;
768 unsigned reserved;
768 loff_t offset; 769 loff_t offset;
769 unsigned int nalloc = 0, blocks; 770 unsigned int nalloc = 0, blocks;
770 int error; 771 int error;
@@ -781,7 +782,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
781 return -ENOMEM; 782 return -ENOMEM;
782 783
783 sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); 784 sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
784 mutex_lock_nested(&ip->i_inode.i_mutex, I_MUTEX_QUOTA); 785 mutex_lock(&ip->i_inode.i_mutex);
785 for (qx = 0; qx < num_qd; qx++) { 786 for (qx = 0; qx < num_qd; qx++) {
786 error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE, 787 error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE,
787 GL_NOCACHE, &ghs[qx]); 788 GL_NOCACHE, &ghs[qx]);
@@ -811,13 +812,13 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
811 * two blocks need to be updated instead of 1 */ 812 * two blocks need to be updated instead of 1 */
812 blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; 813 blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3;
813 814
814 error = gfs2_inplace_reserve(ip, 1 + 815 reserved = 1 + (nalloc * (data_blocks + ind_blocks));
815 (nalloc * (data_blocks + ind_blocks))); 816 error = gfs2_inplace_reserve(ip, reserved);
816 if (error) 817 if (error)
817 goto out_alloc; 818 goto out_alloc;
818 819
819 if (nalloc) 820 if (nalloc)
820 blocks += gfs2_rg_blocks(ip) + nalloc * ind_blocks + RES_STATFS; 821 blocks += gfs2_rg_blocks(ip, reserved) + nalloc * ind_blocks + RES_STATFS;
821 822
822 error = gfs2_trans_begin(sdp, blocks, 0); 823 error = gfs2_trans_begin(sdp, blocks, 0);
823 if (error) 824 if (error)
@@ -1070,8 +1071,10 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
1070 1071
1071 if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) { 1072 if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
1072 print_message(qd, "exceeded"); 1073 print_message(qd, "exceeded");
1073 quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ? 1074 quota_send_warning(make_kqid(&init_user_ns,
1074 USRQUOTA : GRPQUOTA, qd->qd_id, 1075 test_bit(QDF_USER, &qd->qd_flags) ?
1076 USRQUOTA : GRPQUOTA,
1077 qd->qd_id),
1075 sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN); 1078 sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN);
1076 1079
1077 error = -EDQUOT; 1080 error = -EDQUOT;
@@ -1081,8 +1084,10 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
1081 time_after_eq(jiffies, qd->qd_last_warn + 1084 time_after_eq(jiffies, qd->qd_last_warn +
1082 gfs2_tune_get(sdp, 1085 gfs2_tune_get(sdp,
1083 gt_quota_warn_period) * HZ)) { 1086 gt_quota_warn_period) * HZ)) {
1084 quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ? 1087 quota_send_warning(make_kqid(&init_user_ns,
1085 USRQUOTA : GRPQUOTA, qd->qd_id, 1088 test_bit(QDF_USER, &qd->qd_flags) ?
1089 USRQUOTA : GRPQUOTA,
1090 qd->qd_id),
1086 sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN); 1091 sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN);
1087 error = print_message(qd, "warning"); 1092 error = print_message(qd, "warning");
1088 qd->qd_last_warn = jiffies; 1093 qd->qd_last_warn = jiffies;
@@ -1469,7 +1474,7 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
1469 return 0; 1474 return 0;
1470} 1475}
1471 1476
1472static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id, 1477static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
1473 struct fs_disk_quota *fdq) 1478 struct fs_disk_quota *fdq)
1474{ 1479{
1475 struct gfs2_sbd *sdp = sb->s_fs_info; 1480 struct gfs2_sbd *sdp = sb->s_fs_info;
@@ -1477,20 +1482,21 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
1477 struct gfs2_quota_data *qd; 1482 struct gfs2_quota_data *qd;
1478 struct gfs2_holder q_gh; 1483 struct gfs2_holder q_gh;
1479 int error; 1484 int error;
1485 int type;
1480 1486
1481 memset(fdq, 0, sizeof(struct fs_disk_quota)); 1487 memset(fdq, 0, sizeof(struct fs_disk_quota));
1482 1488
1483 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) 1489 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
1484 return -ESRCH; /* Crazy XFS error code */ 1490 return -ESRCH; /* Crazy XFS error code */
1485 1491
1486 if (type == USRQUOTA) 1492 if (qid.type == USRQUOTA)
1487 type = QUOTA_USER; 1493 type = QUOTA_USER;
1488 else if (type == GRPQUOTA) 1494 else if (qid.type == GRPQUOTA)
1489 type = QUOTA_GROUP; 1495 type = QUOTA_GROUP;
1490 else 1496 else
1491 return -EINVAL; 1497 return -EINVAL;
1492 1498
1493 error = qd_get(sdp, type, id, &qd); 1499 error = qd_get(sdp, type, from_kqid(&init_user_ns, qid), &qd);
1494 if (error) 1500 if (error)
1495 return error; 1501 return error;
1496 error = do_glock(qd, FORCE, &q_gh); 1502 error = do_glock(qd, FORCE, &q_gh);
@@ -1500,7 +1506,7 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
1500 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; 1506 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
1501 fdq->d_version = FS_DQUOT_VERSION; 1507 fdq->d_version = FS_DQUOT_VERSION;
1502 fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; 1508 fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
1503 fdq->d_id = id; 1509 fdq->d_id = from_kqid(&init_user_ns, qid);
1504 fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift; 1510 fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift;
1505 fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift; 1511 fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift;
1506 fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift; 1512 fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift;
@@ -1514,7 +1520,7 @@ out:
1514/* GFS2 only supports a subset of the XFS fields */ 1520/* GFS2 only supports a subset of the XFS fields */
1515#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT) 1521#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT)
1516 1522
1517static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, 1523static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
1518 struct fs_disk_quota *fdq) 1524 struct fs_disk_quota *fdq)
1519{ 1525{
1520 struct gfs2_sbd *sdp = sb->s_fs_info; 1526 struct gfs2_sbd *sdp = sb->s_fs_info;
@@ -1526,11 +1532,12 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1526 int alloc_required; 1532 int alloc_required;
1527 loff_t offset; 1533 loff_t offset;
1528 int error; 1534 int error;
1535 int type;
1529 1536
1530 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) 1537 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
1531 return -ESRCH; /* Crazy XFS error code */ 1538 return -ESRCH; /* Crazy XFS error code */
1532 1539
1533 switch(type) { 1540 switch(qid.type) {
1534 case USRQUOTA: 1541 case USRQUOTA:
1535 type = QUOTA_USER; 1542 type = QUOTA_USER;
1536 if (fdq->d_flags != FS_USER_QUOTA) 1543 if (fdq->d_flags != FS_USER_QUOTA)
@@ -1547,10 +1554,10 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1547 1554
1548 if (fdq->d_fieldmask & ~GFS2_FIELDMASK) 1555 if (fdq->d_fieldmask & ~GFS2_FIELDMASK)
1549 return -EINVAL; 1556 return -EINVAL;
1550 if (fdq->d_id != id) 1557 if (fdq->d_id != from_kqid(&init_user_ns, qid))
1551 return -EINVAL; 1558 return -EINVAL;
1552 1559
1553 error = qd_get(sdp, type, id, &qd); 1560 error = qd_get(sdp, type, from_kqid(&init_user_ns, qid), &qd);
1554 if (error) 1561 if (error)
1555 return error; 1562 return error;
1556 1563
@@ -1598,7 +1605,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1598 error = gfs2_inplace_reserve(ip, blocks); 1605 error = gfs2_inplace_reserve(ip, blocks);
1599 if (error) 1606 if (error)
1600 goto out_i; 1607 goto out_i;
1601 blocks += gfs2_rg_blocks(ip); 1608 blocks += gfs2_rg_blocks(ip, blocks);
1602 } 1609 }
1603 1610
1604 /* Some quotas span block boundaries and can update two blocks, 1611 /* Some quotas span block boundaries and can update two blocks,
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 4d34887a601d..3cc402ce6fea 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -35,9 +35,6 @@
35#define BFITNOENT ((u32)~0) 35#define BFITNOENT ((u32)~0)
36#define NO_BLOCK ((u64)~0) 36#define NO_BLOCK ((u64)~0)
37 37
38#define RSRV_CONTENTION_FACTOR 4
39#define RGRP_RSRV_MAX_CONTENDERS 2
40
41#if BITS_PER_LONG == 32 38#if BITS_PER_LONG == 32
42#define LBITMASK (0x55555555UL) 39#define LBITMASK (0x55555555UL)
43#define LBITSKIP55 (0x55555555UL) 40#define LBITSKIP55 (0x55555555UL)
@@ -67,53 +64,48 @@ static const char valid_change[16] = {
67 1, 0, 0, 0 64 1, 0, 0, 0
68}; 65};
69 66
70static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, 67static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext,
71 unsigned char old_state, 68 const struct gfs2_inode *ip, bool nowrap);
72 struct gfs2_bitmap **rbi); 69
73 70
74/** 71/**
75 * gfs2_setbit - Set a bit in the bitmaps 72 * gfs2_setbit - Set a bit in the bitmaps
76 * @rgd: the resource group descriptor 73 * @rbm: The position of the bit to set
77 * @buf2: the clone buffer that holds the bitmaps 74 * @do_clone: Also set the clone bitmap, if it exists
78 * @bi: the bitmap structure
79 * @block: the block to set
80 * @new_state: the new state of the block 75 * @new_state: the new state of the block
81 * 76 *
82 */ 77 */
83 78
84static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf2, 79static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
85 struct gfs2_bitmap *bi, u32 block,
86 unsigned char new_state) 80 unsigned char new_state)
87{ 81{
88 unsigned char *byte1, *byte2, *end, cur_state; 82 unsigned char *byte1, *byte2, *end, cur_state;
89 unsigned int buflen = bi->bi_len; 83 unsigned int buflen = rbm->bi->bi_len;
90 const unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; 84 const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
91 85
92 byte1 = bi->bi_bh->b_data + bi->bi_offset + (block / GFS2_NBBY); 86 byte1 = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset + (rbm->offset / GFS2_NBBY);
93 end = bi->bi_bh->b_data + bi->bi_offset + buflen; 87 end = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset + buflen;
94 88
95 BUG_ON(byte1 >= end); 89 BUG_ON(byte1 >= end);
96 90
97 cur_state = (*byte1 >> bit) & GFS2_BIT_MASK; 91 cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
98 92
99 if (unlikely(!valid_change[new_state * 4 + cur_state])) { 93 if (unlikely(!valid_change[new_state * 4 + cur_state])) {
100 printk(KERN_WARNING "GFS2: buf_blk = 0x%llx old_state=%d, " 94 printk(KERN_WARNING "GFS2: buf_blk = 0x%x old_state=%d, "
101 "new_state=%d\n", 95 "new_state=%d\n", rbm->offset, cur_state, new_state);
102 (unsigned long long)block, cur_state, new_state); 96 printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%x\n",
103 printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%lx\n", 97 (unsigned long long)rbm->rgd->rd_addr,
104 (unsigned long long)rgd->rd_addr, 98 rbm->bi->bi_start);
105 (unsigned long)bi->bi_start); 99 printk(KERN_WARNING "GFS2: bi_offset=0x%x bi_len=0x%x\n",
106 printk(KERN_WARNING "GFS2: bi_offset=0x%lx bi_len=0x%lx\n", 100 rbm->bi->bi_offset, rbm->bi->bi_len);
107 (unsigned long)bi->bi_offset,
108 (unsigned long)bi->bi_len);
109 dump_stack(); 101 dump_stack();
110 gfs2_consist_rgrpd(rgd); 102 gfs2_consist_rgrpd(rbm->rgd);
111 return; 103 return;
112 } 104 }
113 *byte1 ^= (cur_state ^ new_state) << bit; 105 *byte1 ^= (cur_state ^ new_state) << bit;
114 106
115 if (buf2) { 107 if (do_clone && rbm->bi->bi_clone) {
116 byte2 = buf2 + bi->bi_offset + (block / GFS2_NBBY); 108 byte2 = rbm->bi->bi_clone + rbm->bi->bi_offset + (rbm->offset / GFS2_NBBY);
117 cur_state = (*byte2 >> bit) & GFS2_BIT_MASK; 109 cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
118 *byte2 ^= (cur_state ^ new_state) << bit; 110 *byte2 ^= (cur_state ^ new_state) << bit;
119 } 111 }
@@ -121,30 +113,21 @@ static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf2,
121 113
122/** 114/**
123 * gfs2_testbit - test a bit in the bitmaps 115 * gfs2_testbit - test a bit in the bitmaps
124 * @rgd: the resource group descriptor 116 * @rbm: The bit to test
125 * @buffer: the buffer that holds the bitmaps
126 * @buflen: the length (in bytes) of the buffer
127 * @block: the block to read
128 * 117 *
118 * Returns: The two bit block state of the requested bit
129 */ 119 */
130 120
131static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, 121static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm)
132 const unsigned char *buffer,
133 unsigned int buflen, u32 block)
134{ 122{
135 const unsigned char *byte, *end; 123 const u8 *buffer = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset;
136 unsigned char cur_state; 124 const u8 *byte;
137 unsigned int bit; 125 unsigned int bit;
138 126
139 byte = buffer + (block / GFS2_NBBY); 127 byte = buffer + (rbm->offset / GFS2_NBBY);
140 bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; 128 bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
141 end = buffer + buflen;
142
143 gfs2_assert(rgd->rd_sbd, byte < end);
144 129
145 cur_state = (*byte >> bit) & GFS2_BIT_MASK; 130 return (*byte >> bit) & GFS2_BIT_MASK;
146
147 return cur_state;
148} 131}
149 132
150/** 133/**
@@ -192,7 +175,7 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
192 */ 175 */
193static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs) 176static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
194{ 177{
195 u64 startblk = gfs2_rs_startblk(rs); 178 u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm);
196 179
197 if (blk >= startblk + rs->rs_free) 180 if (blk >= startblk + rs->rs_free)
198 return 1; 181 return 1;
@@ -202,36 +185,6 @@ static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
202} 185}
203 186
204/** 187/**
205 * rs_find - Find a rgrp multi-block reservation that contains a given block
206 * @rgd: The rgrp
207 * @rgblk: The block we're looking for, relative to the rgrp
208 */
209static struct gfs2_blkreserv *rs_find(struct gfs2_rgrpd *rgd, u32 rgblk)
210{
211 struct rb_node **newn;
212 int rc;
213 u64 fsblk = rgblk + rgd->rd_data0;
214
215 spin_lock(&rgd->rd_rsspin);
216 newn = &rgd->rd_rstree.rb_node;
217 while (*newn) {
218 struct gfs2_blkreserv *cur =
219 rb_entry(*newn, struct gfs2_blkreserv, rs_node);
220 rc = rs_cmp(fsblk, 1, cur);
221 if (rc < 0)
222 newn = &((*newn)->rb_left);
223 else if (rc > 0)
224 newn = &((*newn)->rb_right);
225 else {
226 spin_unlock(&rgd->rd_rsspin);
227 return cur;
228 }
229 }
230 spin_unlock(&rgd->rd_rsspin);
231 return NULL;
232}
233
234/**
235 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing 188 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
236 * a block in a given allocation state. 189 * a block in a given allocation state.
237 * @buf: the buffer that holds the bitmaps 190 * @buf: the buffer that holds the bitmaps
@@ -262,8 +215,6 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
262 u64 mask = 0x5555555555555555ULL; 215 u64 mask = 0x5555555555555555ULL;
263 u32 bit; 216 u32 bit;
264 217
265 BUG_ON(state > 3);
266
267 /* Mask off bits we don't care about at the start of the search */ 218 /* Mask off bits we don't care about at the start of the search */
268 mask <<= spoint; 219 mask <<= spoint;
269 tmp = gfs2_bit_search(ptr, mask, state); 220 tmp = gfs2_bit_search(ptr, mask, state);
@@ -285,6 +236,131 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
285} 236}
286 237
287/** 238/**
239 * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
240 * @rbm: The rbm with rgd already set correctly
241 * @block: The block number (filesystem relative)
242 *
243 * This sets the bi and offset members of an rbm based on a
244 * resource group and a filesystem relative block number. The
245 * resource group must be set in the rbm on entry, the bi and
246 * offset members will be set by this function.
247 *
248 * Returns: 0 on success, or an error code
249 */
250
251static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
252{
253 u64 rblock = block - rbm->rgd->rd_data0;
254 u32 goal = (u32)rblock;
255 int x;
256
257 if (WARN_ON_ONCE(rblock > UINT_MAX))
258 return -EINVAL;
259 if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data)
260 return -E2BIG;
261
262 for (x = 0; x < rbm->rgd->rd_length; x++) {
263 rbm->bi = rbm->rgd->rd_bits + x;
264 if (goal < (rbm->bi->bi_start + rbm->bi->bi_len) * GFS2_NBBY) {
265 rbm->offset = goal - (rbm->bi->bi_start * GFS2_NBBY);
266 break;
267 }
268 }
269
270 return 0;
271}
272
273/**
274 * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
275 * @rbm: Position to search (value/result)
276 * @n_unaligned: Number of unaligned blocks to check
277 * @len: Decremented for each block found (terminate on zero)
278 *
279 * Returns: true if a non-free block is encountered
280 */
281
282static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
283{
284 u64 block;
285 u32 n;
286 u8 res;
287
288 for (n = 0; n < n_unaligned; n++) {
289 res = gfs2_testbit(rbm);
290 if (res != GFS2_BLKST_FREE)
291 return true;
292 (*len)--;
293 if (*len == 0)
294 return true;
295 block = gfs2_rbm_to_block(rbm);
296 if (gfs2_rbm_from_block(rbm, block + 1))
297 return true;
298 }
299
300 return false;
301}
302
303/**
304 * gfs2_free_extlen - Return extent length of free blocks
305 * @rbm: Starting position
306 * @len: Max length to check
307 *
308 * Starting at the block specified by the rbm, see how many free blocks
309 * there are, not reading more than len blocks ahead. This can be done
310 * using memchr_inv when the blocks are byte aligned, but has to be done
311 * on a block by block basis in case of unaligned blocks. Also this
312 * function can cope with bitmap boundaries (although it must stop on
313 * a resource group boundary)
314 *
315 * Returns: Number of free blocks in the extent
316 */
317
318static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
319{
320 struct gfs2_rbm rbm = *rrbm;
321 u32 n_unaligned = rbm.offset & 3;
322 u32 size = len;
323 u32 bytes;
324 u32 chunk_size;
325 u8 *ptr, *start, *end;
326 u64 block;
327
328 if (n_unaligned &&
329 gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
330 goto out;
331
332 n_unaligned = len & 3;
333 /* Start is now byte aligned */
334 while (len > 3) {
335 start = rbm.bi->bi_bh->b_data;
336 if (rbm.bi->bi_clone)
337 start = rbm.bi->bi_clone;
338 end = start + rbm.bi->bi_bh->b_size;
339 start += rbm.bi->bi_offset;
340 BUG_ON(rbm.offset & 3);
341 start += (rbm.offset / GFS2_NBBY);
342 bytes = min_t(u32, len / GFS2_NBBY, (end - start));
343 ptr = memchr_inv(start, 0, bytes);
344 chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
345 chunk_size *= GFS2_NBBY;
346 BUG_ON(len < chunk_size);
347 len -= chunk_size;
348 block = gfs2_rbm_to_block(&rbm);
349 gfs2_rbm_from_block(&rbm, block + chunk_size);
350 n_unaligned = 3;
351 if (ptr)
352 break;
353 n_unaligned = len & 3;
354 }
355
356 /* Deal with any bits left over at the end */
357 if (n_unaligned)
358 gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
359out:
360 return size - len;
361}
362
363/**
288 * gfs2_bitcount - count the number of bits in a certain state 364 * gfs2_bitcount - count the number of bits in a certain state
289 * @rgd: the resource group descriptor 365 * @rgd: the resource group descriptor
290 * @buffer: the buffer that holds the bitmaps 366 * @buffer: the buffer that holds the bitmaps
@@ -487,6 +563,8 @@ int gfs2_rs_alloc(struct gfs2_inode *ip)
487 if (!res) 563 if (!res)
488 error = -ENOMEM; 564 error = -ENOMEM;
489 565
566 RB_CLEAR_NODE(&res->rs_node);
567
490 down_write(&ip->i_rw_mutex); 568 down_write(&ip->i_rw_mutex);
491 if (ip->i_res) 569 if (ip->i_res)
492 kmem_cache_free(gfs2_rsrv_cachep, res); 570 kmem_cache_free(gfs2_rsrv_cachep, res);
@@ -496,11 +574,12 @@ int gfs2_rs_alloc(struct gfs2_inode *ip)
496 return error; 574 return error;
497} 575}
498 576
499static void dump_rs(struct seq_file *seq, struct gfs2_blkreserv *rs) 577static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
500{ 578{
501 gfs2_print_dbg(seq, " r: %llu s:%llu b:%u f:%u\n", 579 gfs2_print_dbg(seq, " B: n:%llu s:%llu b:%u f:%u\n",
502 rs->rs_rgd->rd_addr, gfs2_rs_startblk(rs), rs->rs_biblk, 580 (unsigned long long)rs->rs_inum,
503 rs->rs_free); 581 (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm),
582 rs->rs_rbm.offset, rs->rs_free);
504} 583}
505 584
506/** 585/**
@@ -508,41 +587,26 @@ static void dump_rs(struct seq_file *seq, struct gfs2_blkreserv *rs)
508 * @rs: The reservation to remove 587 * @rs: The reservation to remove
509 * 588 *
510 */ 589 */
511static void __rs_deltree(struct gfs2_blkreserv *rs) 590static void __rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs)
512{ 591{
513 struct gfs2_rgrpd *rgd; 592 struct gfs2_rgrpd *rgd;
514 593
515 if (!gfs2_rs_active(rs)) 594 if (!gfs2_rs_active(rs))
516 return; 595 return;
517 596
518 rgd = rs->rs_rgd; 597 rgd = rs->rs_rbm.rgd;
519 /* We can't do this: The reason is that when the rgrp is invalidated, 598 trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
520 it's in the "middle" of acquiring the glock, but the HOLDER bit 599 rb_erase(&rs->rs_node, &rgd->rd_rstree);
521 isn't set yet: 600 RB_CLEAR_NODE(&rs->rs_node);
522 BUG_ON(!gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl));*/
523 trace_gfs2_rs(NULL, rs, TRACE_RS_TREEDEL);
524
525 if (!RB_EMPTY_ROOT(&rgd->rd_rstree))
526 rb_erase(&rs->rs_node, &rgd->rd_rstree);
527 BUG_ON(!rgd->rd_rs_cnt);
528 rgd->rd_rs_cnt--;
529 601
530 if (rs->rs_free) { 602 if (rs->rs_free) {
531 /* return reserved blocks to the rgrp and the ip */ 603 /* return reserved blocks to the rgrp and the ip */
532 BUG_ON(rs->rs_rgd->rd_reserved < rs->rs_free); 604 BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
533 rs->rs_rgd->rd_reserved -= rs->rs_free; 605 rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
534 rs->rs_free = 0; 606 rs->rs_free = 0;
535 clear_bit(GBF_FULL, &rs->rs_bi->bi_flags); 607 clear_bit(GBF_FULL, &rs->rs_rbm.bi->bi_flags);
536 smp_mb__after_clear_bit(); 608 smp_mb__after_clear_bit();
537 } 609 }
538 /* We can't change any of the step 1 or step 2 components of the rs.
539 E.g. We can't set rs_rgd to NULL because the rgd glock is held and
540 dequeued through this pointer.
541 Can't: atomic_set(&rs->rs_sizehint, 0);
542 Can't: rs->rs_requested = 0;
543 Can't: rs->rs_rgd = NULL;*/
544 rs->rs_bi = NULL;
545 rs->rs_biblk = 0;
546} 610}
547 611
548/** 612/**
@@ -550,17 +614,16 @@ static void __rs_deltree(struct gfs2_blkreserv *rs)
550 * @rs: The reservation to remove 614 * @rs: The reservation to remove
551 * 615 *
552 */ 616 */
553void gfs2_rs_deltree(struct gfs2_blkreserv *rs) 617void gfs2_rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs)
554{ 618{
555 struct gfs2_rgrpd *rgd; 619 struct gfs2_rgrpd *rgd;
556 620
557 if (!gfs2_rs_active(rs)) 621 rgd = rs->rs_rbm.rgd;
558 return; 622 if (rgd) {
559 623 spin_lock(&rgd->rd_rsspin);
560 rgd = rs->rs_rgd; 624 __rs_deltree(ip, rs);
561 spin_lock(&rgd->rd_rsspin); 625 spin_unlock(&rgd->rd_rsspin);
562 __rs_deltree(rs); 626 }
563 spin_unlock(&rgd->rd_rsspin);
564} 627}
565 628
566/** 629/**
@@ -572,8 +635,7 @@ void gfs2_rs_delete(struct gfs2_inode *ip)
572{ 635{
573 down_write(&ip->i_rw_mutex); 636 down_write(&ip->i_rw_mutex);
574 if (ip->i_res) { 637 if (ip->i_res) {
575 gfs2_rs_deltree(ip->i_res); 638 gfs2_rs_deltree(ip, ip->i_res);
576 trace_gfs2_rs(ip, ip->i_res, TRACE_RS_DELETE);
577 BUG_ON(ip->i_res->rs_free); 639 BUG_ON(ip->i_res->rs_free);
578 kmem_cache_free(gfs2_rsrv_cachep, ip->i_res); 640 kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
579 ip->i_res = NULL; 641 ip->i_res = NULL;
@@ -597,7 +659,7 @@ static void return_all_reservations(struct gfs2_rgrpd *rgd)
597 spin_lock(&rgd->rd_rsspin); 659 spin_lock(&rgd->rd_rsspin);
598 while ((n = rb_first(&rgd->rd_rstree))) { 660 while ((n = rb_first(&rgd->rd_rstree))) {
599 rs = rb_entry(n, struct gfs2_blkreserv, rs_node); 661 rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
600 __rs_deltree(rs); 662 __rs_deltree(NULL, rs);
601 } 663 }
602 spin_unlock(&rgd->rd_rsspin); 664 spin_unlock(&rgd->rd_rsspin);
603} 665}
@@ -1270,211 +1332,276 @@ out:
1270 1332
1271/** 1333/**
1272 * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree 1334 * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
1273 * @bi: the bitmap with the blocks
1274 * @ip: the inode structure 1335 * @ip: the inode structure
1275 * @biblk: the 32-bit block number relative to the start of the bitmap
1276 * @amount: the number of blocks to reserve
1277 * 1336 *
1278 * Returns: NULL - reservation was already taken, so not inserted
1279 * pointer to the inserted reservation
1280 */ 1337 */
1281static struct gfs2_blkreserv *rs_insert(struct gfs2_bitmap *bi, 1338static void rs_insert(struct gfs2_inode *ip)
1282 struct gfs2_inode *ip, u32 biblk,
1283 int amount)
1284{ 1339{
1285 struct rb_node **newn, *parent = NULL; 1340 struct rb_node **newn, *parent = NULL;
1286 int rc; 1341 int rc;
1287 struct gfs2_blkreserv *rs = ip->i_res; 1342 struct gfs2_blkreserv *rs = ip->i_res;
1288 struct gfs2_rgrpd *rgd = rs->rs_rgd; 1343 struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
1289 u64 fsblock = gfs2_bi2rgd_blk(bi, biblk) + rgd->rd_data0; 1344 u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);
1345
1346 BUG_ON(gfs2_rs_active(rs));
1290 1347
1291 spin_lock(&rgd->rd_rsspin); 1348 spin_lock(&rgd->rd_rsspin);
1292 newn = &rgd->rd_rstree.rb_node; 1349 newn = &rgd->rd_rstree.rb_node;
1293 BUG_ON(!ip->i_res);
1294 BUG_ON(gfs2_rs_active(rs));
1295 /* Figure out where to put new node */
1296 /*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
1297 while (*newn) { 1350 while (*newn) {
1298 struct gfs2_blkreserv *cur = 1351 struct gfs2_blkreserv *cur =
1299 rb_entry(*newn, struct gfs2_blkreserv, rs_node); 1352 rb_entry(*newn, struct gfs2_blkreserv, rs_node);
1300 1353
1301 parent = *newn; 1354 parent = *newn;
1302 rc = rs_cmp(fsblock, amount, cur); 1355 rc = rs_cmp(fsblock, rs->rs_free, cur);
1303 if (rc > 0) 1356 if (rc > 0)
1304 newn = &((*newn)->rb_right); 1357 newn = &((*newn)->rb_right);
1305 else if (rc < 0) 1358 else if (rc < 0)
1306 newn = &((*newn)->rb_left); 1359 newn = &((*newn)->rb_left);
1307 else { 1360 else {
1308 spin_unlock(&rgd->rd_rsspin); 1361 spin_unlock(&rgd->rd_rsspin);
1309 return NULL; /* reservation already in use */ 1362 WARN_ON(1);
1363 return;
1310 } 1364 }
1311 } 1365 }
1312 1366
1313 /* Do our reservation work */
1314 rs = ip->i_res;
1315 rs->rs_free = amount;
1316 rs->rs_biblk = biblk;
1317 rs->rs_bi = bi;
1318 rb_link_node(&rs->rs_node, parent, newn); 1367 rb_link_node(&rs->rs_node, parent, newn);
1319 rb_insert_color(&rs->rs_node, &rgd->rd_rstree); 1368 rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
1320 1369
1321 /* Do our inode accounting for the reservation */
1322 /*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
1323
1324 /* Do our rgrp accounting for the reservation */ 1370 /* Do our rgrp accounting for the reservation */
1325 rgd->rd_reserved += amount; /* blocks reserved */ 1371 rgd->rd_reserved += rs->rs_free; /* blocks reserved */
1326 rgd->rd_rs_cnt++; /* number of in-tree reservations */
1327 spin_unlock(&rgd->rd_rsspin); 1372 spin_unlock(&rgd->rd_rsspin);
1328 trace_gfs2_rs(ip, rs, TRACE_RS_INSERT); 1373 trace_gfs2_rs(rs, TRACE_RS_INSERT);
1329 return rs;
1330} 1374}
1331 1375
1332/** 1376/**
1333 * unclaimed_blocks - return number of blocks that aren't spoken for 1377 * rg_mblk_search - find a group of multiple free blocks to form a reservation
1334 */
1335static u32 unclaimed_blocks(struct gfs2_rgrpd *rgd)
1336{
1337 return rgd->rd_free_clone - rgd->rd_reserved;
1338}
1339
1340/**
1341 * rg_mblk_search - find a group of multiple free blocks
1342 * @rgd: the resource group descriptor 1378 * @rgd: the resource group descriptor
1343 * @rs: the block reservation
1344 * @ip: pointer to the inode for which we're reserving blocks 1379 * @ip: pointer to the inode for which we're reserving blocks
1380 * @requested: number of blocks required for this allocation
1345 * 1381 *
1346 * This is very similar to rgblk_search, except we're looking for whole
1347 * 64-bit words that represent a chunk of 32 free blocks. I'm only focusing
1348 * on aligned dwords for speed's sake.
1349 *
1350 * Returns: 0 if successful or BFITNOENT if there isn't enough free space
1351 */ 1382 */
1352 1383
1353static int rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) 1384static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1385 unsigned requested)
1354{ 1386{
1355 struct gfs2_bitmap *bi = rgd->rd_bits; 1387 struct gfs2_rbm rbm = { .rgd = rgd, };
1356 const u32 length = rgd->rd_length; 1388 u64 goal;
1357 u32 blk; 1389 struct gfs2_blkreserv *rs = ip->i_res;
1358 unsigned int buf, x, search_bytes; 1390 u32 extlen;
1359 u8 *buffer = NULL; 1391 u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved;
1360 u8 *ptr, *end, *nonzero; 1392 int ret;
1361 u32 goal, rsv_bytes; 1393
1362 struct gfs2_blkreserv *rs; 1394 extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested);
1363 u32 best_rs_bytes, unclaimed; 1395 extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks);
1364 int best_rs_blocks; 1396 if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
1397 return;
1365 1398
1366 /* Find bitmap block that contains bits for goal block */ 1399 /* Find bitmap block that contains bits for goal block */
1367 if (rgrp_contains_block(rgd, ip->i_goal)) 1400 if (rgrp_contains_block(rgd, ip->i_goal))
1368 goal = ip->i_goal - rgd->rd_data0; 1401 goal = ip->i_goal;
1369 else 1402 else
1370 goal = rgd->rd_last_alloc; 1403 goal = rgd->rd_last_alloc + rgd->rd_data0;
1371 for (buf = 0; buf < length; buf++) { 1404
1372 bi = rgd->rd_bits + buf; 1405 if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
1373 /* Convert scope of "goal" from rgrp-wide to within 1406 return;
1374 found bit block */ 1407
1375 if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) { 1408 ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, extlen, ip, true);
1376 goal -= bi->bi_start * GFS2_NBBY; 1409 if (ret == 0) {
1377 goto do_search; 1410 rs->rs_rbm = rbm;
1378 } 1411 rs->rs_free = extlen;
1412 rs->rs_inum = ip->i_no_addr;
1413 rs_insert(ip);
1379 } 1414 }
1380 buf = 0; 1415}
1381 goal = 0;
1382
1383do_search:
1384 best_rs_blocks = max_t(int, atomic_read(&ip->i_res->rs_sizehint),
1385 (RGRP_RSRV_MINBLKS * rgd->rd_length));
1386 best_rs_bytes = (best_rs_blocks *
1387 (1 + (RSRV_CONTENTION_FACTOR * rgd->rd_rs_cnt))) /
1388 GFS2_NBBY; /* 1 + is for our not-yet-created reservation */
1389 best_rs_bytes = ALIGN(best_rs_bytes, sizeof(u64));
1390 unclaimed = unclaimed_blocks(rgd);
1391 if (best_rs_bytes * GFS2_NBBY > unclaimed)
1392 best_rs_bytes = unclaimed >> GFS2_BIT_SIZE;
1393
1394 for (x = 0; x <= length; x++) {
1395 bi = rgd->rd_bits + buf;
1396 1416
1397 if (test_bit(GBF_FULL, &bi->bi_flags)) 1417/**
1398 goto skip; 1418 * gfs2_next_unreserved_block - Return next block that is not reserved
1419 * @rgd: The resource group
1420 * @block: The starting block
1421 * @length: The required length
1422 * @ip: Ignore any reservations for this inode
1423 *
1424 * If the block does not appear in any reservation, then return the
1425 * block number unchanged. If it does appear in the reservation, then
1426 * keep looking through the tree of reservations in order to find the
1427 * first block number which is not reserved.
1428 */
1399 1429
1400 WARN_ON(!buffer_uptodate(bi->bi_bh)); 1430static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1401 if (bi->bi_clone) 1431 u32 length,
1402 buffer = bi->bi_clone + bi->bi_offset; 1432 const struct gfs2_inode *ip)
1433{
1434 struct gfs2_blkreserv *rs;
1435 struct rb_node *n;
1436 int rc;
1437
1438 spin_lock(&rgd->rd_rsspin);
1439 n = rgd->rd_rstree.rb_node;
1440 while (n) {
1441 rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1442 rc = rs_cmp(block, length, rs);
1443 if (rc < 0)
1444 n = n->rb_left;
1445 else if (rc > 0)
1446 n = n->rb_right;
1403 else 1447 else
1404 buffer = bi->bi_bh->b_data + bi->bi_offset; 1448 break;
1405 1449 }
1406 /* We have to keep the reservations aligned on u64 boundaries 1450
1407 otherwise we could get situations where a byte can't be 1451 if (n) {
1408 used because it's after a reservation, but a free bit still 1452 while ((rs_cmp(block, length, rs) == 0) && (ip->i_res != rs)) {
1409 is within the reservation's area. */ 1453 block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
1410 ptr = buffer + ALIGN(goal >> GFS2_BIT_SIZE, sizeof(u64)); 1454 n = n->rb_right;
1411 end = (buffer + bi->bi_len); 1455 if (n == NULL)
1412 while (ptr < end) { 1456 break;
1413 rsv_bytes = 0; 1457 rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1414 if ((ptr + best_rs_bytes) <= end)
1415 search_bytes = best_rs_bytes;
1416 else
1417 search_bytes = end - ptr;
1418 BUG_ON(!search_bytes);
1419 nonzero = memchr_inv(ptr, 0, search_bytes);
1420 /* If the lot is all zeroes, reserve the whole size. If
1421 there's enough zeroes to satisfy the request, use
1422 what we can. If there's not enough, keep looking. */
1423 if (nonzero == NULL)
1424 rsv_bytes = search_bytes;
1425 else if ((nonzero - ptr) * GFS2_NBBY >=
1426 ip->i_res->rs_requested)
1427 rsv_bytes = (nonzero - ptr);
1428
1429 if (rsv_bytes) {
1430 blk = ((ptr - buffer) * GFS2_NBBY);
1431 BUG_ON(blk >= bi->bi_len * GFS2_NBBY);
1432 rs = rs_insert(bi, ip, blk,
1433 rsv_bytes * GFS2_NBBY);
1434 if (IS_ERR(rs))
1435 return PTR_ERR(rs);
1436 if (rs)
1437 return 0;
1438 }
1439 ptr += ALIGN(search_bytes, sizeof(u64));
1440 } 1458 }
1441skip:
1442 /* Try next bitmap block (wrap back to rgrp header
1443 if at end) */
1444 buf++;
1445 buf %= length;
1446 goal = 0;
1447 } 1459 }
1448 1460
1449 return BFITNOENT; 1461 spin_unlock(&rgd->rd_rsspin);
1462 return block;
1450} 1463}
1451 1464
1452/** 1465/**
1453 * try_rgrp_fit - See if a given reservation will fit in a given RG 1466 * gfs2_reservation_check_and_update - Check for reservations during block alloc
1454 * @rgd: the RG data 1467 * @rbm: The current position in the resource group
1455 * @ip: the inode 1468 * @ip: The inode for which we are searching for blocks
1469 * @minext: The minimum extent length
1456 * 1470 *
1457 * If there's room for the requested blocks to be allocated from the RG: 1471 * This checks the current position in the rgrp to see whether there is
1458 * This will try to get a multi-block reservation first, and if that doesn't 1472 * a reservation covering this block. If not then this function is a
1459 * fit, it will take what it can. 1473 * no-op. If there is, then the position is moved to the end of the
1474 * contiguous reservation(s) so that we are pointing at the first
1475 * non-reserved block.
1460 * 1476 *
1461 * Returns: 1 on success (it fits), 0 on failure (it doesn't fit) 1477 * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
1462 */ 1478 */
1463 1479
1464static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) 1480static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1481 const struct gfs2_inode *ip,
1482 u32 minext)
1465{ 1483{
1466 struct gfs2_blkreserv *rs = ip->i_res; 1484 u64 block = gfs2_rbm_to_block(rbm);
1485 u32 extlen = 1;
1486 u64 nblock;
1487 int ret;
1467 1488
1468 if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) 1489 /*
1490 * If we have a minimum extent length, then skip over any extent
1491 * which is less than the min extent length in size.
1492 */
1493 if (minext) {
1494 extlen = gfs2_free_extlen(rbm, minext);
1495 nblock = block + extlen;
1496 if (extlen < minext)
1497 goto fail;
1498 }
1499
1500 /*
1501 * Check the extent which has been found against the reservations
1502 * and skip if parts of it are already reserved
1503 */
1504 nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
1505 if (nblock == block)
1469 return 0; 1506 return 0;
1470 /* Look for a multi-block reservation. */ 1507fail:
1471 if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS && 1508 ret = gfs2_rbm_from_block(rbm, nblock);
1472 rg_mblk_search(rgd, ip) != BFITNOENT) 1509 if (ret < 0)
1473 return 1; 1510 return ret;
1474 if (unclaimed_blocks(rgd) >= rs->rs_requested) 1511 return 1;
1475 return 1; 1512}
1476 1513
1477 return 0; 1514/**
1515 * gfs2_rbm_find - Look for blocks of a particular state
1516 * @rbm: Value/result starting position and final position
1517 * @state: The state which we want to find
1518 * @minext: The requested extent length (0 for a single block)
1519 * @ip: If set, check for reservations
1520 * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
1521 * around until we've reached the starting point.
1522 *
1523 * Side effects:
1524 * - If looking for free blocks, we set GBF_FULL on each bitmap which
1525 * has no free blocks in it.
1526 *
1527 * Returns: 0 on success, -ENOSPC if there is no block of the requested state
1528 */
1529
1530static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext,
1531 const struct gfs2_inode *ip, bool nowrap)
1532{
1533 struct buffer_head *bh;
1534 struct gfs2_bitmap *initial_bi;
1535 u32 initial_offset;
1536 u32 offset;
1537 u8 *buffer;
1538 int index;
1539 int n = 0;
1540 int iters = rbm->rgd->rd_length;
1541 int ret;
1542
1543 /* If we are not starting at the beginning of a bitmap, then we
1544 * need to add one to the bitmap count to ensure that we search
1545 * the starting bitmap twice.
1546 */
1547 if (rbm->offset != 0)
1548 iters++;
1549
1550 while(1) {
1551 if (test_bit(GBF_FULL, &rbm->bi->bi_flags) &&
1552 (state == GFS2_BLKST_FREE))
1553 goto next_bitmap;
1554
1555 bh = rbm->bi->bi_bh;
1556 buffer = bh->b_data + rbm->bi->bi_offset;
1557 WARN_ON(!buffer_uptodate(bh));
1558 if (state != GFS2_BLKST_UNLINKED && rbm->bi->bi_clone)
1559 buffer = rbm->bi->bi_clone + rbm->bi->bi_offset;
1560 initial_offset = rbm->offset;
1561 offset = gfs2_bitfit(buffer, rbm->bi->bi_len, rbm->offset, state);
1562 if (offset == BFITNOENT)
1563 goto bitmap_full;
1564 rbm->offset = offset;
1565 if (ip == NULL)
1566 return 0;
1567
1568 initial_bi = rbm->bi;
1569 ret = gfs2_reservation_check_and_update(rbm, ip, minext);
1570 if (ret == 0)
1571 return 0;
1572 if (ret > 0) {
1573 n += (rbm->bi - initial_bi);
1574 goto next_iter;
1575 }
1576 if (ret == -E2BIG) {
1577 index = 0;
1578 rbm->offset = 0;
1579 n += (rbm->bi - initial_bi);
1580 goto res_covered_end_of_rgrp;
1581 }
1582 return ret;
1583
1584bitmap_full: /* Mark bitmap as full and fall through */
1585 if ((state == GFS2_BLKST_FREE) && initial_offset == 0)
1586 set_bit(GBF_FULL, &rbm->bi->bi_flags);
1587
1588next_bitmap: /* Find next bitmap in the rgrp */
1589 rbm->offset = 0;
1590 index = rbm->bi - rbm->rgd->rd_bits;
1591 index++;
1592 if (index == rbm->rgd->rd_length)
1593 index = 0;
1594res_covered_end_of_rgrp:
1595 rbm->bi = &rbm->rgd->rd_bits[index];
1596 if ((index == 0) && nowrap)
1597 break;
1598 n++;
1599next_iter:
1600 if (n >= iters)
1601 break;
1602 }
1603
1604 return -ENOSPC;
1478} 1605}
1479 1606
1480/** 1607/**
@@ -1489,34 +1616,33 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
1489 1616
1490static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip) 1617static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
1491{ 1618{
1492 u32 goal = 0, block; 1619 u64 block;
1493 u64 no_addr;
1494 struct gfs2_sbd *sdp = rgd->rd_sbd; 1620 struct gfs2_sbd *sdp = rgd->rd_sbd;
1495 struct gfs2_glock *gl; 1621 struct gfs2_glock *gl;
1496 struct gfs2_inode *ip; 1622 struct gfs2_inode *ip;
1497 int error; 1623 int error;
1498 int found = 0; 1624 int found = 0;
1499 struct gfs2_bitmap *bi; 1625 struct gfs2_rbm rbm = { .rgd = rgd, .bi = rgd->rd_bits, .offset = 0 };
1500 1626
1501 while (goal < rgd->rd_data) { 1627 while (1) {
1502 down_write(&sdp->sd_log_flush_lock); 1628 down_write(&sdp->sd_log_flush_lock);
1503 block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, &bi); 1629 error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, 0, NULL, true);
1504 up_write(&sdp->sd_log_flush_lock); 1630 up_write(&sdp->sd_log_flush_lock);
1505 if (block == BFITNOENT) 1631 if (error == -ENOSPC)
1632 break;
1633 if (WARN_ON_ONCE(error))
1506 break; 1634 break;
1507 1635
1508 block = gfs2_bi2rgd_blk(bi, block); 1636 block = gfs2_rbm_to_block(&rbm);
1509 /* rgblk_search can return a block < goal, so we need to 1637 if (gfs2_rbm_from_block(&rbm, block + 1))
1510 keep it marching forward. */ 1638 break;
1511 no_addr = block + rgd->rd_data0; 1639 if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
1512 goal = max(block + 1, goal + 1);
1513 if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked)
1514 continue; 1640 continue;
1515 if (no_addr == skip) 1641 if (block == skip)
1516 continue; 1642 continue;
1517 *last_unlinked = no_addr; 1643 *last_unlinked = block;
1518 1644
1519 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl); 1645 error = gfs2_glock_get(sdp, block, &gfs2_inode_glops, CREATE, &gl);
1520 if (error) 1646 if (error)
1521 continue; 1647 continue;
1522 1648
@@ -1543,6 +1669,19 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
1543 return; 1669 return;
1544} 1670}
1545 1671
1672static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
1673{
1674 struct gfs2_rgrpd *rgd = *pos;
1675
1676 rgd = gfs2_rgrpd_get_next(rgd);
1677 if (rgd == NULL)
1678 rgd = gfs2_rgrpd_get_next(NULL);
1679 *pos = rgd;
1680 if (rgd != begin) /* If we didn't wrap */
1681 return true;
1682 return false;
1683}
1684
1546/** 1685/**
1547 * gfs2_inplace_reserve - Reserve space in the filesystem 1686 * gfs2_inplace_reserve - Reserve space in the filesystem
1548 * @ip: the inode to reserve space for 1687 * @ip: the inode to reserve space for
@@ -1562,103 +1701,96 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
1562 1701
1563 if (sdp->sd_args.ar_rgrplvb) 1702 if (sdp->sd_args.ar_rgrplvb)
1564 flags |= GL_SKIP; 1703 flags |= GL_SKIP;
1565 rs->rs_requested = requested; 1704 if (gfs2_assert_warn(sdp, requested))
1566 if (gfs2_assert_warn(sdp, requested)) { 1705 return -EINVAL;
1567 error = -EINVAL;
1568 goto out;
1569 }
1570 if (gfs2_rs_active(rs)) { 1706 if (gfs2_rs_active(rs)) {
1571 begin = rs->rs_rgd; 1707 begin = rs->rs_rbm.rgd;
1572 flags = 0; /* Yoda: Do or do not. There is no try */ 1708 flags = 0; /* Yoda: Do or do not. There is no try */
1573 } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) { 1709 } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
1574 rs->rs_rgd = begin = ip->i_rgd; 1710 rs->rs_rbm.rgd = begin = ip->i_rgd;
1575 } else { 1711 } else {
1576 rs->rs_rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); 1712 rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
1577 } 1713 }
1578 if (rs->rs_rgd == NULL) 1714 if (rs->rs_rbm.rgd == NULL)
1579 return -EBADSLT; 1715 return -EBADSLT;
1580 1716
1581 while (loops < 3) { 1717 while (loops < 3) {
1582 rg_locked = 0; 1718 rg_locked = 1;
1583 1719
1584 if (gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl)) { 1720 if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
1585 rg_locked = 1; 1721 rg_locked = 0;
1586 error = 0; 1722 error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
1587 } else if (!loops && !gfs2_rs_active(rs) &&
1588 rs->rs_rgd->rd_rs_cnt > RGRP_RSRV_MAX_CONTENDERS) {
1589 /* If the rgrp already is maxed out for contenders,
1590 we can eliminate it as a "first pass" without even
1591 requesting the rgrp glock. */
1592 error = GLR_TRYFAILED;
1593 } else {
1594 error = gfs2_glock_nq_init(rs->rs_rgd->rd_gl,
1595 LM_ST_EXCLUSIVE, flags, 1723 LM_ST_EXCLUSIVE, flags,
1596 &rs->rs_rgd_gh); 1724 &rs->rs_rgd_gh);
1597 if (!error && sdp->sd_args.ar_rgrplvb) { 1725 if (error == GLR_TRYFAILED)
1598 error = update_rgrp_lvb(rs->rs_rgd); 1726 goto next_rgrp;
1599 if (error) { 1727 if (unlikely(error))
1728 return error;
1729 if (sdp->sd_args.ar_rgrplvb) {
1730 error = update_rgrp_lvb(rs->rs_rbm.rgd);
1731 if (unlikely(error)) {
1600 gfs2_glock_dq_uninit(&rs->rs_rgd_gh); 1732 gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
1601 return error; 1733 return error;
1602 } 1734 }
1603 } 1735 }
1604 } 1736 }
1605 switch (error) {
1606 case 0:
1607 if (gfs2_rs_active(rs)) {
1608 if (unclaimed_blocks(rs->rs_rgd) +
1609 rs->rs_free >= rs->rs_requested) {
1610 ip->i_rgd = rs->rs_rgd;
1611 return 0;
1612 }
1613 /* We have a multi-block reservation, but the
1614 rgrp doesn't have enough free blocks to
1615 satisfy the request. Free the reservation
1616 and look for a suitable rgrp. */
1617 gfs2_rs_deltree(rs);
1618 }
1619 if (try_rgrp_fit(rs->rs_rgd, ip)) {
1620 if (sdp->sd_args.ar_rgrplvb)
1621 gfs2_rgrp_bh_get(rs->rs_rgd);
1622 ip->i_rgd = rs->rs_rgd;
1623 return 0;
1624 }
1625 if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK) {
1626 if (sdp->sd_args.ar_rgrplvb)
1627 gfs2_rgrp_bh_get(rs->rs_rgd);
1628 try_rgrp_unlink(rs->rs_rgd, &last_unlinked,
1629 ip->i_no_addr);
1630 }
1631 if (!rg_locked)
1632 gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
1633 /* fall through */
1634 case GLR_TRYFAILED:
1635 rs->rs_rgd = gfs2_rgrpd_get_next(rs->rs_rgd);
1636 rs->rs_rgd = rs->rs_rgd ? : begin; /* if NULL, wrap */
1637 if (rs->rs_rgd != begin) /* If we didn't wrap */
1638 break;
1639 1737
1640 flags &= ~LM_FLAG_TRY; 1738 /* Skip unuseable resource groups */
1641 loops++; 1739 if (rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
1642 /* Check that fs hasn't grown if writing to rindex */ 1740 goto skip_rgrp;
1643 if (ip == GFS2_I(sdp->sd_rindex) && 1741
1644 !sdp->sd_rindex_uptodate) { 1742 if (sdp->sd_args.ar_rgrplvb)
1645 error = gfs2_ri_update(ip); 1743 gfs2_rgrp_bh_get(rs->rs_rbm.rgd);
1646 if (error) 1744
1647 goto out; 1745 /* Get a reservation if we don't already have one */
1648 } else if (loops == 2) 1746 if (!gfs2_rs_active(rs))
1649 /* Flushing the log may release space */ 1747 rg_mblk_search(rs->rs_rbm.rgd, ip, requested);
1650 gfs2_log_flush(sdp, NULL); 1748
1651 break; 1749 /* Skip rgrps when we can't get a reservation on first pass */
1652 default: 1750 if (!gfs2_rs_active(rs) && (loops < 1))
1653 goto out; 1751 goto check_rgrp;
1752
1753 /* If rgrp has enough free space, use it */
1754 if (rs->rs_rbm.rgd->rd_free_clone >= requested) {
1755 ip->i_rgd = rs->rs_rbm.rgd;
1756 return 0;
1757 }
1758
1759 /* Drop reservation, if we couldn't use reserved rgrp */
1760 if (gfs2_rs_active(rs))
1761 gfs2_rs_deltree(ip, rs);
1762check_rgrp:
1763 /* Check for unlinked inodes which can be reclaimed */
1764 if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
1765 try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
1766 ip->i_no_addr);
1767skip_rgrp:
1768 /* Unlock rgrp if required */
1769 if (!rg_locked)
1770 gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
1771next_rgrp:
1772 /* Find the next rgrp, and continue looking */
1773 if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
1774 continue;
1775
1776 /* If we've scanned all the rgrps, but found no free blocks
1777 * then this checks for some less likely conditions before
1778 * trying again.
1779 */
1780 flags &= ~LM_FLAG_TRY;
1781 loops++;
1782 /* Check that fs hasn't grown if writing to rindex */
1783 if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
1784 error = gfs2_ri_update(ip);
1785 if (error)
1786 return error;
1654 } 1787 }
1788 /* Flushing the log may release space */
1789 if (loops == 2)
1790 gfs2_log_flush(sdp, NULL);
1655 } 1791 }
1656 error = -ENOSPC;
1657 1792
1658out: 1793 return -ENOSPC;
1659 if (error)
1660 rs->rs_requested = 0;
1661 return error;
1662} 1794}
1663 1795
1664/** 1796/**
@@ -1672,15 +1804,8 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
1672{ 1804{
1673 struct gfs2_blkreserv *rs = ip->i_res; 1805 struct gfs2_blkreserv *rs = ip->i_res;
1674 1806
1675 if (!rs)
1676 return;
1677
1678 if (!rs->rs_free)
1679 gfs2_rs_deltree(rs);
1680
1681 if (rs->rs_rgd_gh.gh_gl) 1807 if (rs->rs_rgd_gh.gh_gl)
1682 gfs2_glock_dq_uninit(&rs->rs_rgd_gh); 1808 gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
1683 rs->rs_requested = 0;
1684} 1809}
1685 1810
1686/** 1811/**
@@ -1693,173 +1818,47 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
1693 1818
1694static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) 1819static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
1695{ 1820{
1696 struct gfs2_bitmap *bi = NULL; 1821 struct gfs2_rbm rbm = { .rgd = rgd, };
1697 u32 length, rgrp_block, buf_block; 1822 int ret;
1698 unsigned int buf;
1699 unsigned char type;
1700
1701 length = rgd->rd_length;
1702 rgrp_block = block - rgd->rd_data0;
1703
1704 for (buf = 0; buf < length; buf++) {
1705 bi = rgd->rd_bits + buf;
1706 if (rgrp_block < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1707 break;
1708 }
1709 1823
1710 gfs2_assert(rgd->rd_sbd, buf < length); 1824 ret = gfs2_rbm_from_block(&rbm, block);
1711 buf_block = rgrp_block - bi->bi_start * GFS2_NBBY; 1825 WARN_ON_ONCE(ret != 0);
1712 1826
1713 type = gfs2_testbit(rgd, bi->bi_bh->b_data + bi->bi_offset, 1827 return gfs2_testbit(&rbm);
1714 bi->bi_len, buf_block);
1715
1716 return type;
1717} 1828}
1718 1829
1719/**
1720 * rgblk_search - find a block in @state
1721 * @rgd: the resource group descriptor
1722 * @goal: the goal block within the RG (start here to search for avail block)
1723 * @state: GFS2_BLKST_XXX the before-allocation state to find
1724 * @rbi: address of the pointer to the bitmap containing the block found
1725 *
1726 * Walk rgrp's bitmap to find bits that represent a block in @state.
1727 *
1728 * This function never fails, because we wouldn't call it unless we
1729 * know (from reservation results, etc.) that a block is available.
1730 *
1731 * Scope of @goal is just within rgrp, not the whole filesystem.
1732 * Scope of @returned block is just within bitmap, not the whole filesystem.
1733 *
1734 * Returns: the block number found relative to the bitmap rbi
1735 */
1736
1737static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, unsigned char state,
1738 struct gfs2_bitmap **rbi)
1739{
1740 struct gfs2_bitmap *bi = NULL;
1741 const u32 length = rgd->rd_length;
1742 u32 biblk = BFITNOENT;
1743 unsigned int buf, x;
1744 const u8 *buffer = NULL;
1745
1746 *rbi = NULL;
1747 /* Find bitmap block that contains bits for goal block */
1748 for (buf = 0; buf < length; buf++) {
1749 bi = rgd->rd_bits + buf;
1750 /* Convert scope of "goal" from rgrp-wide to within found bit block */
1751 if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) {
1752 goal -= bi->bi_start * GFS2_NBBY;
1753 goto do_search;
1754 }
1755 }
1756 buf = 0;
1757 goal = 0;
1758
1759do_search:
1760 /* Search (up to entire) bitmap in this rgrp for allocatable block.
1761 "x <= length", instead of "x < length", because we typically start
1762 the search in the middle of a bit block, but if we can't find an
1763 allocatable block anywhere else, we want to be able wrap around and
1764 search in the first part of our first-searched bit block. */
1765 for (x = 0; x <= length; x++) {
1766 bi = rgd->rd_bits + buf;
1767
1768 if (test_bit(GBF_FULL, &bi->bi_flags) &&
1769 (state == GFS2_BLKST_FREE))
1770 goto skip;
1771
1772 /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
1773 bitmaps, so we must search the originals for that. */
1774 buffer = bi->bi_bh->b_data + bi->bi_offset;
1775 WARN_ON(!buffer_uptodate(bi->bi_bh));
1776 if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1777 buffer = bi->bi_clone + bi->bi_offset;
1778
1779 while (1) {
1780 struct gfs2_blkreserv *rs;
1781 u32 rgblk;
1782
1783 biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
1784 if (biblk == BFITNOENT)
1785 break;
1786 /* Check if this block is reserved() */
1787 rgblk = gfs2_bi2rgd_blk(bi, biblk);
1788 rs = rs_find(rgd, rgblk);
1789 if (rs == NULL)
1790 break;
1791
1792 BUG_ON(rs->rs_bi != bi);
1793 biblk = BFITNOENT;
1794 /* This should jump to the first block after the
1795 reservation. */
1796 goal = rs->rs_biblk + rs->rs_free;
1797 if (goal >= bi->bi_len * GFS2_NBBY)
1798 break;
1799 }
1800 if (biblk != BFITNOENT)
1801 break;
1802
1803 if ((goal == 0) && (state == GFS2_BLKST_FREE))
1804 set_bit(GBF_FULL, &bi->bi_flags);
1805
1806 /* Try next bitmap block (wrap back to rgrp header if at end) */
1807skip:
1808 buf++;
1809 buf %= length;
1810 goal = 0;
1811 }
1812
1813 if (biblk != BFITNOENT)
1814 *rbi = bi;
1815
1816 return biblk;
1817}
1818 1830
1819/** 1831/**
1820 * gfs2_alloc_extent - allocate an extent from a given bitmap 1832 * gfs2_alloc_extent - allocate an extent from a given bitmap
1821 * @rgd: the resource group descriptor 1833 * @rbm: the resource group information
1822 * @bi: the bitmap within the rgrp
1823 * @blk: the block within the bitmap
1824 * @dinode: TRUE if the first block we allocate is for a dinode 1834 * @dinode: TRUE if the first block we allocate is for a dinode
1825 * @n: The extent length 1835 * @n: The extent length (value/result)
1826 * 1836 *
1827 * Add the found bitmap buffer to the transaction. 1837 * Add the bitmap buffer to the transaction.
1828 * Set the found bits to @new_state to change block's allocation state. 1838 * Set the found bits to @new_state to change block's allocation state.
1829 * Returns: starting block number of the extent (fs scope)
1830 */ 1839 */
1831static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi, 1840static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
1832 u32 blk, bool dinode, unsigned int *n) 1841 unsigned int *n)
1833{ 1842{
1843 struct gfs2_rbm pos = { .rgd = rbm->rgd, };
1834 const unsigned int elen = *n; 1844 const unsigned int elen = *n;
1835 u32 goal, rgblk; 1845 u64 block;
1836 const u8 *buffer = NULL; 1846 int ret;
1837 struct gfs2_blkreserv *rs; 1847
1838 1848 *n = 1;
1839 *n = 0; 1849 block = gfs2_rbm_to_block(rbm);
1840 buffer = bi->bi_bh->b_data + bi->bi_offset; 1850 gfs2_trans_add_bh(rbm->rgd->rd_gl, rbm->bi->bi_bh, 1);
1841 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); 1851 gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
1842 gfs2_setbit(rgd, bi->bi_clone, bi, blk, 1852 block++;
1843 dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
1844 (*n)++;
1845 goal = blk;
1846 while (*n < elen) { 1853 while (*n < elen) {
1847 goal++; 1854 ret = gfs2_rbm_from_block(&pos, block);
1848 if (goal >= (bi->bi_len * GFS2_NBBY)) 1855 if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE)
1849 break;
1850 rgblk = gfs2_bi2rgd_blk(bi, goal);
1851 rs = rs_find(rgd, rgblk);
1852 if (rs) /* Oops, we bumped into someone's reservation */
1853 break;
1854 if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
1855 GFS2_BLKST_FREE)
1856 break; 1856 break;
1857 gfs2_setbit(rgd, bi->bi_clone, bi, goal, GFS2_BLKST_USED); 1857 gfs2_trans_add_bh(pos.rgd->rd_gl, pos.bi->bi_bh, 1);
1858 gfs2_setbit(&pos, true, GFS2_BLKST_USED);
1858 (*n)++; 1859 (*n)++;
1860 block++;
1859 } 1861 }
1860 blk = gfs2_bi2rgd_blk(bi, blk);
1861 rgd->rd_last_alloc = blk + *n - 1;
1862 return rgd->rd_data0 + blk;
1863} 1862}
1864 1863
1865/** 1864/**
@@ -1875,46 +1874,30 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
1875static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, 1874static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
1876 u32 blen, unsigned char new_state) 1875 u32 blen, unsigned char new_state)
1877{ 1876{
1878 struct gfs2_rgrpd *rgd; 1877 struct gfs2_rbm rbm;
1879 struct gfs2_bitmap *bi = NULL;
1880 u32 length, rgrp_blk, buf_blk;
1881 unsigned int buf;
1882 1878
1883 rgd = gfs2_blk2rgrpd(sdp, bstart, 1); 1879 rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1);
1884 if (!rgd) { 1880 if (!rbm.rgd) {
1885 if (gfs2_consist(sdp)) 1881 if (gfs2_consist(sdp))
1886 fs_err(sdp, "block = %llu\n", (unsigned long long)bstart); 1882 fs_err(sdp, "block = %llu\n", (unsigned long long)bstart);
1887 return NULL; 1883 return NULL;
1888 } 1884 }
1889 1885
1890 length = rgd->rd_length;
1891
1892 rgrp_blk = bstart - rgd->rd_data0;
1893
1894 while (blen--) { 1886 while (blen--) {
1895 for (buf = 0; buf < length; buf++) { 1887 gfs2_rbm_from_block(&rbm, bstart);
1896 bi = rgd->rd_bits + buf; 1888 bstart++;
1897 if (rgrp_blk < (bi->bi_start + bi->bi_len) * GFS2_NBBY) 1889 if (!rbm.bi->bi_clone) {
1898 break; 1890 rbm.bi->bi_clone = kmalloc(rbm.bi->bi_bh->b_size,
1899 } 1891 GFP_NOFS | __GFP_NOFAIL);
1900 1892 memcpy(rbm.bi->bi_clone + rbm.bi->bi_offset,
1901 gfs2_assert(rgd->rd_sbd, buf < length); 1893 rbm.bi->bi_bh->b_data + rbm.bi->bi_offset,
1902 1894 rbm.bi->bi_len);
1903 buf_blk = rgrp_blk - bi->bi_start * GFS2_NBBY;
1904 rgrp_blk++;
1905
1906 if (!bi->bi_clone) {
1907 bi->bi_clone = kmalloc(bi->bi_bh->b_size,
1908 GFP_NOFS | __GFP_NOFAIL);
1909 memcpy(bi->bi_clone + bi->bi_offset,
1910 bi->bi_bh->b_data + bi->bi_offset,
1911 bi->bi_len);
1912 } 1895 }
1913 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); 1896 gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.bi->bi_bh, 1);
1914 gfs2_setbit(rgd, NULL, bi, buf_blk, new_state); 1897 gfs2_setbit(&rbm, false, new_state);
1915 } 1898 }
1916 1899
1917 return rgd; 1900 return rbm.rgd;
1918} 1901}
1919 1902
1920/** 1903/**
@@ -1956,56 +1939,41 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
1956} 1939}
1957 1940
1958/** 1941/**
1959 * claim_reserved_blks - Claim previously reserved blocks 1942 * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
1960 * @ip: the inode that's claiming the reservation 1943 * @ip: The inode we have just allocated blocks for
1961 * @dinode: 1 if this block is a dinode block, otherwise data block 1944 * @rbm: The start of the allocated blocks
1962 * @nblocks: desired extent length 1945 * @len: The extent length
1963 * 1946 *
1964 * Lay claim to previously allocated block reservation blocks. 1947 * Adjusts a reservation after an allocation has taken place. If the
1965 * Returns: Starting block number of the blocks claimed. 1948 * reservation does not match the allocation, or if it is now empty
1966 * Sets *nblocks to the actual extent length allocated. 1949 * then it is removed.
1967 */ 1950 */
1968static u64 claim_reserved_blks(struct gfs2_inode *ip, bool dinode, 1951
1969 unsigned int *nblocks) 1952static void gfs2_adjust_reservation(struct gfs2_inode *ip,
1953 const struct gfs2_rbm *rbm, unsigned len)
1970{ 1954{
1971 struct gfs2_blkreserv *rs = ip->i_res; 1955 struct gfs2_blkreserv *rs = ip->i_res;
1972 struct gfs2_rgrpd *rgd = rs->rs_rgd; 1956 struct gfs2_rgrpd *rgd = rbm->rgd;
1973 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1957 unsigned rlen;
1974 struct gfs2_bitmap *bi; 1958 u64 block;
1975 u64 start_block = gfs2_rs_startblk(rs); 1959 int ret;
1976 const unsigned int elen = *nblocks;
1977
1978 /*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
1979 gfs2_assert_withdraw(sdp, rgd);
1980 /*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
1981 bi = rs->rs_bi;
1982 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1983
1984 for (*nblocks = 0; *nblocks < elen && rs->rs_free; (*nblocks)++) {
1985 /* Make sure the bitmap hasn't changed */
1986 gfs2_setbit(rgd, bi->bi_clone, bi, rs->rs_biblk,
1987 dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
1988 rs->rs_biblk++;
1989 rs->rs_free--;
1990
1991 BUG_ON(!rgd->rd_reserved);
1992 rgd->rd_reserved--;
1993 dinode = false;
1994 trace_gfs2_rs(ip, rs, TRACE_RS_CLAIM);
1995 }
1996
1997 if (!rs->rs_free) {
1998 struct gfs2_rgrpd *rgd = ip->i_res->rs_rgd;
1999 1960
2000 gfs2_rs_deltree(rs); 1961 spin_lock(&rgd->rd_rsspin);
2001 /* -nblocks because we haven't returned to do the math yet. 1962 if (gfs2_rs_active(rs)) {
2002 I'm doing the math backwards to prevent negative numbers, 1963 if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) {
2003 but think of it as: 1964 block = gfs2_rbm_to_block(rbm);
2004 if (unclaimed_blocks(rgd) - *nblocks >= RGRP_RSRV_MINBLKS */ 1965 ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len);
2005 if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS + *nblocks) 1966 rlen = min(rs->rs_free, len);
2006 rg_mblk_search(rgd, ip); 1967 rs->rs_free -= rlen;
1968 rgd->rd_reserved -= rlen;
1969 trace_gfs2_rs(rs, TRACE_RS_CLAIM);
1970 if (rs->rs_free && !ret)
1971 goto out;
1972 }
1973 __rs_deltree(ip, rs);
2007 } 1974 }
2008 return start_block; 1975out:
1976 spin_unlock(&rgd->rd_rsspin);
2009} 1977}
2010 1978
2011/** 1979/**
@@ -2024,47 +1992,40 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2024{ 1992{
2025 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1993 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2026 struct buffer_head *dibh; 1994 struct buffer_head *dibh;
2027 struct gfs2_rgrpd *rgd; 1995 struct gfs2_rbm rbm = { .rgd = ip->i_rgd, };
2028 unsigned int ndata; 1996 unsigned int ndata;
2029 u32 goal, blk; /* block, within the rgrp scope */ 1997 u64 goal;
2030 u64 block; /* block, within the file system scope */ 1998 u64 block; /* block, within the file system scope */
2031 int error; 1999 int error;
2032 struct gfs2_bitmap *bi;
2033 2000
2034 /* Only happens if there is a bug in gfs2, return something distinctive 2001 if (gfs2_rs_active(ip->i_res))
2035 * to ensure that it is noticed. 2002 goal = gfs2_rbm_to_block(&ip->i_res->rs_rbm);
2036 */ 2003 else if (!dinode && rgrp_contains_block(rbm.rgd, ip->i_goal))
2037 if (ip->i_res->rs_requested == 0) 2004 goal = ip->i_goal;
2038 return -ECANCELED; 2005 else
2039 2006 goal = rbm.rgd->rd_last_alloc + rbm.rgd->rd_data0;
2040 /* Check if we have a multi-block reservation, and if so, claim the
2041 next free block from it. */
2042 if (gfs2_rs_active(ip->i_res)) {
2043 BUG_ON(!ip->i_res->rs_free);
2044 rgd = ip->i_res->rs_rgd;
2045 block = claim_reserved_blks(ip, dinode, nblocks);
2046 } else {
2047 rgd = ip->i_rgd;
2048 2007
2049 if (!dinode && rgrp_contains_block(rgd, ip->i_goal)) 2008 gfs2_rbm_from_block(&rbm, goal);
2050 goal = ip->i_goal - rgd->rd_data0; 2009 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, ip, false);
2051 else
2052 goal = rgd->rd_last_alloc;
2053
2054 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
2055
2056 /* Since all blocks are reserved in advance, this shouldn't
2057 happen */
2058 if (blk == BFITNOENT) {
2059 printk(KERN_WARNING "BFITNOENT, nblocks=%u\n",
2060 *nblocks);
2061 printk(KERN_WARNING "FULL=%d\n",
2062 test_bit(GBF_FULL, &rgd->rd_bits->bi_flags));
2063 goto rgrp_error;
2064 }
2065 2010
2066 block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks); 2011 if (error == -ENOSPC) {
2012 gfs2_rbm_from_block(&rbm, goal);
2013 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, NULL, false);
2014 }
2015
2016 /* Since all blocks are reserved in advance, this shouldn't happen */
2017 if (error) {
2018 fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d\n",
2019 (unsigned long long)ip->i_no_addr, error, *nblocks,
2020 test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags));
2021 goto rgrp_error;
2067 } 2022 }
2023
2024 gfs2_alloc_extent(&rbm, dinode, nblocks);
2025 block = gfs2_rbm_to_block(&rbm);
2026 rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
2027 if (gfs2_rs_active(ip->i_res))
2028 gfs2_adjust_reservation(ip, &rbm, *nblocks);
2068 ndata = *nblocks; 2029 ndata = *nblocks;
2069 if (dinode) 2030 if (dinode)
2070 ndata--; 2031 ndata--;
@@ -2081,22 +2042,22 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2081 brelse(dibh); 2042 brelse(dibh);
2082 } 2043 }
2083 } 2044 }
2084 if (rgd->rd_free < *nblocks) { 2045 if (rbm.rgd->rd_free < *nblocks) {
2085 printk(KERN_WARNING "nblocks=%u\n", *nblocks); 2046 printk(KERN_WARNING "nblocks=%u\n", *nblocks);
2086 goto rgrp_error; 2047 goto rgrp_error;
2087 } 2048 }
2088 2049
2089 rgd->rd_free -= *nblocks; 2050 rbm.rgd->rd_free -= *nblocks;
2090 if (dinode) { 2051 if (dinode) {
2091 rgd->rd_dinodes++; 2052 rbm.rgd->rd_dinodes++;
2092 *generation = rgd->rd_igeneration++; 2053 *generation = rbm.rgd->rd_igeneration++;
2093 if (*generation == 0) 2054 if (*generation == 0)
2094 *generation = rgd->rd_igeneration++; 2055 *generation = rbm.rgd->rd_igeneration++;
2095 } 2056 }
2096 2057
2097 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 2058 gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh, 1);
2098 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 2059 gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
2099 gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); 2060 gfs2_rgrp_ondisk2lvb(rbm.rgd->rd_rgl, rbm.rgd->rd_bits[0].bi_bh->b_data);
2100 2061
2101 gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0); 2062 gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2102 if (dinode) 2063 if (dinode)
@@ -2110,14 +2071,14 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2110 gfs2_quota_change(ip, ndata, ip->i_inode.i_uid, 2071 gfs2_quota_change(ip, ndata, ip->i_inode.i_uid,
2111 ip->i_inode.i_gid); 2072 ip->i_inode.i_gid);
2112 2073
2113 rgd->rd_free_clone -= *nblocks; 2074 rbm.rgd->rd_free_clone -= *nblocks;
2114 trace_gfs2_block_alloc(ip, rgd, block, *nblocks, 2075 trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
2115 dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); 2076 dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2116 *bn = block; 2077 *bn = block;
2117 return 0; 2078 return 0;
2118 2079
2119rgrp_error: 2080rgrp_error:
2120 gfs2_rgrp_error(rgd); 2081 gfs2_rgrp_error(rbm.rgd);
2121 return -EIO; 2082 return -EIO;
2122} 2083}
2123 2084
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index ca6e26729b86..24077958dcf6 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -46,7 +46,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
46 bool dinode, u64 *generation); 46 bool dinode, u64 *generation);
47 47
48extern int gfs2_rs_alloc(struct gfs2_inode *ip); 48extern int gfs2_rs_alloc(struct gfs2_inode *ip);
49extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); 49extern void gfs2_rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs);
50extern void gfs2_rs_delete(struct gfs2_inode *ip); 50extern void gfs2_rs_delete(struct gfs2_inode *ip);
51extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta); 51extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
52extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); 52extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
@@ -73,30 +73,10 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
73 const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); 73 const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
74extern int gfs2_fitrim(struct file *filp, void __user *argp); 74extern int gfs2_fitrim(struct file *filp, void __user *argp);
75 75
76/* This is how to tell if a multi-block reservation is "inplace" reserved: */ 76/* This is how to tell if a reservation is in the rgrp tree: */
77static inline int gfs2_mb_reserved(struct gfs2_inode *ip) 77static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs)
78{ 78{
79 if (ip->i_res && ip->i_res->rs_requested) 79 return rs && !RB_EMPTY_NODE(&rs->rs_node);
80 return 1;
81 return 0;
82}
83
84/* This is how to tell if a multi-block reservation is in the rgrp tree: */
85static inline int gfs2_rs_active(struct gfs2_blkreserv *rs)
86{
87 if (rs && rs->rs_bi)
88 return 1;
89 return 0;
90}
91
92static inline u32 gfs2_bi2rgd_blk(const struct gfs2_bitmap *bi, u32 blk)
93{
94 return (bi->bi_start * GFS2_NBBY) + blk;
95}
96
97static inline u64 gfs2_rs_startblk(const struct gfs2_blkreserv *rs)
98{
99 return gfs2_bi2rgd_blk(rs->rs_bi, rs->rs_biblk) + rs->rs_rgd->rd_data0;
100} 80}
101 81
102#endif /* __RGRP_DOT_H__ */ 82#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index fc3168f47a14..bc737261f234 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1366,6 +1366,8 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
1366 val = sdp->sd_tune.gt_statfs_quantum; 1366 val = sdp->sd_tune.gt_statfs_quantum;
1367 if (val != 30) 1367 if (val != 30)
1368 seq_printf(s, ",statfs_quantum=%d", val); 1368 seq_printf(s, ",statfs_quantum=%d", val);
1369 else if (sdp->sd_tune.gt_statfs_slow)
1370 seq_puts(s, ",statfs_quantum=0");
1369 val = sdp->sd_tune.gt_quota_quantum; 1371 val = sdp->sd_tune.gt_quota_quantum;
1370 if (val != 60) 1372 if (val != 60)
1371 seq_printf(s, ",quota_quantum=%d", val); 1373 seq_printf(s, ",quota_quantum=%d", val);
@@ -1543,6 +1545,11 @@ static void gfs2_evict_inode(struct inode *inode)
1543 1545
1544out_truncate: 1546out_truncate:
1545 gfs2_log_flush(sdp, ip->i_gl); 1547 gfs2_log_flush(sdp, ip->i_gl);
1548 if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) {
1549 struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
1550 filemap_fdatawrite(metamapping);
1551 filemap_fdatawait(metamapping);
1552 }
1546 write_inode_now(inode, 1); 1553 write_inode_now(inode, 1);
1547 gfs2_ail_flush(ip->i_gl, 0); 1554 gfs2_ail_flush(ip->i_gl, 0);
1548 1555
@@ -1557,7 +1564,7 @@ out_truncate:
1557out_unlock: 1564out_unlock:
1558 /* Error path for case 1 */ 1565 /* Error path for case 1 */
1559 if (gfs2_rs_active(ip->i_res)) 1566 if (gfs2_rs_active(ip->i_res))
1560 gfs2_rs_deltree(ip->i_res); 1567 gfs2_rs_deltree(ip, ip->i_res);
1561 1568
1562 if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) 1569 if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
1563 gfs2_glock_dq(&ip->i_iopen_gh); 1570 gfs2_glock_dq(&ip->i_iopen_gh);
@@ -1572,7 +1579,7 @@ out:
1572 clear_inode(inode); 1579 clear_inode(inode);
1573 gfs2_dir_hash_inval(ip); 1580 gfs2_dir_hash_inval(ip);
1574 ip->i_gl->gl_object = NULL; 1581 ip->i_gl->gl_object = NULL;
1575 flush_delayed_work_sync(&ip->i_gl->gl_work); 1582 flush_delayed_work(&ip->i_gl->gl_work);
1576 gfs2_glock_add_to_lru(ip->i_gl); 1583 gfs2_glock_add_to_lru(ip->i_gl);
1577 gfs2_glock_put(ip->i_gl); 1584 gfs2_glock_put(ip->i_gl);
1578 ip->i_gl = NULL; 1585 ip->i_gl = NULL;
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index a25c252fe412..bbdc78af60ca 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -509,10 +509,9 @@ TRACE_EVENT(gfs2_block_alloc,
509/* Keep track of multi-block reservations as they are allocated/freed */ 509/* Keep track of multi-block reservations as they are allocated/freed */
510TRACE_EVENT(gfs2_rs, 510TRACE_EVENT(gfs2_rs,
511 511
512 TP_PROTO(const struct gfs2_inode *ip, const struct gfs2_blkreserv *rs, 512 TP_PROTO(const struct gfs2_blkreserv *rs, u8 func),
513 u8 func),
514 513
515 TP_ARGS(ip, rs, func), 514 TP_ARGS(rs, func),
516 515
517 TP_STRUCT__entry( 516 TP_STRUCT__entry(
518 __field( dev_t, dev ) 517 __field( dev_t, dev )
@@ -526,18 +525,17 @@ TRACE_EVENT(gfs2_rs,
526 ), 525 ),
527 526
528 TP_fast_assign( 527 TP_fast_assign(
529 __entry->dev = rs->rs_rgd ? rs->rs_rgd->rd_sbd->sd_vfs->s_dev : 0; 528 __entry->dev = rs->rs_rbm.rgd->rd_sbd->sd_vfs->s_dev;
530 __entry->rd_addr = rs->rs_rgd ? rs->rs_rgd->rd_addr : 0; 529 __entry->rd_addr = rs->rs_rbm.rgd->rd_addr;
531 __entry->rd_free_clone = rs->rs_rgd ? rs->rs_rgd->rd_free_clone : 0; 530 __entry->rd_free_clone = rs->rs_rbm.rgd->rd_free_clone;
532 __entry->rd_reserved = rs->rs_rgd ? rs->rs_rgd->rd_reserved : 0; 531 __entry->rd_reserved = rs->rs_rbm.rgd->rd_reserved;
533 __entry->inum = ip ? ip->i_no_addr : 0; 532 __entry->inum = rs->rs_inum;
534 __entry->start = gfs2_rs_startblk(rs); 533 __entry->start = gfs2_rbm_to_block(&rs->rs_rbm);
535 __entry->free = rs->rs_free; 534 __entry->free = rs->rs_free;
536 __entry->func = func; 535 __entry->func = func;
537 ), 536 ),
538 537
539 TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%lu rr:%lu %s " 538 TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%lu rr:%lu %s f:%lu",
540 "f:%lu",
541 MAJOR(__entry->dev), MINOR(__entry->dev), 539 MAJOR(__entry->dev), MINOR(__entry->dev),
542 (unsigned long long)__entry->inum, 540 (unsigned long long)__entry->inum,
543 (unsigned long long)__entry->start, 541 (unsigned long long)__entry->start,
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index 41f42cdccbb8..bf2ae9aeee7a 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -28,11 +28,10 @@ struct gfs2_glock;
28 28
29/* reserve either the number of blocks to be allocated plus the rg header 29/* reserve either the number of blocks to be allocated plus the rg header
30 * block, or all of the blocks in the rg, whichever is smaller */ 30 * block, or all of the blocks in the rg, whichever is smaller */
31static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip) 31static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip, unsigned requested)
32{ 32{
33 const struct gfs2_blkreserv *rs = ip->i_res; 33 if (requested < ip->i_rgd->rd_length)
34 if (rs && rs->rs_requested < ip->i_rgd->rd_length) 34 return requested + 1;
35 return rs->rs_requested + 1;
36 return ip->i_rgd->rd_length; 35 return ip->i_rgd->rd_length;
37} 36}
38 37
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 27a0b4a901f5..db330e5518cd 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -448,17 +448,18 @@ ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
448} 448}
449 449
450/** 450/**
451 * ea_get_unstuffed - actually copies the unstuffed data into the 451 * ea_iter_unstuffed - copies the unstuffed xattr data to/from the
452 * request buffer 452 * request buffer
453 * @ip: The GFS2 inode 453 * @ip: The GFS2 inode
454 * @ea: The extended attribute header structure 454 * @ea: The extended attribute header structure
455 * @data: The data to be copied 455 * @din: The data to be copied in
456 * @dout: The data to be copied out (one of din,dout will be NULL)
456 * 457 *
457 * Returns: errno 458 * Returns: errno
458 */ 459 */
459 460
460static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, 461static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
461 char *data) 462 const char *din, char *dout)
462{ 463{
463 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 464 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
464 struct buffer_head **bh; 465 struct buffer_head **bh;
@@ -467,6 +468,8 @@ static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
467 __be64 *dataptrs = GFS2_EA2DATAPTRS(ea); 468 __be64 *dataptrs = GFS2_EA2DATAPTRS(ea);
468 unsigned int x; 469 unsigned int x;
469 int error = 0; 470 int error = 0;
471 unsigned char *pos;
472 unsigned cp_size;
470 473
471 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS); 474 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
472 if (!bh) 475 if (!bh)
@@ -497,12 +500,21 @@ static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
497 goto out; 500 goto out;
498 } 501 }
499 502
500 memcpy(data, bh[x]->b_data + sizeof(struct gfs2_meta_header), 503 pos = bh[x]->b_data + sizeof(struct gfs2_meta_header);
501 (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize); 504 cp_size = (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize;
502 505
503 amount -= sdp->sd_jbsize; 506 if (dout) {
504 data += sdp->sd_jbsize; 507 memcpy(dout, pos, cp_size);
508 dout += sdp->sd_jbsize;
509 }
510
511 if (din) {
512 gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
513 memcpy(pos, din, cp_size);
514 din += sdp->sd_jbsize;
515 }
505 516
517 amount -= sdp->sd_jbsize;
506 brelse(bh[x]); 518 brelse(bh[x]);
507 } 519 }
508 520
@@ -523,7 +535,7 @@ static int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
523 memcpy(data, GFS2_EA2DATA(el->el_ea), len); 535 memcpy(data, GFS2_EA2DATA(el->el_ea), len);
524 return len; 536 return len;
525 } 537 }
526 ret = ea_get_unstuffed(ip, el->el_ea, data); 538 ret = gfs2_iter_unstuffed(ip, el->el_ea, NULL, data);
527 if (ret < 0) 539 if (ret < 0)
528 return ret; 540 return ret;
529 return len; 541 return len;
@@ -727,7 +739,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
727 goto out_gunlock_q; 739 goto out_gunlock_q;
728 740
729 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), 741 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
730 blks + gfs2_rg_blocks(ip) + 742 blks + gfs2_rg_blocks(ip, blks) +
731 RES_DINODE + RES_STATFS + RES_QUOTA, 0); 743 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
732 if (error) 744 if (error)
733 goto out_ipres; 745 goto out_ipres;
@@ -1220,69 +1232,23 @@ static int gfs2_xattr_set(struct dentry *dentry, const char *name,
1220 size, flags, type); 1232 size, flags, type);
1221} 1233}
1222 1234
1235
1223static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip, 1236static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
1224 struct gfs2_ea_header *ea, char *data) 1237 struct gfs2_ea_header *ea, char *data)
1225{ 1238{
1226 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1239 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1227 struct buffer_head **bh;
1228 unsigned int amount = GFS2_EA_DATA_LEN(ea); 1240 unsigned int amount = GFS2_EA_DATA_LEN(ea);
1229 unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize); 1241 unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
1230 __be64 *dataptrs = GFS2_EA2DATAPTRS(ea); 1242 int ret;
1231 unsigned int x;
1232 int error;
1233
1234 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
1235 if (!bh)
1236 return -ENOMEM;
1237
1238 error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
1239 if (error)
1240 goto out;
1241
1242 for (x = 0; x < nptrs; x++) {
1243 error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
1244 bh + x);
1245 if (error) {
1246 while (x--)
1247 brelse(bh[x]);
1248 goto fail;
1249 }
1250 dataptrs++;
1251 }
1252
1253 for (x = 0; x < nptrs; x++) {
1254 error = gfs2_meta_wait(sdp, bh[x]);
1255 if (error) {
1256 for (; x < nptrs; x++)
1257 brelse(bh[x]);
1258 goto fail;
1259 }
1260 if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
1261 for (; x < nptrs; x++)
1262 brelse(bh[x]);
1263 error = -EIO;
1264 goto fail;
1265 }
1266
1267 gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
1268
1269 memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header), data,
1270 (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
1271
1272 amount -= sdp->sd_jbsize;
1273 data += sdp->sd_jbsize;
1274
1275 brelse(bh[x]);
1276 }
1277 1243
1278out: 1244 ret = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
1279 kfree(bh); 1245 if (ret)
1280 return error; 1246 return ret;
1281 1247
1282fail: 1248 ret = gfs2_iter_unstuffed(ip, ea, data, NULL);
1283 gfs2_trans_end(sdp); 1249 gfs2_trans_end(sdp);
1284 kfree(bh); 1250
1285 return error; 1251 return ret;
1286} 1252}
1287 1253
1288int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) 1254int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 8275175acf6e..693df9fe52b2 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -134,8 +134,8 @@ struct hfs_sb_info {
134 permissions on all files */ 134 permissions on all files */
135 umode_t s_dir_umask; /* The umask applied to the 135 umode_t s_dir_umask; /* The umask applied to the
136 permissions on all dirs */ 136 permissions on all dirs */
137 uid_t s_uid; /* The uid of all files */ 137 kuid_t s_uid; /* The uid of all files */
138 gid_t s_gid; /* The gid of all files */ 138 kgid_t s_gid; /* The gid of all files */
139 139
140 int session, part; 140 int session, part;
141 struct nls_table *nls_io, *nls_disk; 141 struct nls_table *nls_io, *nls_disk;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index ee1bc55677f1..0b35903219bc 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -594,9 +594,9 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
594 594
595 /* no uig/gid changes and limit which mode bits can be set */ 595 /* no uig/gid changes and limit which mode bits can be set */
596 if (((attr->ia_valid & ATTR_UID) && 596 if (((attr->ia_valid & ATTR_UID) &&
597 (attr->ia_uid != hsb->s_uid)) || 597 (!uid_eq(attr->ia_uid, hsb->s_uid))) ||
598 ((attr->ia_valid & ATTR_GID) && 598 ((attr->ia_valid & ATTR_GID) &&
599 (attr->ia_gid != hsb->s_gid)) || 599 (!gid_eq(attr->ia_gid, hsb->s_gid))) ||
600 ((attr->ia_valid & ATTR_MODE) && 600 ((attr->ia_valid & ATTR_MODE) &&
601 ((S_ISDIR(inode->i_mode) && 601 ((S_ISDIR(inode->i_mode) &&
602 (attr->ia_mode != inode->i_mode)) || 602 (attr->ia_mode != inode->i_mode)) ||
@@ -644,7 +644,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end,
644 644
645 /* sync the superblock to buffers */ 645 /* sync the superblock to buffers */
646 sb = inode->i_sb; 646 sb = inode->i_sb;
647 flush_delayed_work_sync(&HFS_SB(sb)->mdb_work); 647 flush_delayed_work(&HFS_SB(sb)->mdb_work);
648 /* .. finally sync the buffers to disk */ 648 /* .. finally sync the buffers to disk */
649 err = sync_blockdev(sb->s_bdev); 649 err = sync_blockdev(sb->s_bdev);
650 if (!ret) 650 if (!ret)
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 4eb873e0c07b..e93ddaadfd1e 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -138,7 +138,9 @@ static int hfs_show_options(struct seq_file *seq, struct dentry *root)
138 seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator); 138 seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator);
139 if (sbi->s_type != cpu_to_be32(0x3f3f3f3f)) 139 if (sbi->s_type != cpu_to_be32(0x3f3f3f3f))
140 seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type); 140 seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type);
141 seq_printf(seq, ",uid=%u,gid=%u", sbi->s_uid, sbi->s_gid); 141 seq_printf(seq, ",uid=%u,gid=%u",
142 from_kuid_munged(&init_user_ns, sbi->s_uid),
143 from_kgid_munged(&init_user_ns, sbi->s_gid));
142 if (sbi->s_file_umask != 0133) 144 if (sbi->s_file_umask != 0133)
143 seq_printf(seq, ",file_umask=%o", sbi->s_file_umask); 145 seq_printf(seq, ",file_umask=%o", sbi->s_file_umask);
144 if (sbi->s_dir_umask != 0022) 146 if (sbi->s_dir_umask != 0022)
@@ -254,14 +256,22 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
254 printk(KERN_ERR "hfs: uid requires an argument\n"); 256 printk(KERN_ERR "hfs: uid requires an argument\n");
255 return 0; 257 return 0;
256 } 258 }
257 hsb->s_uid = (uid_t)tmp; 259 hsb->s_uid = make_kuid(current_user_ns(), (uid_t)tmp);
260 if (!uid_valid(hsb->s_uid)) {
261 printk(KERN_ERR "hfs: invalid uid %d\n", tmp);
262 return 0;
263 }
258 break; 264 break;
259 case opt_gid: 265 case opt_gid:
260 if (match_int(&args[0], &tmp)) { 266 if (match_int(&args[0], &tmp)) {
261 printk(KERN_ERR "hfs: gid requires an argument\n"); 267 printk(KERN_ERR "hfs: gid requires an argument\n");
262 return 0; 268 return 0;
263 } 269 }
264 hsb->s_gid = (gid_t)tmp; 270 hsb->s_gid = make_kgid(current_user_ns(), (gid_t)tmp);
271 if (!gid_valid(hsb->s_gid)) {
272 printk(KERN_ERR "hfs: invalid gid %d\n", tmp);
273 return 0;
274 }
265 break; 275 break;
266 case opt_umask: 276 case opt_umask:
267 if (match_octal(&args[0], &tmp)) { 277 if (match_octal(&args[0], &tmp)) {
@@ -482,6 +492,12 @@ static int __init init_hfs_fs(void)
482static void __exit exit_hfs_fs(void) 492static void __exit exit_hfs_fs(void)
483{ 493{
484 unregister_filesystem(&hfs_fs_type); 494 unregister_filesystem(&hfs_fs_type);
495
496 /*
497 * Make sure all delayed rcu free inodes are flushed before we
498 * destroy cache.
499 */
500 rcu_barrier();
485 kmem_cache_destroy(hfs_inode_cachep); 501 kmem_cache_destroy(hfs_inode_cachep);
486} 502}
487 503
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index ec2a9c23f0c9..798d9c4c5e71 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -80,8 +80,8 @@ void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms)
80 80
81 perms->userflags = HFSPLUS_I(inode)->userflags; 81 perms->userflags = HFSPLUS_I(inode)->userflags;
82 perms->mode = cpu_to_be16(inode->i_mode); 82 perms->mode = cpu_to_be16(inode->i_mode);
83 perms->owner = cpu_to_be32(inode->i_uid); 83 perms->owner = cpu_to_be32(i_uid_read(inode));
84 perms->group = cpu_to_be32(inode->i_gid); 84 perms->group = cpu_to_be32(i_gid_read(inode));
85 85
86 if (S_ISREG(inode->i_mode)) 86 if (S_ISREG(inode->i_mode))
87 perms->dev = cpu_to_be32(inode->i_nlink); 87 perms->dev = cpu_to_be32(inode->i_nlink);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 558dbb463a4e..c571de224b15 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -149,8 +149,8 @@ struct hfsplus_sb_info {
149 u32 type; 149 u32 type;
150 150
151 umode_t umask; 151 umode_t umask;
152 uid_t uid; 152 kuid_t uid;
153 gid_t gid; 153 kgid_t gid;
154 154
155 int part, session; 155 int part, session;
156 unsigned long flags; 156 unsigned long flags;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 3d8b4a675ba0..2172aa5976f5 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -233,12 +233,12 @@ static void hfsplus_get_perms(struct inode *inode,
233 233
234 mode = be16_to_cpu(perms->mode); 234 mode = be16_to_cpu(perms->mode);
235 235
236 inode->i_uid = be32_to_cpu(perms->owner); 236 i_uid_write(inode, be32_to_cpu(perms->owner));
237 if (!inode->i_uid && !mode) 237 if (!i_uid_read(inode) && !mode)
238 inode->i_uid = sbi->uid; 238 inode->i_uid = sbi->uid;
239 239
240 inode->i_gid = be32_to_cpu(perms->group); 240 i_gid_write(inode, be32_to_cpu(perms->group));
241 if (!inode->i_gid && !mode) 241 if (!i_gid_read(inode) && !mode)
242 inode->i_gid = sbi->gid; 242 inode->i_gid = sbi->gid;
243 243
244 if (dir) { 244 if (dir) {
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 06fa5618600c..ed257c671615 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -135,14 +135,22 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
135 printk(KERN_ERR "hfs: uid requires an argument\n"); 135 printk(KERN_ERR "hfs: uid requires an argument\n");
136 return 0; 136 return 0;
137 } 137 }
138 sbi->uid = (uid_t)tmp; 138 sbi->uid = make_kuid(current_user_ns(), (uid_t)tmp);
139 if (!uid_valid(sbi->uid)) {
140 printk(KERN_ERR "hfs: invalid uid specified\n");
141 return 0;
142 }
139 break; 143 break;
140 case opt_gid: 144 case opt_gid:
141 if (match_int(&args[0], &tmp)) { 145 if (match_int(&args[0], &tmp)) {
142 printk(KERN_ERR "hfs: gid requires an argument\n"); 146 printk(KERN_ERR "hfs: gid requires an argument\n");
143 return 0; 147 return 0;
144 } 148 }
145 sbi->gid = (gid_t)tmp; 149 sbi->gid = make_kgid(current_user_ns(), (gid_t)tmp);
150 if (!gid_valid(sbi->gid)) {
151 printk(KERN_ERR "hfs: invalid gid specified\n");
152 return 0;
153 }
146 break; 154 break;
147 case opt_part: 155 case opt_part:
148 if (match_int(&args[0], &sbi->part)) { 156 if (match_int(&args[0], &sbi->part)) {
@@ -215,7 +223,8 @@ int hfsplus_show_options(struct seq_file *seq, struct dentry *root)
215 if (sbi->type != HFSPLUS_DEF_CR_TYPE) 223 if (sbi->type != HFSPLUS_DEF_CR_TYPE)
216 seq_printf(seq, ",type=%.4s", (char *)&sbi->type); 224 seq_printf(seq, ",type=%.4s", (char *)&sbi->type);
217 seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, 225 seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask,
218 sbi->uid, sbi->gid); 226 from_kuid_munged(&init_user_ns, sbi->uid),
227 from_kgid_munged(&init_user_ns, sbi->gid));
219 if (sbi->part >= 0) 228 if (sbi->part >= 0)
220 seq_printf(seq, ",part=%u", sbi->part); 229 seq_printf(seq, ",part=%u", sbi->part);
221 if (sbi->session >= 0) 230 if (sbi->session >= 0)
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index fdafb2d71654..811a84d2d964 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -635,6 +635,12 @@ static int __init init_hfsplus_fs(void)
635static void __exit exit_hfsplus_fs(void) 635static void __exit exit_hfsplus_fs(void)
636{ 636{
637 unregister_filesystem(&hfsplus_fs_type); 637 unregister_filesystem(&hfsplus_fs_type);
638
639 /*
640 * Make sure all delayed rcu free inodes are flushed before we
641 * destroy cache.
642 */
643 rcu_barrier();
638 kmem_cache_destroy(hfsplus_inode_cachep); 644 kmem_cache_destroy(hfsplus_inode_cachep);
639} 645}
640 646
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 1fe731337f07..9c88da0e855a 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -1,7 +1,7 @@
1#ifndef __UM_FS_HOSTFS 1#ifndef __UM_FS_HOSTFS
2#define __UM_FS_HOSTFS 2#define __UM_FS_HOSTFS
3 3
4#include "os.h" 4#include <os.h>
5 5
6/* 6/*
7 * These are exactly the same definitions as in fs.h, but the names are 7 * These are exactly the same definitions as in fs.h, but the names are
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 124146543aa7..457addc5c91f 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -16,8 +16,8 @@
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include "hostfs.h" 18#include "hostfs.h"
19#include "init.h" 19#include <init.h>
20#include "kern.h" 20#include <kern.h>
21 21
22struct hostfs_inode_info { 22struct hostfs_inode_info {
23 int fd; 23 int fd;
@@ -542,8 +542,8 @@ static int read_name(struct inode *ino, char *name)
542 ino->i_ino = st.ino; 542 ino->i_ino = st.ino;
543 ino->i_mode = st.mode; 543 ino->i_mode = st.mode;
544 set_nlink(ino, st.nlink); 544 set_nlink(ino, st.nlink);
545 ino->i_uid = st.uid; 545 i_uid_write(ino, st.uid);
546 ino->i_gid = st.gid; 546 i_gid_write(ino, st.gid);
547 ino->i_atime = st.atime; 547 ino->i_atime = st.atime;
548 ino->i_mtime = st.mtime; 548 ino->i_mtime = st.mtime;
549 ino->i_ctime = st.ctime; 549 ino->i_ctime = st.ctime;
@@ -808,11 +808,11 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
808 } 808 }
809 if (attr->ia_valid & ATTR_UID) { 809 if (attr->ia_valid & ATTR_UID) {
810 attrs.ia_valid |= HOSTFS_ATTR_UID; 810 attrs.ia_valid |= HOSTFS_ATTR_UID;
811 attrs.ia_uid = attr->ia_uid; 811 attrs.ia_uid = from_kuid(&init_user_ns, attr->ia_uid);
812 } 812 }
813 if (attr->ia_valid & ATTR_GID) { 813 if (attr->ia_valid & ATTR_GID) {
814 attrs.ia_valid |= HOSTFS_ATTR_GID; 814 attrs.ia_valid |= HOSTFS_ATTR_GID;
815 attrs.ia_gid = attr->ia_gid; 815 attrs.ia_gid = from_kgid(&init_user_ns, attr->ia_gid);
816 } 816 }
817 if (attr->ia_valid & ATTR_SIZE) { 817 if (attr->ia_valid & ATTR_SIZE) {
818 attrs.ia_valid |= HOSTFS_ATTR_SIZE; 818 attrs.ia_valid |= HOSTFS_ATTR_SIZE;
@@ -848,9 +848,11 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
848 attr->ia_size != i_size_read(inode)) { 848 attr->ia_size != i_size_read(inode)) {
849 int error; 849 int error;
850 850
851 error = vmtruncate(inode, attr->ia_size); 851 error = inode_newsize_ok(inode, attr->ia_size);
852 if (err) 852 if (error)
853 return err; 853 return error;
854
855 truncate_setsize(inode, attr->ia_size);
854 } 856 }
855 857
856 setattr_copy(inode, attr); 858 setattr_copy(inode, attr);
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index a74ad0d371c2..67838f3aa20a 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -15,7 +15,6 @@
15#include <sys/types.h> 15#include <sys/types.h>
16#include <sys/vfs.h> 16#include <sys/vfs.h>
17#include "hostfs.h" 17#include "hostfs.h"
18#include "os.h"
19#include <utime.h> 18#include <utime.h>
20 19
21static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p) 20static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p)
diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c
index 4bae4a4a60b1..2d5b254ad9e2 100644
--- a/fs/hpfs/anode.c
+++ b/fs/hpfs/anode.c
@@ -102,7 +102,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
102 return -1; 102 return -1;
103 } 103 }
104 if (hpfs_alloc_if_possible(s, se = le32_to_cpu(btree->u.external[n].disk_secno) + le32_to_cpu(btree->u.external[n].length))) { 104 if (hpfs_alloc_if_possible(s, se = le32_to_cpu(btree->u.external[n].disk_secno) + le32_to_cpu(btree->u.external[n].length))) {
105 btree->u.external[n].length = cpu_to_le32(le32_to_cpu(btree->u.external[n].length) + 1); 105 le32_add_cpu(&btree->u.external[n].length, 1);
106 mark_buffer_dirty(bh); 106 mark_buffer_dirty(bh);
107 brelse(bh); 107 brelse(bh);
108 return se; 108 return se;
@@ -153,7 +153,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
153 btree = &anode->btree; 153 btree = &anode->btree;
154 } 154 }
155 btree->n_free_nodes--; n = btree->n_used_nodes++; 155 btree->n_free_nodes--; n = btree->n_used_nodes++;
156 btree->first_free = cpu_to_le16(le16_to_cpu(btree->first_free) + 12); 156 le16_add_cpu(&btree->first_free, 12);
157 btree->u.external[n].disk_secno = cpu_to_le32(se); 157 btree->u.external[n].disk_secno = cpu_to_le32(se);
158 btree->u.external[n].file_secno = cpu_to_le32(fs); 158 btree->u.external[n].file_secno = cpu_to_le32(fs);
159 btree->u.external[n].length = cpu_to_le32(1); 159 btree->u.external[n].length = cpu_to_le32(1);
@@ -174,7 +174,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
174 } 174 }
175 if (btree->n_free_nodes) { 175 if (btree->n_free_nodes) {
176 btree->n_free_nodes--; n = btree->n_used_nodes++; 176 btree->n_free_nodes--; n = btree->n_used_nodes++;
177 btree->first_free = cpu_to_le16(le16_to_cpu(btree->first_free) + 8); 177 le16_add_cpu(&btree->first_free, 8);
178 btree->u.internal[n].file_secno = cpu_to_le32(-1); 178 btree->u.internal[n].file_secno = cpu_to_le32(-1);
179 btree->u.internal[n].down = cpu_to_le32(na); 179 btree->u.internal[n].down = cpu_to_le32(na);
180 btree->u.internal[n-1].file_secno = cpu_to_le32(fs); 180 btree->u.internal[n-1].file_secno = cpu_to_le32(fs);
diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c
index 3228c524ebe5..4364b2a02c5d 100644
--- a/fs/hpfs/dnode.c
+++ b/fs/hpfs/dnode.c
@@ -145,10 +145,10 @@ static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno
145 } 145 }
146 } 146 }
147 if (ptr) { 147 if (ptr) {
148 d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) + 4); 148 le32_add_cpu(&d->first_free, 4);
149 if (le32_to_cpu(d->first_free) > 2048) { 149 if (le32_to_cpu(d->first_free) > 2048) {
150 hpfs_error(s, "set_last_pointer: too long dnode %08x", le32_to_cpu(d->self)); 150 hpfs_error(s, "set_last_pointer: too long dnode %08x", le32_to_cpu(d->self));
151 d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) - 4); 151 le32_add_cpu(&d->first_free, -4);
152 return; 152 return;
153 } 153 }
154 de->length = cpu_to_le16(36); 154 de->length = cpu_to_le16(36);
@@ -184,7 +184,7 @@ struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d,
184 de->not_8x3 = hpfs_is_name_long(name, namelen); 184 de->not_8x3 = hpfs_is_name_long(name, namelen);
185 de->namelen = namelen; 185 de->namelen = namelen;
186 memcpy(de->name, name, namelen); 186 memcpy(de->name, name, namelen);
187 d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) + d_size); 187 le32_add_cpu(&d->first_free, d_size);
188 return de; 188 return de;
189} 189}
190 190
@@ -314,7 +314,7 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
314 set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0); 314 set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0);
315 de = de_next_de(de); 315 de = de_next_de(de);
316 memmove((char *)nd + 20, de, le32_to_cpu(nd->first_free) + (char *)nd - (char *)de); 316 memmove((char *)nd + 20, de, le32_to_cpu(nd->first_free) + (char *)nd - (char *)de);
317 nd->first_free = cpu_to_le32(le32_to_cpu(nd->first_free) - ((char *)de - (char *)nd - 20)); 317 le32_add_cpu(&nd->first_free, -((char *)de - (char *)nd - 20));
318 memcpy(d, nd, le32_to_cpu(nd->first_free)); 318 memcpy(d, nd, le32_to_cpu(nd->first_free));
319 for_all_poss(i, hpfs_pos_del, (loff_t)dno << 4, pos); 319 for_all_poss(i, hpfs_pos_del, (loff_t)dno << 4, pos);
320 fix_up_ptrs(i->i_sb, ad); 320 fix_up_ptrs(i->i_sb, ad);
@@ -474,8 +474,8 @@ static secno move_to_top(struct inode *i, dnode_secno from, dnode_secno to)
474 hpfs_brelse4(&qbh); 474 hpfs_brelse4(&qbh);
475 return 0; 475 return 0;
476 } 476 }
477 dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) - 4); 477 le32_add_cpu(&dnode->first_free, -4);
478 de->length = cpu_to_le16(le16_to_cpu(de->length) - 4); 478 le16_add_cpu(&de->length, -4);
479 de->down = 0; 479 de->down = 0;
480 hpfs_mark_4buffers_dirty(&qbh); 480 hpfs_mark_4buffers_dirty(&qbh);
481 dno = up; 481 dno = up;
@@ -570,8 +570,8 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
570 for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, ((loff_t)up << 4) | p); 570 for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, ((loff_t)up << 4) | p);
571 if (!down) { 571 if (!down) {
572 de->down = 0; 572 de->down = 0;
573 de->length = cpu_to_le16(le16_to_cpu(de->length) - 4); 573 le16_add_cpu(&de->length, -4);
574 dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) - 4); 574 le32_add_cpu(&dnode->first_free, -4);
575 memmove(de_next_de(de), (char *)de_next_de(de) + 4, 575 memmove(de_next_de(de), (char *)de_next_de(de) + 4,
576 (char *)dnode + le32_to_cpu(dnode->first_free) - (char *)de_next_de(de)); 576 (char *)dnode + le32_to_cpu(dnode->first_free) - (char *)de_next_de(de));
577 } else { 577 } else {
@@ -647,14 +647,14 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
647 printk("HPFS: warning: unbalanced dnode tree, see hpfs.txt 4 more info\n"); 647 printk("HPFS: warning: unbalanced dnode tree, see hpfs.txt 4 more info\n");
648 printk("HPFS: warning: goin'on\n"); 648 printk("HPFS: warning: goin'on\n");
649 } 649 }
650 del->length = cpu_to_le16(le16_to_cpu(del->length) + 4); 650 le16_add_cpu(&del->length, 4);
651 del->down = 1; 651 del->down = 1;
652 d1->first_free = cpu_to_le32(le32_to_cpu(d1->first_free) + 4); 652 le32_add_cpu(&d1->first_free, 4);
653 } 653 }
654 if (dlp && !down) { 654 if (dlp && !down) {
655 del->length = cpu_to_le16(le16_to_cpu(del->length) - 4); 655 le16_add_cpu(&del->length, -4);
656 del->down = 0; 656 del->down = 0;
657 d1->first_free = cpu_to_le32(le32_to_cpu(d1->first_free) - 4); 657 le32_add_cpu(&d1->first_free, -4);
658 } else if (down) 658 } else if (down)
659 *(__le32 *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down); 659 *(__le32 *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down);
660 } else goto endm; 660 } else goto endm;
@@ -668,9 +668,9 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
668 memcpy(de_cp, de_prev, le16_to_cpu(de_prev->length)); 668 memcpy(de_cp, de_prev, le16_to_cpu(de_prev->length));
669 hpfs_delete_de(i->i_sb, dnode, de_prev); 669 hpfs_delete_de(i->i_sb, dnode, de_prev);
670 if (!de_prev->down) { 670 if (!de_prev->down) {
671 de_prev->length = cpu_to_le16(le16_to_cpu(de_prev->length) + 4); 671 le16_add_cpu(&de_prev->length, 4);
672 de_prev->down = 1; 672 de_prev->down = 1;
673 dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) + 4); 673 le32_add_cpu(&dnode->first_free, 4);
674 } 674 }
675 *(__le32 *) ((void *) de_prev + le16_to_cpu(de_prev->length) - 4) = cpu_to_le32(ndown); 675 *(__le32 *) ((void *) de_prev + le16_to_cpu(de_prev->length) - 4) = cpu_to_le32(ndown);
676 hpfs_mark_4buffers_dirty(&qbh); 676 hpfs_mark_4buffers_dirty(&qbh);
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index ac1ead194db5..7102aaecc244 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -63,8 +63,8 @@ struct hpfs_sb_info {
63 unsigned sb_dmap; /* sector number of dnode bit map */ 63 unsigned sb_dmap; /* sector number of dnode bit map */
64 unsigned sb_n_free; /* free blocks for statfs, or -1 */ 64 unsigned sb_n_free; /* free blocks for statfs, or -1 */
65 unsigned sb_n_free_dnodes; /* free dnodes for statfs, or -1 */ 65 unsigned sb_n_free_dnodes; /* free dnodes for statfs, or -1 */
66 uid_t sb_uid; /* uid from mount options */ 66 kuid_t sb_uid; /* uid from mount options */
67 gid_t sb_gid; /* gid from mount options */ 67 kgid_t sb_gid; /* gid from mount options */
68 umode_t sb_mode; /* mode from mount options */ 68 umode_t sb_mode; /* mode from mount options */
69 unsigned sb_eas : 2; /* eas: 0-ignore, 1-ro, 2-rw */ 69 unsigned sb_eas : 2; /* eas: 0-ignore, 1-ro, 2-rw */
70 unsigned sb_err : 2; /* on errs: 0-cont, 1-ro, 2-panic */ 70 unsigned sb_err : 2; /* on errs: 0-cont, 1-ro, 2-panic */
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index ed671e0ea784..804a9a842cbc 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/user_namespace.h>
10#include "hpfs_fn.h" 11#include "hpfs_fn.h"
11 12
12void hpfs_init_inode(struct inode *i) 13void hpfs_init_inode(struct inode *i)
@@ -60,14 +61,14 @@ void hpfs_read_inode(struct inode *i)
60 if (hpfs_sb(i->i_sb)->sb_eas) { 61 if (hpfs_sb(i->i_sb)->sb_eas) {
61 if ((ea = hpfs_get_ea(i->i_sb, fnode, "UID", &ea_size))) { 62 if ((ea = hpfs_get_ea(i->i_sb, fnode, "UID", &ea_size))) {
62 if (ea_size == 2) { 63 if (ea_size == 2) {
63 i->i_uid = le16_to_cpu(*(__le16*)ea); 64 i_uid_write(i, le16_to_cpu(*(__le16*)ea));
64 hpfs_inode->i_ea_uid = 1; 65 hpfs_inode->i_ea_uid = 1;
65 } 66 }
66 kfree(ea); 67 kfree(ea);
67 } 68 }
68 if ((ea = hpfs_get_ea(i->i_sb, fnode, "GID", &ea_size))) { 69 if ((ea = hpfs_get_ea(i->i_sb, fnode, "GID", &ea_size))) {
69 if (ea_size == 2) { 70 if (ea_size == 2) {
70 i->i_gid = le16_to_cpu(*(__le16*)ea); 71 i_gid_write(i, le16_to_cpu(*(__le16*)ea));
71 hpfs_inode->i_ea_gid = 1; 72 hpfs_inode->i_ea_gid = 1;
72 } 73 }
73 kfree(ea); 74 kfree(ea);
@@ -149,13 +150,13 @@ static void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode)
149 hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 stuctures", i->i_ino); 150 hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 stuctures", i->i_ino);
150 } else*/ if (hpfs_sb(i->i_sb)->sb_eas >= 2) { 151 } else*/ if (hpfs_sb(i->i_sb)->sb_eas >= 2) {
151 __le32 ea; 152 __le32 ea;
152 if ((i->i_uid != hpfs_sb(i->i_sb)->sb_uid) || hpfs_inode->i_ea_uid) { 153 if (!uid_eq(i->i_uid, hpfs_sb(i->i_sb)->sb_uid) || hpfs_inode->i_ea_uid) {
153 ea = cpu_to_le32(i->i_uid); 154 ea = cpu_to_le32(i_uid_read(i));
154 hpfs_set_ea(i, fnode, "UID", (char*)&ea, 2); 155 hpfs_set_ea(i, fnode, "UID", (char*)&ea, 2);
155 hpfs_inode->i_ea_uid = 1; 156 hpfs_inode->i_ea_uid = 1;
156 } 157 }
157 if ((i->i_gid != hpfs_sb(i->i_sb)->sb_gid) || hpfs_inode->i_ea_gid) { 158 if (!gid_eq(i->i_gid, hpfs_sb(i->i_sb)->sb_gid) || hpfs_inode->i_ea_gid) {
158 ea = cpu_to_le32(i->i_gid); 159 ea = cpu_to_le32(i_gid_read(i));
159 hpfs_set_ea(i, fnode, "GID", (char *)&ea, 2); 160 hpfs_set_ea(i, fnode, "GID", (char *)&ea, 2);
160 hpfs_inode->i_ea_gid = 1; 161 hpfs_inode->i_ea_gid = 1;
161 } 162 }
@@ -261,9 +262,11 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
261 hpfs_lock(inode->i_sb); 262 hpfs_lock(inode->i_sb);
262 if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root) 263 if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root)
263 goto out_unlock; 264 goto out_unlock;
264 if ((attr->ia_valid & ATTR_UID) && attr->ia_uid >= 0x10000) 265 if ((attr->ia_valid & ATTR_UID) &&
266 from_kuid(&init_user_ns, attr->ia_uid) >= 0x10000)
265 goto out_unlock; 267 goto out_unlock;
266 if ((attr->ia_valid & ATTR_GID) && attr->ia_gid >= 0x10000) 268 if ((attr->ia_valid & ATTR_GID) &&
269 from_kgid(&init_user_ns, attr->ia_gid) >= 0x10000)
267 goto out_unlock; 270 goto out_unlock;
268 if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) 271 if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size)
269 goto out_unlock; 272 goto out_unlock;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index bc9082482f68..345713d2f8f3 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -91,8 +91,8 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
91 inc_nlink(dir); 91 inc_nlink(dir);
92 insert_inode_hash(result); 92 insert_inode_hash(result);
93 93
94 if (result->i_uid != current_fsuid() || 94 if (!uid_eq(result->i_uid, current_fsuid()) ||
95 result->i_gid != current_fsgid() || 95 !gid_eq(result->i_gid, current_fsgid()) ||
96 result->i_mode != (mode | S_IFDIR)) { 96 result->i_mode != (mode | S_IFDIR)) {
97 result->i_uid = current_fsuid(); 97 result->i_uid = current_fsuid();
98 result->i_gid = current_fsgid(); 98 result->i_gid = current_fsgid();
@@ -179,8 +179,8 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, b
179 179
180 insert_inode_hash(result); 180 insert_inode_hash(result);
181 181
182 if (result->i_uid != current_fsuid() || 182 if (!uid_eq(result->i_uid, current_fsuid()) ||
183 result->i_gid != current_fsgid() || 183 !gid_eq(result->i_gid, current_fsgid()) ||
184 result->i_mode != (mode | S_IFREG)) { 184 result->i_mode != (mode | S_IFREG)) {
185 result->i_uid = current_fsuid(); 185 result->i_uid = current_fsuid();
186 result->i_gid = current_fsgid(); 186 result->i_gid = current_fsgid();
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 706a12c083ea..a3076228523d 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -210,6 +210,11 @@ static int init_inodecache(void)
210 210
211static void destroy_inodecache(void) 211static void destroy_inodecache(void)
212{ 212{
213 /*
214 * Make sure all delayed rcu free inodes are flushed before we
215 * destroy cache.
216 */
217 rcu_barrier();
213 kmem_cache_destroy(hpfs_inode_cachep); 218 kmem_cache_destroy(hpfs_inode_cachep);
214} 219}
215 220
@@ -251,7 +256,7 @@ static const match_table_t tokens = {
251 {Opt_err, NULL}, 256 {Opt_err, NULL},
252}; 257};
253 258
254static int parse_opts(char *opts, uid_t *uid, gid_t *gid, umode_t *umask, 259static int parse_opts(char *opts, kuid_t *uid, kgid_t *gid, umode_t *umask,
255 int *lowercase, int *eas, int *chk, int *errs, 260 int *lowercase, int *eas, int *chk, int *errs,
256 int *chkdsk, int *timeshift) 261 int *chkdsk, int *timeshift)
257{ 262{
@@ -276,12 +281,16 @@ static int parse_opts(char *opts, uid_t *uid, gid_t *gid, umode_t *umask,
276 case Opt_uid: 281 case Opt_uid:
277 if (match_int(args, &option)) 282 if (match_int(args, &option))
278 return 0; 283 return 0;
279 *uid = option; 284 *uid = make_kuid(current_user_ns(), option);
285 if (!uid_valid(*uid))
286 return 0;
280 break; 287 break;
281 case Opt_gid: 288 case Opt_gid:
282 if (match_int(args, &option)) 289 if (match_int(args, &option))
283 return 0; 290 return 0;
284 *gid = option; 291 *gid = make_kgid(current_user_ns(), option);
292 if (!gid_valid(*gid))
293 return 0;
285 break; 294 break;
286 case Opt_umask: 295 case Opt_umask:
287 if (match_octal(args, &option)) 296 if (match_octal(args, &option))
@@ -378,8 +387,8 @@ HPFS filesystem options:\n\
378 387
379static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) 388static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
380{ 389{
381 uid_t uid; 390 kuid_t uid;
382 gid_t gid; 391 kgid_t gid;
383 umode_t umask; 392 umode_t umask;
384 int lowercase, eas, chk, errs, chkdsk, timeshift; 393 int lowercase, eas, chk, errs, chkdsk, timeshift;
385 int o; 394 int o;
@@ -389,7 +398,6 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
389 *flags |= MS_NOATIME; 398 *flags |= MS_NOATIME;
390 399
391 hpfs_lock(s); 400 hpfs_lock(s);
392 lock_super(s);
393 uid = sbi->sb_uid; gid = sbi->sb_gid; 401 uid = sbi->sb_uid; gid = sbi->sb_gid;
394 umask = 0777 & ~sbi->sb_mode; 402 umask = 0777 & ~sbi->sb_mode;
395 lowercase = sbi->sb_lowercase; 403 lowercase = sbi->sb_lowercase;
@@ -422,12 +430,10 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
422 430
423 replace_mount_options(s, new_opts); 431 replace_mount_options(s, new_opts);
424 432
425 unlock_super(s);
426 hpfs_unlock(s); 433 hpfs_unlock(s);
427 return 0; 434 return 0;
428 435
429out_err: 436out_err:
430 unlock_super(s);
431 hpfs_unlock(s); 437 hpfs_unlock(s);
432 kfree(new_opts); 438 kfree(new_opts);
433 return -EINVAL; 439 return -EINVAL;
@@ -455,8 +461,8 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
455 struct hpfs_sb_info *sbi; 461 struct hpfs_sb_info *sbi;
456 struct inode *root; 462 struct inode *root;
457 463
458 uid_t uid; 464 kuid_t uid;
459 gid_t gid; 465 kgid_t gid;
460 umode_t umask; 466 umode_t umask;
461 int lowercase, eas, chk, errs, chkdsk, timeshift; 467 int lowercase, eas, chk, errs, chkdsk, timeshift;
462 468
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index c1dffe47fde2..78f21f8dc2ec 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -18,7 +18,7 @@
18#include <linux/pid_namespace.h> 18#include <linux/pid_namespace.h>
19#include <linux/namei.h> 19#include <linux/namei.h>
20#include <asm/uaccess.h> 20#include <asm/uaccess.h>
21#include "os.h" 21#include <os.h>
22 22
23static struct inode *get_inode(struct super_block *, struct dentry *); 23static struct inode *get_inode(struct super_block *, struct dentry *);
24 24
@@ -674,7 +674,7 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
674 674
675 if (!inode) { 675 if (!inode) {
676 dput(dentry); 676 dput(dentry);
677 return ERR_PTR(-ENOMEM); 677 return NULL;
678 } 678 }
679 679
680 if (S_ISDIR(dentry->d_inode->i_mode)) { 680 if (S_ISDIR(dentry->d_inode->i_mode)) {
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8349a899912e..c5bc355d8243 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -42,8 +42,8 @@ static const struct inode_operations hugetlbfs_dir_inode_operations;
42static const struct inode_operations hugetlbfs_inode_operations; 42static const struct inode_operations hugetlbfs_inode_operations;
43 43
44struct hugetlbfs_config { 44struct hugetlbfs_config {
45 uid_t uid; 45 kuid_t uid;
46 gid_t gid; 46 kgid_t gid;
47 umode_t mode; 47 umode_t mode;
48 long nr_blocks; 48 long nr_blocks;
49 long nr_inodes; 49 long nr_inodes;
@@ -110,7 +110,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
110 * way when do_mmap_pgoff unwinds (may be important on powerpc 110 * way when do_mmap_pgoff unwinds (may be important on powerpc
111 * and ia64). 111 * and ia64).
112 */ 112 */
113 vma->vm_flags |= VM_HUGETLB | VM_RESERVED; 113 vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND | VM_DONTDUMP;
114 vma->vm_ops = &hugetlb_vm_ops; 114 vma->vm_ops = &hugetlb_vm_ops;
115 115
116 if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) 116 if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
@@ -397,17 +397,16 @@ static void hugetlbfs_evict_inode(struct inode *inode)
397} 397}
398 398
399static inline void 399static inline void
400hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) 400hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
401{ 401{
402 struct vm_area_struct *vma; 402 struct vm_area_struct *vma;
403 struct prio_tree_iter iter;
404 403
405 vma_prio_tree_foreach(vma, &iter, root, pgoff, ULONG_MAX) { 404 vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
406 unsigned long v_offset; 405 unsigned long v_offset;
407 406
408 /* 407 /*
409 * Can the expression below overflow on 32-bit arches? 408 * Can the expression below overflow on 32-bit arches?
410 * No, because the prio_tree returns us only those vmas 409 * No, because the interval tree returns us only those vmas
411 * which overlap the truncated area starting at pgoff, 410 * which overlap the truncated area starting at pgoff,
412 * and no vma on a 32-bit arch can span beyond the 4GB. 411 * and no vma on a 32-bit arch can span beyond the 4GB.
413 */ 412 */
@@ -432,7 +431,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
432 431
433 i_size_write(inode, offset); 432 i_size_write(inode, offset);
434 mutex_lock(&mapping->i_mmap_mutex); 433 mutex_lock(&mapping->i_mmap_mutex);
435 if (!prio_tree_empty(&mapping->i_mmap)) 434 if (!RB_EMPTY_ROOT(&mapping->i_mmap))
436 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); 435 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
437 mutex_unlock(&mapping->i_mmap_mutex); 436 mutex_unlock(&mapping->i_mmap_mutex);
438 truncate_hugepages(inode, offset); 437 truncate_hugepages(inode, offset);
@@ -785,13 +784,17 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
785 case Opt_uid: 784 case Opt_uid:
786 if (match_int(&args[0], &option)) 785 if (match_int(&args[0], &option))
787 goto bad_val; 786 goto bad_val;
788 pconfig->uid = option; 787 pconfig->uid = make_kuid(current_user_ns(), option);
788 if (!uid_valid(pconfig->uid))
789 goto bad_val;
789 break; 790 break;
790 791
791 case Opt_gid: 792 case Opt_gid:
792 if (match_int(&args[0], &option)) 793 if (match_int(&args[0], &option))
793 goto bad_val; 794 goto bad_val;
794 pconfig->gid = option; 795 pconfig->gid = make_kgid(current_user_ns(), option);
796 if (!gid_valid(pconfig->gid))
797 goto bad_val;
795 break; 798 break;
796 799
797 case Opt_mode: 800 case Opt_mode:
@@ -924,7 +927,9 @@ static struct vfsmount *hugetlbfs_vfsmount;
924 927
925static int can_do_hugetlb_shm(void) 928static int can_do_hugetlb_shm(void)
926{ 929{
927 return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); 930 kgid_t shm_group;
931 shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
932 return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
928} 933}
929 934
930struct file *hugetlb_file_setup(const char *name, unsigned long addr, 935struct file *hugetlb_file_setup(const char *name, unsigned long addr,
@@ -1042,6 +1047,11 @@ static int __init init_hugetlbfs_fs(void)
1042 1047
1043static void __exit exit_hugetlbfs_fs(void) 1048static void __exit exit_hugetlbfs_fs(void)
1044{ 1049{
1050 /*
1051 * Make sure all delayed rcu free inodes are flushed before we
1052 * destroy cache.
1053 */
1054 rcu_barrier();
1045 kmem_cache_destroy(hugetlbfs_inode_cachep); 1055 kmem_cache_destroy(hugetlbfs_inode_cachep);
1046 kern_unmount(hugetlbfs_vfsmount); 1056 kern_unmount(hugetlbfs_vfsmount);
1047 unregister_filesystem(&hugetlbfs_fs_type); 1057 unregister_filesystem(&hugetlbfs_fs_type);
diff --git a/fs/inode.c b/fs/inode.c
index ac8d904b3f16..b03c71957246 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -348,7 +348,7 @@ void address_space_init_once(struct address_space *mapping)
348 mutex_init(&mapping->i_mmap_mutex); 348 mutex_init(&mapping->i_mmap_mutex);
349 INIT_LIST_HEAD(&mapping->private_list); 349 INIT_LIST_HEAD(&mapping->private_list);
350 spin_lock_init(&mapping->private_lock); 350 spin_lock_init(&mapping->private_lock);
351 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); 351 mapping->i_mmap = RB_ROOT;
352 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); 352 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
353} 353}
354EXPORT_SYMBOL(address_space_init_once); 354EXPORT_SYMBOL(address_space_init_once);
diff --git a/fs/internal.h b/fs/internal.h
index 371bcc4b1697..916b7cbf3e3e 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -97,8 +97,8 @@ struct open_flags {
97 int acc_mode; 97 int acc_mode;
98 int intent; 98 int intent;
99}; 99};
100extern struct file *do_filp_open(int dfd, const char *pathname, 100extern struct file *do_filp_open(int dfd, struct filename *pathname,
101 const struct open_flags *op, int lookup_flags); 101 const struct open_flags *op, int flags);
102extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, 102extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
103 const char *, const struct open_flags *, int lookup_flags); 103 const char *, const struct open_flags *, int lookup_flags);
104 104
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 29167bebe874..3bdad6d1f268 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -603,21 +603,14 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
603 603
604SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) 604SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
605{ 605{
606 struct file *filp; 606 int error;
607 int error = -EBADF; 607 struct fd f = fdget(fd);
608 int fput_needed; 608
609 609 if (!f.file)
610 filp = fget_light(fd, &fput_needed); 610 return -EBADF;
611 if (!filp) 611 error = security_file_ioctl(f.file, cmd, arg);
612 goto out; 612 if (!error)
613 613 error = do_vfs_ioctl(f.file, fd, cmd, arg);
614 error = security_file_ioctl(filp, cmd, arg); 614 fdput(f);
615 if (error)
616 goto out_fput;
617
618 error = do_vfs_ioctl(filp, fd, cmd, arg);
619 out_fput:
620 fput_light(filp, fput_needed);
621 out:
622 return error; 615 return error;
623} 616}
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index 1d3804492aa7..2b4f2358eadb 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -175,7 +175,7 @@ static struct dentry *isofs_fh_to_parent(struct super_block *sb,
175{ 175{
176 struct isofs_fid *ifid = (struct isofs_fid *)fid; 176 struct isofs_fid *ifid = (struct isofs_fid *)fid;
177 177
178 if (fh_type != 2) 178 if (fh_len < 2 || fh_type != 2)
179 return NULL; 179 return NULL;
180 180
181 return isofs_export_iget(sb, 181 return isofs_export_iget(sb,
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 29037c365ba4..67ce52507d7d 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -21,6 +21,7 @@
21#include <linux/cdrom.h> 21#include <linux/cdrom.h>
22#include <linux/parser.h> 22#include <linux/parser.h>
23#include <linux/mpage.h> 23#include <linux/mpage.h>
24#include <linux/user_namespace.h>
24 25
25#include "isofs.h" 26#include "isofs.h"
26#include "zisofs.h" 27#include "zisofs.h"
@@ -114,6 +115,11 @@ static int init_inodecache(void)
114 115
115static void destroy_inodecache(void) 116static void destroy_inodecache(void)
116{ 117{
118 /*
119 * Make sure all delayed rcu free inodes are flushed before we
120 * destroy cache.
121 */
122 rcu_barrier();
117 kmem_cache_destroy(isofs_inode_cachep); 123 kmem_cache_destroy(isofs_inode_cachep);
118} 124}
119 125
@@ -171,8 +177,8 @@ struct iso9660_options{
171 unsigned int blocksize; 177 unsigned int blocksize;
172 umode_t fmode; 178 umode_t fmode;
173 umode_t dmode; 179 umode_t dmode;
174 gid_t gid; 180 kgid_t gid;
175 uid_t uid; 181 kuid_t uid;
176 char *iocharset; 182 char *iocharset;
177 /* LVE */ 183 /* LVE */
178 s32 session; 184 s32 session;
@@ -383,8 +389,8 @@ static int parse_options(char *options, struct iso9660_options *popt)
383 popt->fmode = popt->dmode = ISOFS_INVALID_MODE; 389 popt->fmode = popt->dmode = ISOFS_INVALID_MODE;
384 popt->uid_set = 0; 390 popt->uid_set = 0;
385 popt->gid_set = 0; 391 popt->gid_set = 0;
386 popt->gid = 0; 392 popt->gid = GLOBAL_ROOT_GID;
387 popt->uid = 0; 393 popt->uid = GLOBAL_ROOT_UID;
388 popt->iocharset = NULL; 394 popt->iocharset = NULL;
389 popt->utf8 = 0; 395 popt->utf8 = 0;
390 popt->overriderockperm = 0; 396 popt->overriderockperm = 0;
@@ -460,13 +466,17 @@ static int parse_options(char *options, struct iso9660_options *popt)
460 case Opt_uid: 466 case Opt_uid:
461 if (match_int(&args[0], &option)) 467 if (match_int(&args[0], &option))
462 return 0; 468 return 0;
463 popt->uid = option; 469 popt->uid = make_kuid(current_user_ns(), option);
470 if (!uid_valid(popt->uid))
471 return 0;
464 popt->uid_set = 1; 472 popt->uid_set = 1;
465 break; 473 break;
466 case Opt_gid: 474 case Opt_gid:
467 if (match_int(&args[0], &option)) 475 if (match_int(&args[0], &option))
468 return 0; 476 return 0;
469 popt->gid = option; 477 popt->gid = make_kgid(current_user_ns(), option);
478 if (!gid_valid(popt->gid))
479 return 0;
470 popt->gid_set = 1; 480 popt->gid_set = 1;
471 break; 481 break;
472 case Opt_mode: 482 case Opt_mode:
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 3620ad1ea9bc..99167238518d 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -52,8 +52,8 @@ struct isofs_sb_info {
52 52
53 umode_t s_fmode; 53 umode_t s_fmode;
54 umode_t s_dmode; 54 umode_t s_dmode;
55 gid_t s_gid; 55 kgid_t s_gid;
56 uid_t s_uid; 56 kuid_t s_uid;
57 struct nls_table *s_nls_iocharset; /* Native language support table */ 57 struct nls_table *s_nls_iocharset; /* Native language support table */
58}; 58};
59 59
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 70e79d0c756a..c0bf42472e40 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -364,8 +364,8 @@ repeat:
364 case SIG('P', 'X'): 364 case SIG('P', 'X'):
365 inode->i_mode = isonum_733(rr->u.PX.mode); 365 inode->i_mode = isonum_733(rr->u.PX.mode);
366 set_nlink(inode, isonum_733(rr->u.PX.n_links)); 366 set_nlink(inode, isonum_733(rr->u.PX.n_links));
367 inode->i_uid = isonum_733(rr->u.PX.uid); 367 i_uid_write(inode, isonum_733(rr->u.PX.uid));
368 inode->i_gid = isonum_733(rr->u.PX.gid); 368 i_gid_write(inode, isonum_733(rr->u.PX.gid));
369 break; 369 break;
370 case SIG('P', 'N'): 370 case SIG('P', 'N'):
371 { 371 {
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 52c15c776029..86b39b167c23 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -86,7 +86,12 @@ nope:
86static void release_data_buffer(struct buffer_head *bh) 86static void release_data_buffer(struct buffer_head *bh)
87{ 87{
88 if (buffer_freed(bh)) { 88 if (buffer_freed(bh)) {
89 WARN_ON_ONCE(buffer_dirty(bh));
89 clear_buffer_freed(bh); 90 clear_buffer_freed(bh);
91 clear_buffer_mapped(bh);
92 clear_buffer_new(bh);
93 clear_buffer_req(bh);
94 bh->b_bdev = NULL;
90 release_buffer_page(bh); 95 release_buffer_page(bh);
91 } else 96 } else
92 put_bh(bh); 97 put_bh(bh);
@@ -866,17 +871,35 @@ restart_loop:
866 * there's no point in keeping a checkpoint record for 871 * there's no point in keeping a checkpoint record for
867 * it. */ 872 * it. */
868 873
869 /* A buffer which has been freed while still being 874 /*
870 * journaled by a previous transaction may end up still 875 * A buffer which has been freed while still being journaled by
871 * being dirty here, but we want to avoid writing back 876 * a previous transaction.
872 * that buffer in the future after the "add to orphan" 877 */
873 * operation been committed, That's not only a performance 878 if (buffer_freed(bh)) {
874 * gain, it also stops aliasing problems if the buffer is 879 /*
875 * left behind for writeback and gets reallocated for another 880 * If the running transaction is the one containing
876 * use in a different page. */ 881 * "add to orphan" operation (b_next_transaction !=
877 if (buffer_freed(bh) && !jh->b_next_transaction) { 882 * NULL), we have to wait for that transaction to
878 clear_buffer_freed(bh); 883 * commit before we can really get rid of the buffer.
879 clear_buffer_jbddirty(bh); 884 * So just clear b_modified to not confuse transaction
885 * credit accounting and refile the buffer to
886 * BJ_Forget of the running transaction. If the just
887 * committed transaction contains "add to orphan"
888 * operation, we can completely invalidate the buffer
889 * now. We are rather throughout in that since the
890 * buffer may be still accessible when blocksize <
891 * pagesize and it is attached to the last partial
892 * page.
893 */
894 jh->b_modified = 0;
895 if (!jh->b_next_transaction) {
896 clear_buffer_freed(bh);
897 clear_buffer_jbddirty(bh);
898 clear_buffer_mapped(bh);
899 clear_buffer_new(bh);
900 clear_buffer_req(bh);
901 bh->b_bdev = NULL;
902 }
880 } 903 }
881 904
882 if (buffer_jbddirty(bh)) { 905 if (buffer_jbddirty(bh)) {
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index febc10db5ced..78b7f84241d4 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1843,15 +1843,16 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1843 * We're outside-transaction here. Either or both of j_running_transaction 1843 * We're outside-transaction here. Either or both of j_running_transaction
1844 * and j_committing_transaction may be NULL. 1844 * and j_committing_transaction may be NULL.
1845 */ 1845 */
1846static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) 1846static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
1847 int partial_page)
1847{ 1848{
1848 transaction_t *transaction; 1849 transaction_t *transaction;
1849 struct journal_head *jh; 1850 struct journal_head *jh;
1850 int may_free = 1; 1851 int may_free = 1;
1851 int ret;
1852 1852
1853 BUFFER_TRACE(bh, "entry"); 1853 BUFFER_TRACE(bh, "entry");
1854 1854
1855retry:
1855 /* 1856 /*
1856 * It is safe to proceed here without the j_list_lock because the 1857 * It is safe to proceed here without the j_list_lock because the
1857 * buffers cannot be stolen by try_to_free_buffers as long as we are 1858 * buffers cannot be stolen by try_to_free_buffers as long as we are
@@ -1879,10 +1880,18 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1879 * clear the buffer dirty bit at latest at the moment when the 1880 * clear the buffer dirty bit at latest at the moment when the
1880 * transaction marking the buffer as freed in the filesystem 1881 * transaction marking the buffer as freed in the filesystem
1881 * structures is committed because from that moment on the 1882 * structures is committed because from that moment on the
1882 * buffer can be reallocated and used by a different page. 1883 * block can be reallocated and used by a different page.
1883 * Since the block hasn't been freed yet but the inode has 1884 * Since the block hasn't been freed yet but the inode has
1884 * already been added to orphan list, it is safe for us to add 1885 * already been added to orphan list, it is safe for us to add
1885 * the buffer to BJ_Forget list of the newest transaction. 1886 * the buffer to BJ_Forget list of the newest transaction.
1887 *
1888 * Also we have to clear buffer_mapped flag of a truncated buffer
1889 * because the buffer_head may be attached to the page straddling
1890 * i_size (can happen only when blocksize < pagesize) and thus the
1891 * buffer_head can be reused when the file is extended again. So we end
1892 * up keeping around invalidated buffers attached to transactions'
1893 * BJ_Forget list just to stop checkpointing code from cleaning up
1894 * the transaction this buffer was modified in.
1886 */ 1895 */
1887 transaction = jh->b_transaction; 1896 transaction = jh->b_transaction;
1888 if (transaction == NULL) { 1897 if (transaction == NULL) {
@@ -1909,13 +1918,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1909 * committed, the buffer won't be needed any 1918 * committed, the buffer won't be needed any
1910 * longer. */ 1919 * longer. */
1911 JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); 1920 JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
1912 ret = __dispose_buffer(jh, 1921 may_free = __dispose_buffer(jh,
1913 journal->j_running_transaction); 1922 journal->j_running_transaction);
1914 journal_put_journal_head(jh); 1923 goto zap_buffer;
1915 spin_unlock(&journal->j_list_lock);
1916 jbd_unlock_bh_state(bh);
1917 spin_unlock(&journal->j_state_lock);
1918 return ret;
1919 } else { 1924 } else {
1920 /* There is no currently-running transaction. So the 1925 /* There is no currently-running transaction. So the
1921 * orphan record which we wrote for this file must have 1926 * orphan record which we wrote for this file must have
@@ -1923,13 +1928,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1923 * the committing transaction, if it exists. */ 1928 * the committing transaction, if it exists. */
1924 if (journal->j_committing_transaction) { 1929 if (journal->j_committing_transaction) {
1925 JBUFFER_TRACE(jh, "give to committing trans"); 1930 JBUFFER_TRACE(jh, "give to committing trans");
1926 ret = __dispose_buffer(jh, 1931 may_free = __dispose_buffer(jh,
1927 journal->j_committing_transaction); 1932 journal->j_committing_transaction);
1928 journal_put_journal_head(jh); 1933 goto zap_buffer;
1929 spin_unlock(&journal->j_list_lock);
1930 jbd_unlock_bh_state(bh);
1931 spin_unlock(&journal->j_state_lock);
1932 return ret;
1933 } else { 1934 } else {
1934 /* The orphan record's transaction has 1935 /* The orphan record's transaction has
1935 * committed. We can cleanse this buffer */ 1936 * committed. We can cleanse this buffer */
@@ -1950,10 +1951,24 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1950 } 1951 }
1951 /* 1952 /*
1952 * The buffer is committing, we simply cannot touch 1953 * The buffer is committing, we simply cannot touch
1953 * it. So we just set j_next_transaction to the 1954 * it. If the page is straddling i_size we have to wait
1954 * running transaction (if there is one) and mark 1955 * for commit and try again.
1955 * buffer as freed so that commit code knows it should 1956 */
1956 * clear dirty bits when it is done with the buffer. 1957 if (partial_page) {
1958 tid_t tid = journal->j_committing_transaction->t_tid;
1959
1960 journal_put_journal_head(jh);
1961 spin_unlock(&journal->j_list_lock);
1962 jbd_unlock_bh_state(bh);
1963 spin_unlock(&journal->j_state_lock);
1964 log_wait_commit(journal, tid);
1965 goto retry;
1966 }
1967 /*
1968 * OK, buffer won't be reachable after truncate. We just set
1969 * j_next_transaction to the running transaction (if there is
1970 * one) and mark buffer as freed so that commit code knows it
1971 * should clear dirty bits when it is done with the buffer.
1957 */ 1972 */
1958 set_buffer_freed(bh); 1973 set_buffer_freed(bh);
1959 if (journal->j_running_transaction && buffer_jbddirty(bh)) 1974 if (journal->j_running_transaction && buffer_jbddirty(bh))
@@ -1976,6 +1991,14 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1976 } 1991 }
1977 1992
1978zap_buffer: 1993zap_buffer:
1994 /*
1995 * This is tricky. Although the buffer is truncated, it may be reused
1996 * if blocksize < pagesize and it is attached to the page straddling
1997 * EOF. Since the buffer might have been added to BJ_Forget list of the
1998 * running transaction, journal_get_write_access() won't clear
1999 * b_modified and credit accounting gets confused. So clear b_modified
2000 * here. */
2001 jh->b_modified = 0;
1979 journal_put_journal_head(jh); 2002 journal_put_journal_head(jh);
1980zap_buffer_no_jh: 2003zap_buffer_no_jh:
1981 spin_unlock(&journal->j_list_lock); 2004 spin_unlock(&journal->j_list_lock);
@@ -2024,7 +2047,8 @@ void journal_invalidatepage(journal_t *journal,
2024 if (offset <= curr_off) { 2047 if (offset <= curr_off) {
2025 /* This block is wholly outside the truncation point */ 2048 /* This block is wholly outside the truncation point */
2026 lock_buffer(bh); 2049 lock_buffer(bh);
2027 may_free &= journal_unmap_buffer(journal, bh); 2050 may_free &= journal_unmap_buffer(journal, bh,
2051 offset > 0);
2028 unlock_buffer(bh); 2052 unlock_buffer(bh);
2029 } 2053 }
2030 curr_off = next_off; 2054 curr_off = next_off;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index af5280fb579b..3091d42992f0 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -1014,17 +1014,35 @@ restart_loop:
1014 * there's no point in keeping a checkpoint record for 1014 * there's no point in keeping a checkpoint record for
1015 * it. */ 1015 * it. */
1016 1016
1017 /* A buffer which has been freed while still being 1017 /*
1018 * journaled by a previous transaction may end up still 1018 * A buffer which has been freed while still being journaled by
1019 * being dirty here, but we want to avoid writing back 1019 * a previous transaction.
1020 * that buffer in the future after the "add to orphan" 1020 */
1021 * operation been committed, That's not only a performance 1021 if (buffer_freed(bh)) {
1022 * gain, it also stops aliasing problems if the buffer is 1022 /*
1023 * left behind for writeback and gets reallocated for another 1023 * If the running transaction is the one containing
1024 * use in a different page. */ 1024 * "add to orphan" operation (b_next_transaction !=
1025 if (buffer_freed(bh) && !jh->b_next_transaction) { 1025 * NULL), we have to wait for that transaction to
1026 clear_buffer_freed(bh); 1026 * commit before we can really get rid of the buffer.
1027 clear_buffer_jbddirty(bh); 1027 * So just clear b_modified to not confuse transaction
1028 * credit accounting and refile the buffer to
1029 * BJ_Forget of the running transaction. If the just
1030 * committed transaction contains "add to orphan"
1031 * operation, we can completely invalidate the buffer
1032 * now. We are rather through in that since the
1033 * buffer may be still accessible when blocksize <
1034 * pagesize and it is attached to the last partial
1035 * page.
1036 */
1037 jh->b_modified = 0;
1038 if (!jh->b_next_transaction) {
1039 clear_buffer_freed(bh);
1040 clear_buffer_jbddirty(bh);
1041 clear_buffer_mapped(bh);
1042 clear_buffer_new(bh);
1043 clear_buffer_req(bh);
1044 bh->b_bdev = NULL;
1045 }
1028 } 1046 }
1029 1047
1030 if (buffer_jbddirty(bh)) { 1048 if (buffer_jbddirty(bh)) {
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e149b99a7ffb..484b8d1c6cb6 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1354,6 +1354,11 @@ static void jbd2_mark_journal_empty(journal_t *journal)
1354 1354
1355 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); 1355 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
1356 read_lock(&journal->j_state_lock); 1356 read_lock(&journal->j_state_lock);
1357 /* Is it already empty? */
1358 if (sb->s_start == 0) {
1359 read_unlock(&journal->j_state_lock);
1360 return;
1361 }
1357 jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n", 1362 jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n",
1358 journal->j_tail_sequence); 1363 journal->j_tail_sequence);
1359 1364
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 0131e4362534..626846bac32f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -289,8 +289,11 @@ int jbd2_journal_recover(journal_t *journal)
289 if (!err) 289 if (!err)
290 err = err2; 290 err = err2;
291 /* Make sure all replayed data is on permanent storage */ 291 /* Make sure all replayed data is on permanent storage */
292 if (journal->j_flags & JBD2_BARRIER) 292 if (journal->j_flags & JBD2_BARRIER) {
293 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); 293 err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
294 if (!err)
295 err = err2;
296 }
294 return err; 297 return err;
295} 298}
296 299
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index fb1ab9533b67..a74ba4659549 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1841,15 +1841,16 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1841 * We're outside-transaction here. Either or both of j_running_transaction 1841 * We're outside-transaction here. Either or both of j_running_transaction
1842 * and j_committing_transaction may be NULL. 1842 * and j_committing_transaction may be NULL.
1843 */ 1843 */
1844static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) 1844static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
1845 int partial_page)
1845{ 1846{
1846 transaction_t *transaction; 1847 transaction_t *transaction;
1847 struct journal_head *jh; 1848 struct journal_head *jh;
1848 int may_free = 1; 1849 int may_free = 1;
1849 int ret;
1850 1850
1851 BUFFER_TRACE(bh, "entry"); 1851 BUFFER_TRACE(bh, "entry");
1852 1852
1853retry:
1853 /* 1854 /*
1854 * It is safe to proceed here without the j_list_lock because the 1855 * It is safe to proceed here without the j_list_lock because the
1855 * buffers cannot be stolen by try_to_free_buffers as long as we are 1856 * buffers cannot be stolen by try_to_free_buffers as long as we are
@@ -1878,10 +1879,18 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1878 * clear the buffer dirty bit at latest at the moment when the 1879 * clear the buffer dirty bit at latest at the moment when the
1879 * transaction marking the buffer as freed in the filesystem 1880 * transaction marking the buffer as freed in the filesystem
1880 * structures is committed because from that moment on the 1881 * structures is committed because from that moment on the
1881 * buffer can be reallocated and used by a different page. 1882 * block can be reallocated and used by a different page.
1882 * Since the block hasn't been freed yet but the inode has 1883 * Since the block hasn't been freed yet but the inode has
1883 * already been added to orphan list, it is safe for us to add 1884 * already been added to orphan list, it is safe for us to add
1884 * the buffer to BJ_Forget list of the newest transaction. 1885 * the buffer to BJ_Forget list of the newest transaction.
1886 *
1887 * Also we have to clear buffer_mapped flag of a truncated buffer
1888 * because the buffer_head may be attached to the page straddling
1889 * i_size (can happen only when blocksize < pagesize) and thus the
1890 * buffer_head can be reused when the file is extended again. So we end
1891 * up keeping around invalidated buffers attached to transactions'
1892 * BJ_Forget list just to stop checkpointing code from cleaning up
1893 * the transaction this buffer was modified in.
1885 */ 1894 */
1886 transaction = jh->b_transaction; 1895 transaction = jh->b_transaction;
1887 if (transaction == NULL) { 1896 if (transaction == NULL) {
@@ -1908,13 +1917,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1908 * committed, the buffer won't be needed any 1917 * committed, the buffer won't be needed any
1909 * longer. */ 1918 * longer. */
1910 JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); 1919 JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
1911 ret = __dispose_buffer(jh, 1920 may_free = __dispose_buffer(jh,
1912 journal->j_running_transaction); 1921 journal->j_running_transaction);
1913 jbd2_journal_put_journal_head(jh); 1922 goto zap_buffer;
1914 spin_unlock(&journal->j_list_lock);
1915 jbd_unlock_bh_state(bh);
1916 write_unlock(&journal->j_state_lock);
1917 return ret;
1918 } else { 1923 } else {
1919 /* There is no currently-running transaction. So the 1924 /* There is no currently-running transaction. So the
1920 * orphan record which we wrote for this file must have 1925 * orphan record which we wrote for this file must have
@@ -1922,13 +1927,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1922 * the committing transaction, if it exists. */ 1927 * the committing transaction, if it exists. */
1923 if (journal->j_committing_transaction) { 1928 if (journal->j_committing_transaction) {
1924 JBUFFER_TRACE(jh, "give to committing trans"); 1929 JBUFFER_TRACE(jh, "give to committing trans");
1925 ret = __dispose_buffer(jh, 1930 may_free = __dispose_buffer(jh,
1926 journal->j_committing_transaction); 1931 journal->j_committing_transaction);
1927 jbd2_journal_put_journal_head(jh); 1932 goto zap_buffer;
1928 spin_unlock(&journal->j_list_lock);
1929 jbd_unlock_bh_state(bh);
1930 write_unlock(&journal->j_state_lock);
1931 return ret;
1932 } else { 1933 } else {
1933 /* The orphan record's transaction has 1934 /* The orphan record's transaction has
1934 * committed. We can cleanse this buffer */ 1935 * committed. We can cleanse this buffer */
@@ -1940,10 +1941,24 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1940 JBUFFER_TRACE(jh, "on committing transaction"); 1941 JBUFFER_TRACE(jh, "on committing transaction");
1941 /* 1942 /*
1942 * The buffer is committing, we simply cannot touch 1943 * The buffer is committing, we simply cannot touch
1943 * it. So we just set j_next_transaction to the 1944 * it. If the page is straddling i_size we have to wait
1944 * running transaction (if there is one) and mark 1945 * for commit and try again.
1945 * buffer as freed so that commit code knows it should 1946 */
1946 * clear dirty bits when it is done with the buffer. 1947 if (partial_page) {
1948 tid_t tid = journal->j_committing_transaction->t_tid;
1949
1950 jbd2_journal_put_journal_head(jh);
1951 spin_unlock(&journal->j_list_lock);
1952 jbd_unlock_bh_state(bh);
1953 write_unlock(&journal->j_state_lock);
1954 jbd2_log_wait_commit(journal, tid);
1955 goto retry;
1956 }
1957 /*
1958 * OK, buffer won't be reachable after truncate. We just set
1959 * j_next_transaction to the running transaction (if there is
1960 * one) and mark buffer as freed so that commit code knows it
1961 * should clear dirty bits when it is done with the buffer.
1947 */ 1962 */
1948 set_buffer_freed(bh); 1963 set_buffer_freed(bh);
1949 if (journal->j_running_transaction && buffer_jbddirty(bh)) 1964 if (journal->j_running_transaction && buffer_jbddirty(bh))
@@ -1966,6 +1981,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1966 } 1981 }
1967 1982
1968zap_buffer: 1983zap_buffer:
1984 /*
1985 * This is tricky. Although the buffer is truncated, it may be reused
1986 * if blocksize < pagesize and it is attached to the page straddling
1987 * EOF. Since the buffer might have been added to BJ_Forget list of the
1988 * running transaction, journal_get_write_access() won't clear
1989 * b_modified and credit accounting gets confused. So clear b_modified
1990 * here.
1991 */
1992 jh->b_modified = 0;
1969 jbd2_journal_put_journal_head(jh); 1993 jbd2_journal_put_journal_head(jh);
1970zap_buffer_no_jh: 1994zap_buffer_no_jh:
1971 spin_unlock(&journal->j_list_lock); 1995 spin_unlock(&journal->j_list_lock);
@@ -2017,7 +2041,8 @@ void jbd2_journal_invalidatepage(journal_t *journal,
2017 if (offset <= curr_off) { 2041 if (offset <= curr_off) {
2018 /* This block is wholly outside the truncation point */ 2042 /* This block is wholly outside the truncation point */
2019 lock_buffer(bh); 2043 lock_buffer(bh);
2020 may_free &= journal_unmap_buffer(journal, bh); 2044 may_free &= journal_unmap_buffer(journal, bh,
2045 offset > 0);
2021 unlock_buffer(bh); 2046 unlock_buffer(bh);
2022 } 2047 }
2023 curr_off = next_off; 2048 curr_off = next_off;
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 922f146e4235..223283c30111 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -94,15 +94,23 @@ static struct posix_acl *jffs2_acl_from_medium(void *value, size_t size)
94 case ACL_MASK: 94 case ACL_MASK:
95 case ACL_OTHER: 95 case ACL_OTHER:
96 value += sizeof(struct jffs2_acl_entry_short); 96 value += sizeof(struct jffs2_acl_entry_short);
97 acl->a_entries[i].e_id = ACL_UNDEFINED_ID;
98 break; 97 break;
99 98
100 case ACL_USER: 99 case ACL_USER:
100 value += sizeof(struct jffs2_acl_entry);
101 if (value > end)
102 goto fail;
103 acl->a_entries[i].e_uid =
104 make_kuid(&init_user_ns,
105 je32_to_cpu(entry->e_id));
106 break;
101 case ACL_GROUP: 107 case ACL_GROUP:
102 value += sizeof(struct jffs2_acl_entry); 108 value += sizeof(struct jffs2_acl_entry);
103 if (value > end) 109 if (value > end)
104 goto fail; 110 goto fail;
105 acl->a_entries[i].e_id = je32_to_cpu(entry->e_id); 111 acl->a_entries[i].e_gid =
112 make_kgid(&init_user_ns,
113 je32_to_cpu(entry->e_id));
106 break; 114 break;
107 115
108 default: 116 default:
@@ -131,13 +139,19 @@ static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size)
131 header->a_version = cpu_to_je32(JFFS2_ACL_VERSION); 139 header->a_version = cpu_to_je32(JFFS2_ACL_VERSION);
132 e = header + 1; 140 e = header + 1;
133 for (i=0; i < acl->a_count; i++) { 141 for (i=0; i < acl->a_count; i++) {
142 const struct posix_acl_entry *acl_e = &acl->a_entries[i];
134 entry = e; 143 entry = e;
135 entry->e_tag = cpu_to_je16(acl->a_entries[i].e_tag); 144 entry->e_tag = cpu_to_je16(acl_e->e_tag);
136 entry->e_perm = cpu_to_je16(acl->a_entries[i].e_perm); 145 entry->e_perm = cpu_to_je16(acl_e->e_perm);
137 switch(acl->a_entries[i].e_tag) { 146 switch(acl_e->e_tag) {
138 case ACL_USER: 147 case ACL_USER:
148 entry->e_id = cpu_to_je32(
149 from_kuid(&init_user_ns, acl_e->e_uid));
150 e += sizeof(struct jffs2_acl_entry);
151 break;
139 case ACL_GROUP: 152 case ACL_GROUP:
140 entry->e_id = cpu_to_je32(acl->a_entries[i].e_id); 153 entry->e_id = cpu_to_je32(
154 from_kgid(&init_user_ns, acl_e->e_gid));
141 e += sizeof(struct jffs2_acl_entry); 155 e += sizeof(struct jffs2_acl_entry);
142 break; 156 break;
143 157
@@ -363,7 +377,7 @@ static int jffs2_acl_getxattr(struct dentry *dentry, const char *name,
363 return PTR_ERR(acl); 377 return PTR_ERR(acl);
364 if (!acl) 378 if (!acl)
365 return -ENODATA; 379 return -ENODATA;
366 rc = posix_acl_to_xattr(acl, buffer, size); 380 rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
367 posix_acl_release(acl); 381 posix_acl_release(acl);
368 382
369 return rc; 383 return rc;
@@ -381,7 +395,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
381 return -EPERM; 395 return -EPERM;
382 396
383 if (value) { 397 if (value) {
384 acl = posix_acl_from_xattr(value, size); 398 acl = posix_acl_from_xattr(&init_user_ns, value, size);
385 if (IS_ERR(acl)) 399 if (IS_ERR(acl))
386 return PTR_ERR(acl); 400 return PTR_ERR(acl);
387 if (acl) { 401 if (acl) {
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index db3889ba8818..60ef3fb707ff 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -175,8 +175,8 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
175 ri.ino = cpu_to_je32(f->inocache->ino); 175 ri.ino = cpu_to_je32(f->inocache->ino);
176 ri.version = cpu_to_je32(++f->highest_version); 176 ri.version = cpu_to_je32(++f->highest_version);
177 ri.mode = cpu_to_jemode(inode->i_mode); 177 ri.mode = cpu_to_jemode(inode->i_mode);
178 ri.uid = cpu_to_je16(inode->i_uid); 178 ri.uid = cpu_to_je16(i_uid_read(inode));
179 ri.gid = cpu_to_je16(inode->i_gid); 179 ri.gid = cpu_to_je16(i_gid_read(inode));
180 ri.isize = cpu_to_je32(max((uint32_t)inode->i_size, pageofs)); 180 ri.isize = cpu_to_je32(max((uint32_t)inode->i_size, pageofs));
181 ri.atime = ri.ctime = ri.mtime = cpu_to_je32(get_seconds()); 181 ri.atime = ri.ctime = ri.mtime = cpu_to_je32(get_seconds());
182 ri.offset = cpu_to_je32(inode->i_size); 182 ri.offset = cpu_to_je32(inode->i_size);
@@ -283,8 +283,8 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
283 /* Set the fields that the generic jffs2_write_inode_range() code can't find */ 283 /* Set the fields that the generic jffs2_write_inode_range() code can't find */
284 ri->ino = cpu_to_je32(inode->i_ino); 284 ri->ino = cpu_to_je32(inode->i_ino);
285 ri->mode = cpu_to_jemode(inode->i_mode); 285 ri->mode = cpu_to_jemode(inode->i_mode);
286 ri->uid = cpu_to_je16(inode->i_uid); 286 ri->uid = cpu_to_je16(i_uid_read(inode));
287 ri->gid = cpu_to_je16(inode->i_gid); 287 ri->gid = cpu_to_je16(i_gid_read(inode));
288 ri->isize = cpu_to_je32((uint32_t)inode->i_size); 288 ri->isize = cpu_to_je32((uint32_t)inode->i_size);
289 ri->atime = ri->ctime = ri->mtime = cpu_to_je32(get_seconds()); 289 ri->atime = ri->ctime = ri->mtime = cpu_to_je32(get_seconds());
290 290
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 3d3092eda811..fe3c0527545f 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -99,8 +99,10 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
99 ri->ino = cpu_to_je32(inode->i_ino); 99 ri->ino = cpu_to_je32(inode->i_ino);
100 ri->version = cpu_to_je32(++f->highest_version); 100 ri->version = cpu_to_je32(++f->highest_version);
101 101
102 ri->uid = cpu_to_je16((ivalid & ATTR_UID)?iattr->ia_uid:inode->i_uid); 102 ri->uid = cpu_to_je16((ivalid & ATTR_UID)?
103 ri->gid = cpu_to_je16((ivalid & ATTR_GID)?iattr->ia_gid:inode->i_gid); 103 from_kuid(&init_user_ns, iattr->ia_uid):i_uid_read(inode));
104 ri->gid = cpu_to_je16((ivalid & ATTR_GID)?
105 from_kgid(&init_user_ns, iattr->ia_gid):i_gid_read(inode));
104 106
105 if (ivalid & ATTR_MODE) 107 if (ivalid & ATTR_MODE)
106 ri->mode = cpu_to_jemode(iattr->ia_mode); 108 ri->mode = cpu_to_jemode(iattr->ia_mode);
@@ -147,8 +149,8 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
147 inode->i_ctime = ITIME(je32_to_cpu(ri->ctime)); 149 inode->i_ctime = ITIME(je32_to_cpu(ri->ctime));
148 inode->i_mtime = ITIME(je32_to_cpu(ri->mtime)); 150 inode->i_mtime = ITIME(je32_to_cpu(ri->mtime));
149 inode->i_mode = jemode_to_cpu(ri->mode); 151 inode->i_mode = jemode_to_cpu(ri->mode);
150 inode->i_uid = je16_to_cpu(ri->uid); 152 i_uid_write(inode, je16_to_cpu(ri->uid));
151 inode->i_gid = je16_to_cpu(ri->gid); 153 i_gid_write(inode, je16_to_cpu(ri->gid));
152 154
153 155
154 old_metadata = f->metadata; 156 old_metadata = f->metadata;
@@ -276,8 +278,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
276 return ERR_PTR(ret); 278 return ERR_PTR(ret);
277 } 279 }
278 inode->i_mode = jemode_to_cpu(latest_node.mode); 280 inode->i_mode = jemode_to_cpu(latest_node.mode);
279 inode->i_uid = je16_to_cpu(latest_node.uid); 281 i_uid_write(inode, je16_to_cpu(latest_node.uid));
280 inode->i_gid = je16_to_cpu(latest_node.gid); 282 i_gid_write(inode, je16_to_cpu(latest_node.gid));
281 inode->i_size = je32_to_cpu(latest_node.isize); 283 inode->i_size = je32_to_cpu(latest_node.isize);
282 inode->i_atime = ITIME(je32_to_cpu(latest_node.atime)); 284 inode->i_atime = ITIME(je32_to_cpu(latest_node.atime));
283 inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime)); 285 inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime));
@@ -440,14 +442,14 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
440 442
441 memset(ri, 0, sizeof(*ri)); 443 memset(ri, 0, sizeof(*ri));
442 /* Set OS-specific defaults for new inodes */ 444 /* Set OS-specific defaults for new inodes */
443 ri->uid = cpu_to_je16(current_fsuid()); 445 ri->uid = cpu_to_je16(from_kuid(&init_user_ns, current_fsuid()));
444 446
445 if (dir_i->i_mode & S_ISGID) { 447 if (dir_i->i_mode & S_ISGID) {
446 ri->gid = cpu_to_je16(dir_i->i_gid); 448 ri->gid = cpu_to_je16(i_gid_read(dir_i));
447 if (S_ISDIR(mode)) 449 if (S_ISDIR(mode))
448 mode |= S_ISGID; 450 mode |= S_ISGID;
449 } else { 451 } else {
450 ri->gid = cpu_to_je16(current_fsgid()); 452 ri->gid = cpu_to_je16(from_kgid(&init_user_ns, current_fsgid()));
451 } 453 }
452 454
453 /* POSIX ACLs have to be processed now, at least partly. 455 /* POSIX ACLs have to be processed now, at least partly.
@@ -467,8 +469,8 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
467 set_nlink(inode, 1); 469 set_nlink(inode, 1);
468 inode->i_ino = je32_to_cpu(ri->ino); 470 inode->i_ino = je32_to_cpu(ri->ino);
469 inode->i_mode = jemode_to_cpu(ri->mode); 471 inode->i_mode = jemode_to_cpu(ri->mode);
470 inode->i_gid = je16_to_cpu(ri->gid); 472 i_gid_write(inode, je16_to_cpu(ri->gid));
471 inode->i_uid = je16_to_cpu(ri->uid); 473 i_uid_write(inode, je16_to_cpu(ri->uid));
472 inode->i_atime = inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; 474 inode->i_atime = inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
473 ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime)); 475 ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime));
474 476
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index bcd983d7e7f9..d200a9b8fd5e 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -27,8 +27,8 @@ struct kvec;
27 27
28#define JFFS2_F_I_SIZE(f) (OFNI_EDONI_2SFFJ(f)->i_size) 28#define JFFS2_F_I_SIZE(f) (OFNI_EDONI_2SFFJ(f)->i_size)
29#define JFFS2_F_I_MODE(f) (OFNI_EDONI_2SFFJ(f)->i_mode) 29#define JFFS2_F_I_MODE(f) (OFNI_EDONI_2SFFJ(f)->i_mode)
30#define JFFS2_F_I_UID(f) (OFNI_EDONI_2SFFJ(f)->i_uid) 30#define JFFS2_F_I_UID(f) (i_uid_read(OFNI_EDONI_2SFFJ(f)))
31#define JFFS2_F_I_GID(f) (OFNI_EDONI_2SFFJ(f)->i_gid) 31#define JFFS2_F_I_GID(f) (i_gid_read(OFNI_EDONI_2SFFJ(f)))
32#define JFFS2_F_I_RDEV(f) (OFNI_EDONI_2SFFJ(f)->i_rdev) 32#define JFFS2_F_I_RDEV(f) (OFNI_EDONI_2SFFJ(f)->i_rdev)
33 33
34#define ITIME(sec) ((struct timespec){sec, 0}) 34#define ITIME(sec) ((struct timespec){sec, 0})
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 1ea349fff68b..ae81b01e6fd7 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -394,8 +394,11 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
394} 394}
395 395
396/* Trivial function to remove the last node in the tree. Which by definition 396/* Trivial function to remove the last node in the tree. Which by definition
397 has no right-hand -- so can be removed just by making its only child (if 397 has no right-hand child — so can be removed just by making its left-hand
398 any) take its place under its parent. */ 398 child (if any) take its place under its parent. Since this is only done
399 when we're consuming the whole tree, there's no need to use rb_erase()
400 and let it worry about adjusting colours and balancing the tree. That
401 would just be a waste of time. */
399static void eat_last(struct rb_root *root, struct rb_node *node) 402static void eat_last(struct rb_root *root, struct rb_node *node)
400{ 403{
401 struct rb_node *parent = rb_parent(node); 404 struct rb_node *parent = rb_parent(node);
@@ -412,12 +415,12 @@ static void eat_last(struct rb_root *root, struct rb_node *node)
412 link = &parent->rb_right; 415 link = &parent->rb_right;
413 416
414 *link = node->rb_left; 417 *link = node->rb_left;
415 /* Colour doesn't matter now. Only the parent pointer. */
416 if (node->rb_left) 418 if (node->rb_left)
417 node->rb_left->rb_parent_color = node->rb_parent_color; 419 node->rb_left->__rb_parent_color = node->__rb_parent_color;
418} 420}
419 421
420/* We put this in reverse order, so we can just use eat_last */ 422/* We put the version tree in reverse order, so we can use the same eat_last()
423 function that we use to consume the tmpnode tree (tn_root). */
421static void ver_insert(struct rb_root *ver_root, struct jffs2_tmp_dnode_info *tn) 424static void ver_insert(struct rb_root *ver_root, struct jffs2_tmp_dnode_info *tn)
422{ 425{
423 struct rb_node **link = &ver_root->rb_node; 426 struct rb_node **link = &ver_root->rb_node;
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 61ea41389f90..d3d8799e2187 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -100,6 +100,10 @@ static int jffs2_sync_fs(struct super_block *sb, int wait)
100{ 100{
101 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); 101 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
102 102
103#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
104 cancel_delayed_work_sync(&c->wbuf_dwork);
105#endif
106
103 mutex_lock(&c->alloc_sem); 107 mutex_lock(&c->alloc_sem);
104 jffs2_flush_wbuf_pad(c); 108 jffs2_flush_wbuf_pad(c);
105 mutex_unlock(&c->alloc_sem); 109 mutex_unlock(&c->alloc_sem);
@@ -418,6 +422,12 @@ static void __exit exit_jffs2_fs(void)
418 unregister_filesystem(&jffs2_fs_type); 422 unregister_filesystem(&jffs2_fs_type);
419 jffs2_destroy_slab_caches(); 423 jffs2_destroy_slab_caches();
420 jffs2_compressors_exit(); 424 jffs2_compressors_exit();
425
426 /*
427 * Make sure all delayed rcu free inodes are flushed before we
428 * destroy cache.
429 */
430 rcu_barrier();
421 kmem_cache_destroy(jffs2_inode_cachep); 431 kmem_cache_destroy(jffs2_inode_cachep);
422} 432}
423 433
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 6f4529d3697f..a6597d60d76d 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1044,10 +1044,10 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
1044 ops.datbuf = NULL; 1044 ops.datbuf = NULL;
1045 1045
1046 ret = mtd_read_oob(c->mtd, jeb->offset, &ops); 1046 ret = mtd_read_oob(c->mtd, jeb->offset, &ops);
1047 if (ret || ops.oobretlen != ops.ooblen) { 1047 if ((ret && !mtd_is_bitflip(ret)) || ops.oobretlen != ops.ooblen) {
1048 pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n", 1048 pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n",
1049 jeb->offset, ops.ooblen, ops.oobretlen, ret); 1049 jeb->offset, ops.ooblen, ops.oobretlen, ret);
1050 if (!ret) 1050 if (!ret || mtd_is_bitflip(ret))
1051 ret = -EIO; 1051 ret = -EIO;
1052 return ret; 1052 return ret;
1053 } 1053 }
@@ -1086,10 +1086,10 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
1086 ops.datbuf = NULL; 1086 ops.datbuf = NULL;
1087 1087
1088 ret = mtd_read_oob(c->mtd, jeb->offset, &ops); 1088 ret = mtd_read_oob(c->mtd, jeb->offset, &ops);
1089 if (ret || ops.oobretlen != ops.ooblen) { 1089 if ((ret && !mtd_is_bitflip(ret)) || ops.oobretlen != ops.ooblen) {
1090 pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n", 1090 pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n",
1091 jeb->offset, ops.ooblen, ops.oobretlen, ret); 1091 jeb->offset, ops.ooblen, ops.oobretlen, ret);
1092 if (!ret) 1092 if (!ret || mtd_is_bitflip(ret))
1093 ret = -EIO; 1093 ret = -EIO;
1094 return ret; 1094 return ret;
1095 } 1095 }
diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile
index a58fa72d7e59..d20d4737b3ef 100644
--- a/fs/jfs/Makefile
+++ b/fs/jfs/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_JFS_FS) += jfs.o
6 6
7jfs-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \ 7jfs-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \
8 jfs_xtree.o jfs_imap.o jfs_debug.o jfs_dmap.o \ 8 jfs_xtree.o jfs_imap.o jfs_debug.o jfs_dmap.o \
9 jfs_unicode.o jfs_dtree.o jfs_inode.o \ 9 jfs_unicode.o jfs_dtree.o jfs_inode.o jfs_discard.o \
10 jfs_extent.o symlink.o jfs_metapage.o \ 10 jfs_extent.o symlink.o jfs_metapage.o \
11 jfs_logmgr.o jfs_txnmgr.o jfs_uniupr.o \ 11 jfs_logmgr.o jfs_txnmgr.o jfs_uniupr.o \
12 resize.o xattr.o ioctl.o 12 resize.o xattr.o ioctl.o
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 45559dc3ea2f..d254d6d35995 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -64,7 +64,7 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type)
64 else 64 else
65 acl = ERR_PTR(size); 65 acl = ERR_PTR(size);
66 } else { 66 } else {
67 acl = posix_acl_from_xattr(value, size); 67 acl = posix_acl_from_xattr(&init_user_ns, value, size);
68 } 68 }
69 kfree(value); 69 kfree(value);
70 if (!IS_ERR(acl)) 70 if (!IS_ERR(acl))
@@ -100,7 +100,7 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type,
100 value = kmalloc(size, GFP_KERNEL); 100 value = kmalloc(size, GFP_KERNEL);
101 if (!value) 101 if (!value)
102 return -ENOMEM; 102 return -ENOMEM;
103 rc = posix_acl_to_xattr(acl, value, size); 103 rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
104 if (rc < 0) 104 if (rc < 0)
105 goto out; 105 goto out;
106 } 106 }
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 844f9460cb11..9d3afd157f99 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -108,8 +108,8 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
108 108
109 if (is_quota_modification(inode, iattr)) 109 if (is_quota_modification(inode, iattr))
110 dquot_initialize(inode); 110 dquot_initialize(inode);
111 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || 111 if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
112 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { 112 (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
113 rc = dquot_transfer(inode, iattr); 113 rc = dquot_transfer(inode, iattr);
114 if (rc) 114 if (rc)
115 return rc; 115 return rc;
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index f19d1e04a374..bc555ff417e9 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -11,13 +11,17 @@
11#include <linux/mount.h> 11#include <linux/mount.h>
12#include <linux/time.h> 12#include <linux/time.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/blkdev.h>
14#include <asm/current.h> 15#include <asm/current.h>
15#include <asm/uaccess.h> 16#include <asm/uaccess.h>
16 17
18#include "jfs_filsys.h"
19#include "jfs_debug.h"
17#include "jfs_incore.h" 20#include "jfs_incore.h"
18#include "jfs_dinode.h" 21#include "jfs_dinode.h"
19#include "jfs_inode.h" 22#include "jfs_inode.h"
20 23#include "jfs_dmap.h"
24#include "jfs_discard.h"
21 25
22static struct { 26static struct {
23 long jfs_flag; 27 long jfs_flag;
@@ -123,6 +127,40 @@ setflags_out:
123 mnt_drop_write_file(filp); 127 mnt_drop_write_file(filp);
124 return err; 128 return err;
125 } 129 }
130
131 case FITRIM:
132 {
133 struct super_block *sb = inode->i_sb;
134 struct request_queue *q = bdev_get_queue(sb->s_bdev);
135 struct fstrim_range range;
136 s64 ret = 0;
137
138 if (!capable(CAP_SYS_ADMIN))
139 return -EPERM;
140
141 if (!blk_queue_discard(q)) {
142 jfs_warn("FITRIM not supported on device");
143 return -EOPNOTSUPP;
144 }
145
146 if (copy_from_user(&range, (struct fstrim_range __user *)arg,
147 sizeof(range)))
148 return -EFAULT;
149
150 range.minlen = max_t(unsigned int, range.minlen,
151 q->limits.discard_granularity);
152
153 ret = jfs_ioc_trim(inode, &range);
154 if (ret < 0)
155 return ret;
156
157 if (copy_to_user((struct fstrim_range __user *)arg, &range,
158 sizeof(range)))
159 return -EFAULT;
160
161 return 0;
162 }
163
126 default: 164 default:
127 return -ENOTTY; 165 return -ENOTTY;
128 } 166 }
@@ -142,6 +180,9 @@ long jfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
142 case JFS_IOC_SETFLAGS32: 180 case JFS_IOC_SETFLAGS32:
143 cmd = JFS_IOC_SETFLAGS; 181 cmd = JFS_IOC_SETFLAGS;
144 break; 182 break;
183 case FITRIM:
184 cmd = FITRIM;
185 break;
145 } 186 }
146 return jfs_ioctl(filp, cmd, arg); 187 return jfs_ioctl(filp, cmd, arg);
147} 188}
diff --git a/fs/jfs/jfs_discard.c b/fs/jfs/jfs_discard.c
new file mode 100644
index 000000000000..9947563e4175
--- /dev/null
+++ b/fs/jfs/jfs_discard.c
@@ -0,0 +1,117 @@
1/*
2 * Copyright (C) Tino Reichardt, 2012
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19#include <linux/fs.h>
20#include <linux/slab.h>
21#include <linux/blkdev.h>
22
23#include "jfs_incore.h"
24#include "jfs_superblock.h"
25#include "jfs_discard.h"
26#include "jfs_dmap.h"
27#include "jfs_debug.h"
28
29
30/*
31 * NAME: jfs_issue_discard()
32 *
33 * FUNCTION: TRIM the specified block range on device, if supported
34 *
35 * PARAMETERS:
36 * ip - pointer to in-core inode
37 * blkno - starting block number to be trimmed (0..N)
38 * nblocks - number of blocks to be trimmed
39 *
40 * RETURN VALUES:
41 * none
42 *
43 * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
44 */
45void jfs_issue_discard(struct inode *ip, u64 blkno, u64 nblocks)
46{
47 struct super_block *sb = ip->i_sb;
48 int r = 0;
49
50 r = sb_issue_discard(sb, blkno, nblocks, GFP_NOFS, 0);
51 if (unlikely(r != 0)) {
52 jfs_err("JFS: sb_issue_discard" \
53 "(%p, %llu, %llu, GFP_NOFS, 0) = %d => failed!\n",
54 sb, (unsigned long long)blkno,
55 (unsigned long long)nblocks, r);
56 }
57
58 jfs_info("JFS: sb_issue_discard" \
59 "(%p, %llu, %llu, GFP_NOFS, 0) = %d\n",
60 sb, (unsigned long long)blkno,
61 (unsigned long long)nblocks, r);
62
63 return;
64}
65
66/*
67 * NAME: jfs_ioc_trim()
68 *
69 * FUNCTION: attempt to discard (TRIM) all free blocks from the
70 * filesystem.
71 *
72 * PARAMETERS:
73 * ip - pointer to in-core inode;
74 * range - the range, given by user space
75 *
76 * RETURN VALUES:
77 * 0 - success
78 * -EIO - i/o error
79 */
80int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range)
81{
82 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
83 struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
84 struct super_block *sb = ipbmap->i_sb;
85 int agno, agno_end;
86 s64 start, end, minlen;
87 u64 trimmed = 0;
88
89 /**
90 * convert byte values to block size of filesystem:
91 * start: First Byte to trim
92 * len: number of Bytes to trim from start
93 * minlen: minimum extent length in Bytes
94 */
95 start = range->start >> sb->s_blocksize_bits;
96 if (start < 0)
97 start = 0;
98 end = start + (range->len >> sb->s_blocksize_bits) - 1;
99 if (end >= bmp->db_mapsize)
100 end = bmp->db_mapsize - 1;
101 minlen = range->minlen >> sb->s_blocksize_bits;
102 if (minlen <= 0)
103 minlen = 1;
104
105 /**
106 * we trim all ag's within the range
107 */
108 agno = BLKTOAG(start, JFS_SBI(ip->i_sb));
109 agno_end = BLKTOAG(end, JFS_SBI(ip->i_sb));
110 while (agno <= agno_end) {
111 trimmed += dbDiscardAG(ip, agno, minlen);
112 agno++;
113 }
114 range->len = trimmed << sb->s_blocksize_bits;
115
116 return 0;
117}
diff --git a/fs/jfs/jfs_discard.h b/fs/jfs/jfs_discard.h
new file mode 100644
index 000000000000..40d1ee6081a0
--- /dev/null
+++ b/fs/jfs/jfs_discard.h
@@ -0,0 +1,26 @@
1/*
2 * Copyright (C) Tino Reichardt, 2012
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18#ifndef _H_JFS_DISCARD
19#define _H_JFS_DISCARD
20
21struct fstrim_range;
22
23extern void jfs_issue_discard(struct inode *ip, u64 blkno, u64 nblocks);
24extern int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range);
25
26#endif /* _H_JFS_DISCARD */
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 9cbd11a3f804..9a55f53be5ff 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004 2 * Copyright (C) International Business Machines Corp., 2000-2004
3 * Portions Copyright (C) Tino Reichardt, 2012
3 * 4 *
4 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
@@ -25,6 +26,7 @@
25#include "jfs_lock.h" 26#include "jfs_lock.h"
26#include "jfs_metapage.h" 27#include "jfs_metapage.h"
27#include "jfs_debug.h" 28#include "jfs_debug.h"
29#include "jfs_discard.h"
28 30
29/* 31/*
30 * SERIALIZATION of the Block Allocation Map. 32 * SERIALIZATION of the Block Allocation Map.
@@ -104,7 +106,6 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
104static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, 106static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
105 int nblocks); 107 int nblocks);
106static int dbMaxBud(u8 * cp); 108static int dbMaxBud(u8 * cp);
107s64 dbMapFileSizeToMapSize(struct inode *ipbmap);
108static int blkstol2(s64 nb); 109static int blkstol2(s64 nb);
109 110
110static int cntlz(u32 value); 111static int cntlz(u32 value);
@@ -145,7 +146,6 @@ static const s8 budtab[256] = {
145 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1 146 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1
146}; 147};
147 148
148
149/* 149/*
150 * NAME: dbMount() 150 * NAME: dbMount()
151 * 151 *
@@ -310,7 +310,6 @@ int dbSync(struct inode *ipbmap)
310 return (0); 310 return (0);
311} 311}
312 312
313
314/* 313/*
315 * NAME: dbFree() 314 * NAME: dbFree()
316 * 315 *
@@ -337,6 +336,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
337 s64 lblkno, rem; 336 s64 lblkno, rem;
338 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; 337 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
339 struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; 338 struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
339 struct super_block *sb = ipbmap->i_sb;
340 340
341 IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); 341 IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
342 342
@@ -351,6 +351,13 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
351 return -EIO; 351 return -EIO;
352 } 352 }
353 353
354 /**
355 * TRIM the blocks, when mounted with discard option
356 */
357 if (JFS_SBI(sb)->flag & JFS_DISCARD)
358 if (JFS_SBI(sb)->minblks_trim <= nblocks)
359 jfs_issue_discard(ipbmap, blkno, nblocks);
360
354 /* 361 /*
355 * free the blocks a dmap at a time. 362 * free the blocks a dmap at a time.
356 */ 363 */
@@ -1095,7 +1102,6 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
1095 /* we were not successful */ 1102 /* we were not successful */
1096 release_metapage(mp); 1103 release_metapage(mp);
1097 1104
1098
1099 return (rc); 1105 return (rc);
1100} 1106}
1101 1107
@@ -1590,6 +1596,118 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
1590 1596
1591 1597
1592/* 1598/*
1599 * NAME: dbDiscardAG()
1600 *
1601 * FUNCTION: attempt to discard (TRIM) all free blocks of specific AG
1602 *
1603 * algorithm:
1604 * 1) allocate blocks, as large as possible and save them
1605 * while holding IWRITE_LOCK on ipbmap
1606 * 2) trim all these saved block/length values
1607 * 3) mark the blocks free again
1608 *
1609 * benefit:
1610 * - we work only on one ag at some time, minimizing how long we
1611 * need to lock ipbmap
1612 * - reading / writing the fs is possible most time, even on
1613 * trimming
1614 *
1615 * downside:
1616 * - we write two times to the dmapctl and dmap pages
1617 * - but for me, this seems the best way, better ideas?
1618 * /TR 2012
1619 *
1620 * PARAMETERS:
1621 * ip - pointer to in-core inode
1622 * agno - ag to trim
1623 * minlen - minimum value of contiguous blocks
1624 *
1625 * RETURN VALUES:
1626 * s64 - actual number of blocks trimmed
1627 */
1628s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen)
1629{
1630 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
1631 struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
1632 s64 nblocks, blkno;
1633 u64 trimmed = 0;
1634 int rc, l2nb;
1635 struct super_block *sb = ipbmap->i_sb;
1636
1637 struct range2trim {
1638 u64 blkno;
1639 u64 nblocks;
1640 } *totrim, *tt;
1641
1642 /* max blkno / nblocks pairs to trim */
1643 int count = 0, range_cnt;
1644 u64 max_ranges;
1645
1646 /* prevent others from writing new stuff here, while trimming */
1647 IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
1648
1649 nblocks = bmp->db_agfree[agno];
1650 max_ranges = nblocks;
1651 do_div(max_ranges, minlen);
1652 range_cnt = min_t(u64, max_ranges + 1, 32 * 1024);
1653 totrim = kmalloc(sizeof(struct range2trim) * range_cnt, GFP_NOFS);
1654 if (totrim == NULL) {
1655 jfs_error(bmp->db_ipbmap->i_sb,
1656 "dbDiscardAG: no memory for trim array");
1657 IWRITE_UNLOCK(ipbmap);
1658 return 0;
1659 }
1660
1661 tt = totrim;
1662 while (nblocks >= minlen) {
1663 l2nb = BLKSTOL2(nblocks);
1664
1665 /* 0 = okay, -EIO = fatal, -ENOSPC -> try smaller block */
1666 rc = dbAllocAG(bmp, agno, nblocks, l2nb, &blkno);
1667 if (rc == 0) {
1668 tt->blkno = blkno;
1669 tt->nblocks = nblocks;
1670 tt++; count++;
1671
1672 /* the whole ag is free, trim now */
1673 if (bmp->db_agfree[agno] == 0)
1674 break;
1675
1676 /* give a hint for the next while */
1677 nblocks = bmp->db_agfree[agno];
1678 continue;
1679 } else if (rc == -ENOSPC) {
1680 /* search for next smaller log2 block */
1681 l2nb = BLKSTOL2(nblocks) - 1;
1682 nblocks = 1 << l2nb;
1683 } else {
1684 /* Trim any already allocated blocks */
1685 jfs_error(bmp->db_ipbmap->i_sb,
1686 "dbDiscardAG: -EIO");
1687 break;
1688 }
1689
1690 /* check, if our trim array is full */
1691 if (unlikely(count >= range_cnt - 1))
1692 break;
1693 }
1694 IWRITE_UNLOCK(ipbmap);
1695
1696 tt->nblocks = 0; /* mark the current end */
1697 for (tt = totrim; tt->nblocks != 0; tt++) {
1698 /* when mounted with online discard, dbFree() will
1699 * call jfs_issue_discard() itself */
1700 if (!(JFS_SBI(sb)->flag & JFS_DISCARD))
1701 jfs_issue_discard(ip, tt->blkno, tt->nblocks);
1702 dbFree(ip, tt->blkno, tt->nblocks);
1703 trimmed += tt->nblocks;
1704 }
1705 kfree(totrim);
1706
1707 return trimmed;
1708}
1709
1710/*
1593 * NAME: dbFindCtl() 1711 * NAME: dbFindCtl()
1594 * 1712 *
1595 * FUNCTION: starting at a specified dmap control page level and block 1713 * FUNCTION: starting at a specified dmap control page level and block
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
index 6dcb906c55d8..562b9a7e4311 100644
--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -311,4 +311,6 @@ extern int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks);
311extern int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks); 311extern int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks);
312extern void dbFinalizeBmap(struct inode *ipbmap); 312extern void dbFinalizeBmap(struct inode *ipbmap);
313extern s64 dbMapFileSizeToMapSize(struct inode *ipbmap); 313extern s64 dbMapFileSizeToMapSize(struct inode *ipbmap);
314extern s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen);
315
314#endif /* _H_JFS_DMAP */ 316#endif /* _H_JFS_DMAP */
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
index b3f5463fbe52..b67d64671bb4 100644
--- a/fs/jfs/jfs_filsys.h
+++ b/fs/jfs/jfs_filsys.h
@@ -45,6 +45,9 @@
45/* mount time flag to disable journaling to disk */ 45/* mount time flag to disable journaling to disk */
46#define JFS_NOINTEGRITY 0x00000040 46#define JFS_NOINTEGRITY 0x00000040
47 47
48/* mount time flag to enable TRIM to ssd disks */
49#define JFS_DISCARD 0x00000080
50
48/* commit option */ 51/* commit option */
49#define JFS_COMMIT 0x00000f00 /* commit option mask */ 52#define JFS_COMMIT 0x00000f00 /* commit option mask */
50#define JFS_GROUPCOMMIT 0x00000100 /* group (of 1) commit */ 53#define JFS_GROUPCOMMIT 0x00000100 /* group (of 1) commit */
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 1b6f15f191b3..6ba4006e011b 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -3078,15 +3078,15 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip)
3078 } 3078 }
3079 set_nlink(ip, le32_to_cpu(dip->di_nlink)); 3079 set_nlink(ip, le32_to_cpu(dip->di_nlink));
3080 3080
3081 jfs_ip->saved_uid = le32_to_cpu(dip->di_uid); 3081 jfs_ip->saved_uid = make_kuid(&init_user_ns, le32_to_cpu(dip->di_uid));
3082 if (sbi->uid == -1) 3082 if (!uid_valid(sbi->uid))
3083 ip->i_uid = jfs_ip->saved_uid; 3083 ip->i_uid = jfs_ip->saved_uid;
3084 else { 3084 else {
3085 ip->i_uid = sbi->uid; 3085 ip->i_uid = sbi->uid;
3086 } 3086 }
3087 3087
3088 jfs_ip->saved_gid = le32_to_cpu(dip->di_gid); 3088 jfs_ip->saved_gid = make_kgid(&init_user_ns, le32_to_cpu(dip->di_gid));
3089 if (sbi->gid == -1) 3089 if (!gid_valid(sbi->gid))
3090 ip->i_gid = jfs_ip->saved_gid; 3090 ip->i_gid = jfs_ip->saved_gid;
3091 else { 3091 else {
3092 ip->i_gid = sbi->gid; 3092 ip->i_gid = sbi->gid;
@@ -3150,14 +3150,16 @@ static void copy_to_dinode(struct dinode * dip, struct inode *ip)
3150 dip->di_size = cpu_to_le64(ip->i_size); 3150 dip->di_size = cpu_to_le64(ip->i_size);
3151 dip->di_nblocks = cpu_to_le64(PBLK2LBLK(ip->i_sb, ip->i_blocks)); 3151 dip->di_nblocks = cpu_to_le64(PBLK2LBLK(ip->i_sb, ip->i_blocks));
3152 dip->di_nlink = cpu_to_le32(ip->i_nlink); 3152 dip->di_nlink = cpu_to_le32(ip->i_nlink);
3153 if (sbi->uid == -1) 3153 if (!uid_valid(sbi->uid))
3154 dip->di_uid = cpu_to_le32(ip->i_uid); 3154 dip->di_uid = cpu_to_le32(i_uid_read(ip));
3155 else 3155 else
3156 dip->di_uid = cpu_to_le32(jfs_ip->saved_uid); 3156 dip->di_uid =cpu_to_le32(from_kuid(&init_user_ns,
3157 if (sbi->gid == -1) 3157 jfs_ip->saved_uid));
3158 dip->di_gid = cpu_to_le32(ip->i_gid); 3158 if (!gid_valid(sbi->gid))
3159 dip->di_gid = cpu_to_le32(i_gid_read(ip));
3159 else 3160 else
3160 dip->di_gid = cpu_to_le32(jfs_ip->saved_gid); 3161 dip->di_gid = cpu_to_le32(from_kgid(&init_user_ns,
3162 jfs_ip->saved_gid));
3161 jfs_get_inode_flags(jfs_ip); 3163 jfs_get_inode_flags(jfs_ip);
3162 /* 3164 /*
3163 * mode2 is only needed for storing the higher order bits. 3165 * mode2 is only needed for storing the higher order bits.
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index 584a4a1a6e81..cf47f09e8ac8 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -38,8 +38,8 @@
38struct jfs_inode_info { 38struct jfs_inode_info {
39 int fileset; /* fileset number (always 16)*/ 39 int fileset; /* fileset number (always 16)*/
40 uint mode2; /* jfs-specific mode */ 40 uint mode2; /* jfs-specific mode */
41 uint saved_uid; /* saved for uid mount option */ 41 kuid_t saved_uid; /* saved for uid mount option */
42 uint saved_gid; /* saved for gid mount option */ 42 kgid_t saved_gid; /* saved for gid mount option */
43 pxd_t ixpxd; /* inode extent descriptor */ 43 pxd_t ixpxd; /* inode extent descriptor */
44 dxd_t acl; /* dxd describing acl */ 44 dxd_t acl; /* dxd describing acl */
45 dxd_t ea; /* dxd describing ea */ 45 dxd_t ea; /* dxd describing ea */
@@ -192,9 +192,10 @@ struct jfs_sb_info {
192 uint state; /* mount/recovery state */ 192 uint state; /* mount/recovery state */
193 unsigned long flag; /* mount time flags */ 193 unsigned long flag; /* mount time flags */
194 uint p_state; /* state prior to going no integrity */ 194 uint p_state; /* state prior to going no integrity */
195 uint uid; /* uid to override on-disk uid */ 195 kuid_t uid; /* uid to override on-disk uid */
196 uint gid; /* gid to override on-disk gid */ 196 kgid_t gid; /* gid to override on-disk gid */
197 uint umask; /* umask to override on-disk umask */ 197 uint umask; /* umask to override on-disk umask */
198 uint minblks_trim; /* minimum blocks, for online trim */
198}; 199};
199 200
200/* jfs_sb_info commit_state */ 201/* jfs_sb_info commit_state */
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index bb8b661bcc50..5fcc02eaa64c 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -2977,12 +2977,9 @@ int jfs_sync(void *arg)
2977 * put back on the anon_list. 2977 * put back on the anon_list.
2978 */ 2978 */
2979 2979
2980 /* Take off anon_list */ 2980 /* Move from anon_list to anon_list2 */
2981 list_del(&jfs_ip->anon_inode_list); 2981 list_move(&jfs_ip->anon_inode_list,
2982 2982 &TxAnchor.anon_list2);
2983 /* Put on anon_list2 */
2984 list_add(&jfs_ip->anon_inode_list,
2985 &TxAnchor.anon_list2);
2986 2983
2987 TXN_UNLOCK(); 2984 TXN_UNLOCK();
2988 iput(ip); 2985 iput(ip);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index c55c7452d285..1a543be09c79 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -33,6 +33,7 @@
33#include <linux/slab.h> 33#include <linux/slab.h>
34#include <asm/uaccess.h> 34#include <asm/uaccess.h>
35#include <linux/seq_file.h> 35#include <linux/seq_file.h>
36#include <linux/blkdev.h>
36 37
37#include "jfs_incore.h" 38#include "jfs_incore.h"
38#include "jfs_filsys.h" 39#include "jfs_filsys.h"
@@ -100,7 +101,7 @@ void jfs_error(struct super_block *sb, const char * function, ...)
100 vsnprintf(error_buf, sizeof(error_buf), function, args); 101 vsnprintf(error_buf, sizeof(error_buf), function, args);
101 va_end(args); 102 va_end(args);
102 103
103 printk(KERN_ERR "ERROR: (device %s): %s\n", sb->s_id, error_buf); 104 pr_err("ERROR: (device %s): %s\n", sb->s_id, error_buf);
104 105
105 jfs_handle_error(sb); 106 jfs_handle_error(sb);
106} 107}
@@ -197,7 +198,8 @@ static void jfs_put_super(struct super_block *sb)
197enum { 198enum {
198 Opt_integrity, Opt_nointegrity, Opt_iocharset, Opt_resize, 199 Opt_integrity, Opt_nointegrity, Opt_iocharset, Opt_resize,
199 Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err, Opt_quota, 200 Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err, Opt_quota,
200 Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask 201 Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask,
202 Opt_discard, Opt_nodiscard, Opt_discard_minblk
201}; 203};
202 204
203static const match_table_t tokens = { 205static const match_table_t tokens = {
@@ -214,6 +216,9 @@ static const match_table_t tokens = {
214 {Opt_uid, "uid=%u"}, 216 {Opt_uid, "uid=%u"},
215 {Opt_gid, "gid=%u"}, 217 {Opt_gid, "gid=%u"},
216 {Opt_umask, "umask=%u"}, 218 {Opt_umask, "umask=%u"},
219 {Opt_discard, "discard"},
220 {Opt_nodiscard, "nodiscard"},
221 {Opt_discard_minblk, "discard=%u"},
217 {Opt_err, NULL} 222 {Opt_err, NULL}
218}; 223};
219 224
@@ -255,8 +260,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
255 else { 260 else {
256 nls_map = load_nls(args[0].from); 261 nls_map = load_nls(args[0].from);
257 if (!nls_map) { 262 if (!nls_map) {
258 printk(KERN_ERR 263 pr_err("JFS: charset not found\n");
259 "JFS: charset not found\n");
260 goto cleanup; 264 goto cleanup;
261 } 265 }
262 } 266 }
@@ -272,8 +276,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
272 *newLVSize = sb->s_bdev->bd_inode->i_size >> 276 *newLVSize = sb->s_bdev->bd_inode->i_size >>
273 sb->s_blocksize_bits; 277 sb->s_blocksize_bits;
274 if (*newLVSize == 0) 278 if (*newLVSize == 0)
275 printk(KERN_ERR 279 pr_err("JFS: Cannot determine volume size\n");
276 "JFS: Cannot determine volume size\n");
277 break; 280 break;
278 } 281 }
279 case Opt_errors: 282 case Opt_errors:
@@ -294,8 +297,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
294 *flag &= ~JFS_ERR_REMOUNT_RO; 297 *flag &= ~JFS_ERR_REMOUNT_RO;
295 *flag |= JFS_ERR_PANIC; 298 *flag |= JFS_ERR_PANIC;
296 } else { 299 } else {
297 printk(KERN_ERR 300 pr_err("JFS: %s is an invalid error handler\n",
298 "JFS: %s is an invalid error handler\n",
299 errors); 301 errors);
300 goto cleanup; 302 goto cleanup;
301 } 303 }
@@ -314,33 +316,76 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
314 case Opt_usrquota: 316 case Opt_usrquota:
315 case Opt_grpquota: 317 case Opt_grpquota:
316 case Opt_quota: 318 case Opt_quota:
317 printk(KERN_ERR 319 pr_err("JFS: quota operations not supported\n");
318 "JFS: quota operations not supported\n");
319 break; 320 break;
320#endif 321#endif
321 case Opt_uid: 322 case Opt_uid:
322 { 323 {
323 char *uid = args[0].from; 324 char *uid = args[0].from;
324 sbi->uid = simple_strtoul(uid, &uid, 0); 325 uid_t val = simple_strtoul(uid, &uid, 0);
326 sbi->uid = make_kuid(current_user_ns(), val);
327 if (!uid_valid(sbi->uid))
328 goto cleanup;
325 break; 329 break;
326 } 330 }
331
327 case Opt_gid: 332 case Opt_gid:
328 { 333 {
329 char *gid = args[0].from; 334 char *gid = args[0].from;
330 sbi->gid = simple_strtoul(gid, &gid, 0); 335 gid_t val = simple_strtoul(gid, &gid, 0);
336 sbi->gid = make_kgid(current_user_ns(), val);
337 if (!gid_valid(sbi->gid))
338 goto cleanup;
331 break; 339 break;
332 } 340 }
341
333 case Opt_umask: 342 case Opt_umask:
334 { 343 {
335 char *umask = args[0].from; 344 char *umask = args[0].from;
336 sbi->umask = simple_strtoul(umask, &umask, 8); 345 sbi->umask = simple_strtoul(umask, &umask, 8);
337 if (sbi->umask & ~0777) { 346 if (sbi->umask & ~0777) {
338 printk(KERN_ERR 347 pr_err("JFS: Invalid value of umask\n");
339 "JFS: Invalid value of umask\n");
340 goto cleanup; 348 goto cleanup;
341 } 349 }
342 break; 350 break;
343 } 351 }
352
353 case Opt_discard:
354 {
355 struct request_queue *q = bdev_get_queue(sb->s_bdev);
356 /* if set to 1, even copying files will cause
357 * trimming :O
358 * -> user has more control over the online trimming
359 */
360 sbi->minblks_trim = 64;
361 if (blk_queue_discard(q)) {
362 *flag |= JFS_DISCARD;
363 } else {
364 pr_err("JFS: discard option " \
365 "not supported on device\n");
366 }
367 break;
368 }
369
370 case Opt_nodiscard:
371 *flag &= ~JFS_DISCARD;
372 break;
373
374 case Opt_discard_minblk:
375 {
376 struct request_queue *q = bdev_get_queue(sb->s_bdev);
377 char *minblks_trim = args[0].from;
378 if (blk_queue_discard(q)) {
379 *flag |= JFS_DISCARD;
380 sbi->minblks_trim = simple_strtoull(
381 minblks_trim, &minblks_trim, 0);
382 } else {
383 pr_err("JFS: discard option " \
384 "not supported on device\n");
385 }
386 break;
387 }
388
344 default: 389 default:
345 printk("jfs: Unrecognized mount option \"%s\" " 390 printk("jfs: Unrecognized mount option \"%s\" "
346 " or missing value\n", p); 391 " or missing value\n", p);
@@ -374,8 +419,8 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
374 419
375 if (newLVSize) { 420 if (newLVSize) {
376 if (sb->s_flags & MS_RDONLY) { 421 if (sb->s_flags & MS_RDONLY) {
377 printk(KERN_ERR 422 pr_err("JFS: resize requires volume" \
378 "JFS: resize requires volume to be mounted read-write\n"); 423 " to be mounted read-write\n");
379 return -EROFS; 424 return -EROFS;
380 } 425 }
381 rc = jfs_extendfs(sb, newLVSize, 0); 426 rc = jfs_extendfs(sb, newLVSize, 0);
@@ -443,7 +488,9 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
443 sb->s_fs_info = sbi; 488 sb->s_fs_info = sbi;
444 sb->s_max_links = JFS_LINK_MAX; 489 sb->s_max_links = JFS_LINK_MAX;
445 sbi->sb = sb; 490 sbi->sb = sb;
446 sbi->uid = sbi->gid = sbi->umask = -1; 491 sbi->uid = INVALID_UID;
492 sbi->gid = INVALID_GID;
493 sbi->umask = -1;
447 494
448 /* initialize the mount flag and determine the default error handler */ 495 /* initialize the mount flag and determine the default error handler */
449 flag = JFS_ERR_REMOUNT_RO; 496 flag = JFS_ERR_REMOUNT_RO;
@@ -457,7 +504,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
457#endif 504#endif
458 505
459 if (newLVSize) { 506 if (newLVSize) {
460 printk(KERN_ERR "resize option for remount only\n"); 507 pr_err("resize option for remount only\n");
461 goto out_kfree; 508 goto out_kfree;
462 } 509 }
463 510
@@ -617,14 +664,16 @@ static int jfs_show_options(struct seq_file *seq, struct dentry *root)
617{ 664{
618 struct jfs_sb_info *sbi = JFS_SBI(root->d_sb); 665 struct jfs_sb_info *sbi = JFS_SBI(root->d_sb);
619 666
620 if (sbi->uid != -1) 667 if (uid_valid(sbi->uid))
621 seq_printf(seq, ",uid=%d", sbi->uid); 668 seq_printf(seq, ",uid=%d", from_kuid(&init_user_ns, sbi->uid));
622 if (sbi->gid != -1) 669 if (gid_valid(sbi->gid))
623 seq_printf(seq, ",gid=%d", sbi->gid); 670 seq_printf(seq, ",gid=%d", from_kgid(&init_user_ns, sbi->gid));
624 if (sbi->umask != -1) 671 if (sbi->umask != -1)
625 seq_printf(seq, ",umask=%03o", sbi->umask); 672 seq_printf(seq, ",umask=%03o", sbi->umask);
626 if (sbi->flag & JFS_NOINTEGRITY) 673 if (sbi->flag & JFS_NOINTEGRITY)
627 seq_puts(seq, ",nointegrity"); 674 seq_puts(seq, ",nointegrity");
675 if (sbi->flag & JFS_DISCARD)
676 seq_printf(seq, ",discard=%u", sbi->minblks_trim);
628 if (sbi->nls_tab) 677 if (sbi->nls_tab)
629 seq_printf(seq, ",iocharset=%s", sbi->nls_tab->charset); 678 seq_printf(seq, ",iocharset=%s", sbi->nls_tab->charset);
630 if (sbi->flag & JFS_ERR_CONTINUE) 679 if (sbi->flag & JFS_ERR_CONTINUE)
@@ -903,6 +952,12 @@ static void __exit exit_jfs_fs(void)
903 jfs_proc_clean(); 952 jfs_proc_clean();
904#endif 953#endif
905 unregister_filesystem(&jfs_fs_type); 954 unregister_filesystem(&jfs_fs_type);
955
956 /*
957 * Make sure all delayed rcu free inodes are flushed before we
958 * destroy cache.
959 */
960 rcu_barrier();
906 kmem_cache_destroy(jfs_inode_cachep); 961 kmem_cache_destroy(jfs_inode_cachep);
907} 962}
908 963
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 26683e15b3ac..42d67f9757bf 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -685,7 +685,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
685 * POSIX_ACL_XATTR_ACCESS is tied to i_mode 685 * POSIX_ACL_XATTR_ACCESS is tied to i_mode
686 */ 686 */
687 if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) { 687 if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) {
688 acl = posix_acl_from_xattr(value, value_len); 688 acl = posix_acl_from_xattr(&init_user_ns, value, value_len);
689 if (IS_ERR(acl)) { 689 if (IS_ERR(acl)) {
690 rc = PTR_ERR(acl); 690 rc = PTR_ERR(acl);
691 printk(KERN_ERR "posix_acl_from_xattr returned %d\n", 691 printk(KERN_ERR "posix_acl_from_xattr returned %d\n",
@@ -710,7 +710,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
710 710
711 return 0; 711 return 0;
712 } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) { 712 } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) {
713 acl = posix_acl_from_xattr(value, value_len); 713 acl = posix_acl_from_xattr(&init_user_ns, value, value_len);
714 if (IS_ERR(acl)) { 714 if (IS_ERR(acl)) {
715 rc = PTR_ERR(acl); 715 rc = PTR_ERR(acl);
716 printk(KERN_ERR "posix_acl_from_xattr returned %d\n", 716 printk(KERN_ERR "posix_acl_from_xattr returned %d\n",
diff --git a/fs/libfs.c b/fs/libfs.c
index a74cb1725ac6..7cc37ca19cd8 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -874,7 +874,7 @@ struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid,
874EXPORT_SYMBOL_GPL(generic_fh_to_dentry); 874EXPORT_SYMBOL_GPL(generic_fh_to_dentry);
875 875
876/** 876/**
877 * generic_fh_to_dentry - generic helper for the fh_to_parent export operation 877 * generic_fh_to_parent - generic helper for the fh_to_parent export operation
878 * @sb: filesystem to do the file handle conversion on 878 * @sb: filesystem to do the file handle conversion on
879 * @fid: file handle to convert 879 * @fid: file handle to convert
880 * @fh_len: length of the file handle in bytes 880 * @fh_len: length of the file handle in bytes
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 7ef14b3c5bee..e4fb3ba5a58a 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -7,7 +7,6 @@
7 */ 7 */
8 8
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/utsname.h>
11#include <linux/kernel.h> 10#include <linux/kernel.h>
12#include <linux/ktime.h> 11#include <linux/ktime.h>
13#include <linux/slab.h> 12#include <linux/slab.h>
@@ -19,6 +18,8 @@
19 18
20#include <asm/unaligned.h> 19#include <asm/unaligned.h>
21 20
21#include "netns.h"
22
22#define NLMDBG_FACILITY NLMDBG_MONITOR 23#define NLMDBG_FACILITY NLMDBG_MONITOR
23#define NSM_PROGRAM 100024 24#define NSM_PROGRAM 100024
24#define NSM_VERSION 1 25#define NSM_VERSION 1
@@ -40,6 +41,7 @@ struct nsm_args {
40 u32 proc; 41 u32 proc;
41 42
42 char *mon_name; 43 char *mon_name;
44 char *nodename;
43}; 45};
44 46
45struct nsm_res { 47struct nsm_res {
@@ -70,7 +72,7 @@ static struct rpc_clnt *nsm_create(struct net *net)
70 }; 72 };
71 struct rpc_create_args args = { 73 struct rpc_create_args args = {
72 .net = net, 74 .net = net,
73 .protocol = XPRT_TRANSPORT_UDP, 75 .protocol = XPRT_TRANSPORT_TCP,
74 .address = (struct sockaddr *)&sin, 76 .address = (struct sockaddr *)&sin,
75 .addrsize = sizeof(sin), 77 .addrsize = sizeof(sin),
76 .servername = "rpc.statd", 78 .servername = "rpc.statd",
@@ -83,10 +85,54 @@ static struct rpc_clnt *nsm_create(struct net *net)
83 return rpc_create(&args); 85 return rpc_create(&args);
84} 86}
85 87
86static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, 88static struct rpc_clnt *nsm_client_get(struct net *net)
87 struct net *net)
88{ 89{
90 static DEFINE_MUTEX(nsm_create_mutex);
89 struct rpc_clnt *clnt; 91 struct rpc_clnt *clnt;
92 struct lockd_net *ln = net_generic(net, lockd_net_id);
93
94 spin_lock(&ln->nsm_clnt_lock);
95 if (ln->nsm_users) {
96 ln->nsm_users++;
97 clnt = ln->nsm_clnt;
98 spin_unlock(&ln->nsm_clnt_lock);
99 goto out;
100 }
101 spin_unlock(&ln->nsm_clnt_lock);
102
103 mutex_lock(&nsm_create_mutex);
104 clnt = nsm_create(net);
105 if (!IS_ERR(clnt)) {
106 ln->nsm_clnt = clnt;
107 smp_wmb();
108 ln->nsm_users = 1;
109 }
110 mutex_unlock(&nsm_create_mutex);
111out:
112 return clnt;
113}
114
115static void nsm_client_put(struct net *net)
116{
117 struct lockd_net *ln = net_generic(net, lockd_net_id);
118 struct rpc_clnt *clnt = ln->nsm_clnt;
119 int shutdown = 0;
120
121 spin_lock(&ln->nsm_clnt_lock);
122 if (ln->nsm_users) {
123 if (--ln->nsm_users)
124 ln->nsm_clnt = NULL;
125 shutdown = !ln->nsm_users;
126 }
127 spin_unlock(&ln->nsm_clnt_lock);
128
129 if (shutdown)
130 rpc_shutdown_client(clnt);
131}
132
133static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
134 struct rpc_clnt *clnt)
135{
90 int status; 136 int status;
91 struct nsm_args args = { 137 struct nsm_args args = {
92 .priv = &nsm->sm_priv, 138 .priv = &nsm->sm_priv,
@@ -94,31 +140,24 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
94 .vers = 3, 140 .vers = 3,
95 .proc = NLMPROC_NSM_NOTIFY, 141 .proc = NLMPROC_NSM_NOTIFY,
96 .mon_name = nsm->sm_mon_name, 142 .mon_name = nsm->sm_mon_name,
143 .nodename = clnt->cl_nodename,
97 }; 144 };
98 struct rpc_message msg = { 145 struct rpc_message msg = {
99 .rpc_argp = &args, 146 .rpc_argp = &args,
100 .rpc_resp = res, 147 .rpc_resp = res,
101 }; 148 };
102 149
103 clnt = nsm_create(net); 150 BUG_ON(clnt == NULL);
104 if (IS_ERR(clnt)) {
105 status = PTR_ERR(clnt);
106 dprintk("lockd: failed to create NSM upcall transport, "
107 "status=%d\n", status);
108 goto out;
109 }
110 151
111 memset(res, 0, sizeof(*res)); 152 memset(res, 0, sizeof(*res));
112 153
113 msg.rpc_proc = &clnt->cl_procinfo[proc]; 154 msg.rpc_proc = &clnt->cl_procinfo[proc];
114 status = rpc_call_sync(clnt, &msg, 0); 155 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
115 if (status < 0) 156 if (status < 0)
116 dprintk("lockd: NSM upcall RPC failed, status=%d\n", 157 dprintk("lockd: NSM upcall RPC failed, status=%d\n",
117 status); 158 status);
118 else 159 else
119 status = 0; 160 status = 0;
120 rpc_shutdown_client(clnt);
121 out:
122 return status; 161 return status;
123} 162}
124 163
@@ -138,6 +177,7 @@ int nsm_monitor(const struct nlm_host *host)
138 struct nsm_handle *nsm = host->h_nsmhandle; 177 struct nsm_handle *nsm = host->h_nsmhandle;
139 struct nsm_res res; 178 struct nsm_res res;
140 int status; 179 int status;
180 struct rpc_clnt *clnt;
141 181
142 dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name); 182 dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
143 183
@@ -150,7 +190,15 @@ int nsm_monitor(const struct nlm_host *host)
150 */ 190 */
151 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; 191 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
152 192
153 status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, host->net); 193 clnt = nsm_client_get(host->net);
194 if (IS_ERR(clnt)) {
195 status = PTR_ERR(clnt);
196 dprintk("lockd: failed to create NSM upcall transport, "
197 "status=%d, net=%p\n", status, host->net);
198 return status;
199 }
200
201 status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, clnt);
154 if (unlikely(res.status != 0)) 202 if (unlikely(res.status != 0))
155 status = -EIO; 203 status = -EIO;
156 if (unlikely(status < 0)) { 204 if (unlikely(status < 0)) {
@@ -182,9 +230,11 @@ void nsm_unmonitor(const struct nlm_host *host)
182 230
183 if (atomic_read(&nsm->sm_count) == 1 231 if (atomic_read(&nsm->sm_count) == 1
184 && nsm->sm_monitored && !nsm->sm_sticky) { 232 && nsm->sm_monitored && !nsm->sm_sticky) {
233 struct lockd_net *ln = net_generic(host->net, lockd_net_id);
234
185 dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name); 235 dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
186 236
187 status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, host->net); 237 status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, ln->nsm_clnt);
188 if (res.status != 0) 238 if (res.status != 0)
189 status = -EIO; 239 status = -EIO;
190 if (status < 0) 240 if (status < 0)
@@ -192,6 +242,8 @@ void nsm_unmonitor(const struct nlm_host *host)
192 nsm->sm_name); 242 nsm->sm_name);
193 else 243 else
194 nsm->sm_monitored = 0; 244 nsm->sm_monitored = 0;
245
246 nsm_client_put(host->net);
195 } 247 }
196} 248}
197 249
@@ -430,7 +482,7 @@ static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
430{ 482{
431 __be32 *p; 483 __be32 *p;
432 484
433 encode_nsm_string(xdr, utsname()->nodename); 485 encode_nsm_string(xdr, argp->nodename);
434 p = xdr_reserve_space(xdr, 4 + 4 + 4); 486 p = xdr_reserve_space(xdr, 4 + 4 + 4);
435 *p++ = cpu_to_be32(argp->prog); 487 *p++ = cpu_to_be32(argp->prog);
436 *p++ = cpu_to_be32(argp->vers); 488 *p++ = cpu_to_be32(argp->vers);
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 4eee248ba96e..5010b55628b4 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -12,6 +12,10 @@ struct lockd_net {
12 struct delayed_work grace_period_end; 12 struct delayed_work grace_period_end;
13 struct lock_manager lockd_manager; 13 struct lock_manager lockd_manager;
14 struct list_head grace_list; 14 struct list_head grace_list;
15
16 spinlock_t nsm_clnt_lock;
17 unsigned int nsm_users;
18 struct rpc_clnt *nsm_clnt;
15}; 19};
16 20
17extern int lockd_net_id; 21extern int lockd_net_id;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 31a63f87b806..a2aa97d45670 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -126,7 +126,7 @@ static void restart_grace(void)
126static int 126static int
127lockd(void *vrqstp) 127lockd(void *vrqstp)
128{ 128{
129 int err = 0, preverr = 0; 129 int err = 0;
130 struct svc_rqst *rqstp = vrqstp; 130 struct svc_rqst *rqstp = vrqstp;
131 131
132 /* try_to_freeze() is called from svc_recv() */ 132 /* try_to_freeze() is called from svc_recv() */
@@ -165,21 +165,8 @@ lockd(void *vrqstp)
165 * recvfrom routine. 165 * recvfrom routine.
166 */ 166 */
167 err = svc_recv(rqstp, timeout); 167 err = svc_recv(rqstp, timeout);
168 if (err == -EAGAIN || err == -EINTR) { 168 if (err == -EAGAIN || err == -EINTR)
169 preverr = err;
170 continue; 169 continue;
171 }
172 if (err < 0) {
173 if (err != preverr) {
174 printk(KERN_WARNING "%s: unexpected error "
175 "from svc_recv (%d)\n", __func__, err);
176 preverr = err;
177 }
178 schedule_timeout_interruptible(HZ);
179 continue;
180 }
181 preverr = err;
182
183 dprintk("lockd: request from %s\n", 170 dprintk("lockd: request from %s\n",
184 svc_print_addr(rqstp, buf, sizeof(buf))); 171 svc_print_addr(rqstp, buf, sizeof(buf)));
185 172
@@ -596,6 +583,7 @@ static int lockd_init_net(struct net *net)
596 583
597 INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender); 584 INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender);
598 INIT_LIST_HEAD(&ln->grace_list); 585 INIT_LIST_HEAD(&ln->grace_list);
586 spin_lock_init(&ln->nsm_clnt_lock);
599 return 0; 587 return 0;
600} 588}
601 589
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index fb1a2bedbe97..8d80c990dffd 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -289,7 +289,6 @@ static void nlmsvc_free_block(struct kref *kref)
289 dprintk("lockd: freeing block %p...\n", block); 289 dprintk("lockd: freeing block %p...\n", block);
290 290
291 /* Remove block from file's list of blocks */ 291 /* Remove block from file's list of blocks */
292 mutex_lock(&file->f_mutex);
293 list_del_init(&block->b_flist); 292 list_del_init(&block->b_flist);
294 mutex_unlock(&file->f_mutex); 293 mutex_unlock(&file->f_mutex);
295 294
@@ -303,7 +302,7 @@ static void nlmsvc_free_block(struct kref *kref)
303static void nlmsvc_release_block(struct nlm_block *block) 302static void nlmsvc_release_block(struct nlm_block *block)
304{ 303{
305 if (block != NULL) 304 if (block != NULL)
306 kref_put(&block->b_count, nlmsvc_free_block); 305 kref_put_mutex(&block->b_count, nlmsvc_free_block, &block->b_file->f_mutex);
307} 306}
308 307
309/* 308/*
diff --git a/fs/locks.c b/fs/locks.c
index 7e81bfc75164..a94e331a52a2 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1289,7 +1289,7 @@ EXPORT_SYMBOL(__break_lease);
1289void lease_get_mtime(struct inode *inode, struct timespec *time) 1289void lease_get_mtime(struct inode *inode, struct timespec *time)
1290{ 1290{
1291 struct file_lock *flock = inode->i_flock; 1291 struct file_lock *flock = inode->i_flock;
1292 if (flock && IS_LEASE(flock) && (flock->fl_type & F_WRLCK)) 1292 if (flock && IS_LEASE(flock) && (flock->fl_type == F_WRLCK))
1293 *time = current_fs_time(inode->i_sb); 1293 *time = current_fs_time(inode->i_sb);
1294 else 1294 else
1295 *time = inode->i_mtime; 1295 *time = inode->i_mtime;
@@ -1625,15 +1625,13 @@ EXPORT_SYMBOL(flock_lock_file_wait);
1625 */ 1625 */
1626SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) 1626SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
1627{ 1627{
1628 struct file *filp; 1628 struct fd f = fdget(fd);
1629 int fput_needed;
1630 struct file_lock *lock; 1629 struct file_lock *lock;
1631 int can_sleep, unlock; 1630 int can_sleep, unlock;
1632 int error; 1631 int error;
1633 1632
1634 error = -EBADF; 1633 error = -EBADF;
1635 filp = fget_light(fd, &fput_needed); 1634 if (!f.file)
1636 if (!filp)
1637 goto out; 1635 goto out;
1638 1636
1639 can_sleep = !(cmd & LOCK_NB); 1637 can_sleep = !(cmd & LOCK_NB);
@@ -1641,31 +1639,31 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
1641 unlock = (cmd == LOCK_UN); 1639 unlock = (cmd == LOCK_UN);
1642 1640
1643 if (!unlock && !(cmd & LOCK_MAND) && 1641 if (!unlock && !(cmd & LOCK_MAND) &&
1644 !(filp->f_mode & (FMODE_READ|FMODE_WRITE))) 1642 !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
1645 goto out_putf; 1643 goto out_putf;
1646 1644
1647 error = flock_make_lock(filp, &lock, cmd); 1645 error = flock_make_lock(f.file, &lock, cmd);
1648 if (error) 1646 if (error)
1649 goto out_putf; 1647 goto out_putf;
1650 if (can_sleep) 1648 if (can_sleep)
1651 lock->fl_flags |= FL_SLEEP; 1649 lock->fl_flags |= FL_SLEEP;
1652 1650
1653 error = security_file_lock(filp, lock->fl_type); 1651 error = security_file_lock(f.file, lock->fl_type);
1654 if (error) 1652 if (error)
1655 goto out_free; 1653 goto out_free;
1656 1654
1657 if (filp->f_op && filp->f_op->flock) 1655 if (f.file->f_op && f.file->f_op->flock)
1658 error = filp->f_op->flock(filp, 1656 error = f.file->f_op->flock(f.file,
1659 (can_sleep) ? F_SETLKW : F_SETLK, 1657 (can_sleep) ? F_SETLKW : F_SETLK,
1660 lock); 1658 lock);
1661 else 1659 else
1662 error = flock_lock_file_wait(filp, lock); 1660 error = flock_lock_file_wait(f.file, lock);
1663 1661
1664 out_free: 1662 out_free:
1665 locks_free_lock(lock); 1663 locks_free_lock(lock);
1666 1664
1667 out_putf: 1665 out_putf:
1668 fput_light(filp, fput_needed); 1666 fdput(f);
1669 out: 1667 out:
1670 return error; 1668 return error;
1671} 1669}
@@ -2187,8 +2185,8 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
2187 } else { 2185 } else {
2188 seq_printf(f, "%s ", 2186 seq_printf(f, "%s ",
2189 (lease_breaking(fl)) 2187 (lease_breaking(fl))
2190 ? (fl->fl_type & F_UNLCK) ? "UNLCK" : "READ " 2188 ? (fl->fl_type == F_UNLCK) ? "UNLCK" : "READ "
2191 : (fl->fl_type & F_WRLCK) ? "WRITE" : "READ "); 2189 : (fl->fl_type == F_WRLCK) ? "WRITE" : "READ ");
2192 } 2190 }
2193 if (inode) { 2191 if (inode) {
2194#ifdef WE_CAN_BREAK_LSLK_NOW 2192#ifdef WE_CAN_BREAK_LSLK_NOW
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 6984562738d3..adb90116d36b 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -208,8 +208,8 @@ static void logfs_init_inode(struct super_block *sb, struct inode *inode)
208 li->li_height = 0; 208 li->li_height = 0;
209 li->li_used_bytes = 0; 209 li->li_used_bytes = 0;
210 li->li_block = NULL; 210 li->li_block = NULL;
211 inode->i_uid = 0; 211 i_uid_write(inode, 0);
212 inode->i_gid = 0; 212 i_gid_write(inode, 0);
213 inode->i_size = 0; 213 inode->i_size = 0;
214 inode->i_blocks = 0; 214 inode->i_blocks = 0;
215 inode->i_ctime = CURRENT_TIME; 215 inode->i_ctime = CURRENT_TIME;
@@ -417,5 +417,10 @@ int logfs_init_inode_cache(void)
417 417
418void logfs_destroy_inode_cache(void) 418void logfs_destroy_inode_cache(void)
419{ 419{
420 /*
421 * Make sure all delayed rcu free inodes are flushed before we
422 * destroy cache.
423 */
424 rcu_barrier();
420 kmem_cache_destroy(logfs_inode_cache); 425 kmem_cache_destroy(logfs_inode_cache);
421} 426}
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 5be0abef603d..e1a3b6bf6324 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -119,8 +119,8 @@ static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode)
119 inode->i_mode = be16_to_cpu(di->di_mode); 119 inode->i_mode = be16_to_cpu(di->di_mode);
120 li->li_height = di->di_height; 120 li->li_height = di->di_height;
121 li->li_flags = be32_to_cpu(di->di_flags); 121 li->li_flags = be32_to_cpu(di->di_flags);
122 inode->i_uid = be32_to_cpu(di->di_uid); 122 i_uid_write(inode, be32_to_cpu(di->di_uid));
123 inode->i_gid = be32_to_cpu(di->di_gid); 123 i_gid_write(inode, be32_to_cpu(di->di_gid));
124 inode->i_size = be64_to_cpu(di->di_size); 124 inode->i_size = be64_to_cpu(di->di_size);
125 logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes)); 125 logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes));
126 inode->i_atime = be64_to_timespec(di->di_atime); 126 inode->i_atime = be64_to_timespec(di->di_atime);
@@ -156,8 +156,8 @@ static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode*di)
156 di->di_height = li->li_height; 156 di->di_height = li->li_height;
157 di->di_pad = 0; 157 di->di_pad = 0;
158 di->di_flags = cpu_to_be32(li->li_flags); 158 di->di_flags = cpu_to_be32(li->li_flags);
159 di->di_uid = cpu_to_be32(inode->i_uid); 159 di->di_uid = cpu_to_be32(i_uid_read(inode));
160 di->di_gid = cpu_to_be32(inode->i_gid); 160 di->di_gid = cpu_to_be32(i_gid_read(inode));
161 di->di_size = cpu_to_be64(i_size_read(inode)); 161 di->di_size = cpu_to_be64(i_size_read(inode));
162 di->di_used_bytes = cpu_to_be64(li->li_used_bytes); 162 di->di_used_bytes = cpu_to_be64(li->li_used_bytes);
163 di->di_atime = timespec_to_be64(inode->i_atime); 163 di->di_atime = timespec_to_be64(inode->i_atime);
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 2a503ad020d5..4fc5f8ab1c44 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -100,6 +100,11 @@ static int init_inodecache(void)
100 100
101static void destroy_inodecache(void) 101static void destroy_inodecache(void)
102{ 102{
103 /*
104 * Make sure all delayed rcu free inodes are flushed before we
105 * destroy cache.
106 */
107 rcu_barrier();
103 kmem_cache_destroy(minix_inode_cachep); 108 kmem_cache_destroy(minix_inode_cachep);
104} 109}
105 110
@@ -460,8 +465,8 @@ static struct inode *V1_minix_iget(struct inode *inode)
460 return ERR_PTR(-EIO); 465 return ERR_PTR(-EIO);
461 } 466 }
462 inode->i_mode = raw_inode->i_mode; 467 inode->i_mode = raw_inode->i_mode;
463 inode->i_uid = (uid_t)raw_inode->i_uid; 468 i_uid_write(inode, raw_inode->i_uid);
464 inode->i_gid = (gid_t)raw_inode->i_gid; 469 i_gid_write(inode, raw_inode->i_gid);
465 set_nlink(inode, raw_inode->i_nlinks); 470 set_nlink(inode, raw_inode->i_nlinks);
466 inode->i_size = raw_inode->i_size; 471 inode->i_size = raw_inode->i_size;
467 inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time; 472 inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time;
@@ -493,8 +498,8 @@ static struct inode *V2_minix_iget(struct inode *inode)
493 return ERR_PTR(-EIO); 498 return ERR_PTR(-EIO);
494 } 499 }
495 inode->i_mode = raw_inode->i_mode; 500 inode->i_mode = raw_inode->i_mode;
496 inode->i_uid = (uid_t)raw_inode->i_uid; 501 i_uid_write(inode, raw_inode->i_uid);
497 inode->i_gid = (gid_t)raw_inode->i_gid; 502 i_gid_write(inode, raw_inode->i_gid);
498 set_nlink(inode, raw_inode->i_nlinks); 503 set_nlink(inode, raw_inode->i_nlinks);
499 inode->i_size = raw_inode->i_size; 504 inode->i_size = raw_inode->i_size;
500 inode->i_mtime.tv_sec = raw_inode->i_mtime; 505 inode->i_mtime.tv_sec = raw_inode->i_mtime;
@@ -545,8 +550,8 @@ static struct buffer_head * V1_minix_update_inode(struct inode * inode)
545 if (!raw_inode) 550 if (!raw_inode)
546 return NULL; 551 return NULL;
547 raw_inode->i_mode = inode->i_mode; 552 raw_inode->i_mode = inode->i_mode;
548 raw_inode->i_uid = fs_high2lowuid(inode->i_uid); 553 raw_inode->i_uid = fs_high2lowuid(i_uid_read(inode));
549 raw_inode->i_gid = fs_high2lowgid(inode->i_gid); 554 raw_inode->i_gid = fs_high2lowgid(i_gid_read(inode));
550 raw_inode->i_nlinks = inode->i_nlink; 555 raw_inode->i_nlinks = inode->i_nlink;
551 raw_inode->i_size = inode->i_size; 556 raw_inode->i_size = inode->i_size;
552 raw_inode->i_time = inode->i_mtime.tv_sec; 557 raw_inode->i_time = inode->i_mtime.tv_sec;
@@ -572,8 +577,8 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
572 if (!raw_inode) 577 if (!raw_inode)
573 return NULL; 578 return NULL;
574 raw_inode->i_mode = inode->i_mode; 579 raw_inode->i_mode = inode->i_mode;
575 raw_inode->i_uid = fs_high2lowuid(inode->i_uid); 580 raw_inode->i_uid = fs_high2lowuid(i_uid_read(inode));
576 raw_inode->i_gid = fs_high2lowgid(inode->i_gid); 581 raw_inode->i_gid = fs_high2lowgid(i_gid_read(inode));
577 raw_inode->i_nlinks = inode->i_nlink; 582 raw_inode->i_nlinks = inode->i_nlink;
578 raw_inode->i_size = inode->i_size; 583 raw_inode->i_size = inode->i_size;
579 raw_inode->i_mtime = inode->i_mtime.tv_sec; 584 raw_inode->i_mtime = inode->i_mtime.tv_sec;
diff --git a/fs/namei.c b/fs/namei.c
index dd1ed1b8e98e..d1895f308156 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -117,18 +117,70 @@
117 * POSIX.1 2.4: an empty pathname is invalid (ENOENT). 117 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
118 * PATH_MAX includes the nul terminator --RR. 118 * PATH_MAX includes the nul terminator --RR.
119 */ 119 */
120static char *getname_flags(const char __user *filename, int flags, int *empty) 120void final_putname(struct filename *name)
121{ 121{
122 char *result = __getname(), *err; 122 if (name->separate) {
123 __putname(name->name);
124 kfree(name);
125 } else {
126 __putname(name);
127 }
128}
129
130#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename))
131
132static struct filename *
133getname_flags(const char __user *filename, int flags, int *empty)
134{
135 struct filename *result, *err;
123 int len; 136 int len;
137 long max;
138 char *kname;
139
140 result = audit_reusename(filename);
141 if (result)
142 return result;
124 143
144 result = __getname();
125 if (unlikely(!result)) 145 if (unlikely(!result))
126 return ERR_PTR(-ENOMEM); 146 return ERR_PTR(-ENOMEM);
127 147
128 len = strncpy_from_user(result, filename, PATH_MAX); 148 /*
129 err = ERR_PTR(len); 149 * First, try to embed the struct filename inside the names_cache
130 if (unlikely(len < 0)) 150 * allocation
151 */
152 kname = (char *)result + sizeof(*result);
153 result->name = kname;
154 result->separate = false;
155 max = EMBEDDED_NAME_MAX;
156
157recopy:
158 len = strncpy_from_user(kname, filename, max);
159 if (unlikely(len < 0)) {
160 err = ERR_PTR(len);
131 goto error; 161 goto error;
162 }
163
164 /*
165 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
166 * separate struct filename so we can dedicate the entire
167 * names_cache allocation for the pathname, and re-do the copy from
168 * userland.
169 */
170 if (len == EMBEDDED_NAME_MAX && max == EMBEDDED_NAME_MAX) {
171 kname = (char *)result;
172
173 result = kzalloc(sizeof(*result), GFP_KERNEL);
174 if (!result) {
175 err = ERR_PTR(-ENOMEM);
176 result = (struct filename *)kname;
177 goto error;
178 }
179 result->name = kname;
180 result->separate = true;
181 max = PATH_MAX;
182 goto recopy;
183 }
132 184
133 /* The empty path is special. */ 185 /* The empty path is special. */
134 if (unlikely(!len)) { 186 if (unlikely(!len)) {
@@ -140,30 +192,32 @@ static char *getname_flags(const char __user *filename, int flags, int *empty)
140 } 192 }
141 193
142 err = ERR_PTR(-ENAMETOOLONG); 194 err = ERR_PTR(-ENAMETOOLONG);
143 if (likely(len < PATH_MAX)) { 195 if (unlikely(len >= PATH_MAX))
144 audit_getname(result); 196 goto error;
145 return result; 197
146 } 198 result->uptr = filename;
199 audit_getname(result);
200 return result;
147 201
148error: 202error:
149 __putname(result); 203 final_putname(result);
150 return err; 204 return err;
151} 205}
152 206
153char *getname(const char __user * filename) 207struct filename *
208getname(const char __user * filename)
154{ 209{
155 return getname_flags(filename, 0, NULL); 210 return getname_flags(filename, 0, NULL);
156} 211}
212EXPORT_SYMBOL(getname);
157 213
158#ifdef CONFIG_AUDITSYSCALL 214#ifdef CONFIG_AUDITSYSCALL
159void putname(const char *name) 215void putname(struct filename *name)
160{ 216{
161 if (unlikely(!audit_dummy_context())) 217 if (unlikely(!audit_dummy_context()))
162 audit_putname(name); 218 return audit_putname(name);
163 else 219 final_putname(name);
164 __putname(name);
165} 220}
166EXPORT_SYMBOL(putname);
167#endif 221#endif
168 222
169static int check_acl(struct inode *inode, int mask) 223static int check_acl(struct inode *inode, int mask)
@@ -680,7 +734,7 @@ static inline int may_follow_link(struct path *link, struct nameidata *nd)
680 734
681 /* Allowed if owner and follower match. */ 735 /* Allowed if owner and follower match. */
682 inode = link->dentry->d_inode; 736 inode = link->dentry->d_inode;
683 if (current_cred()->fsuid == inode->i_uid) 737 if (uid_eq(current_cred()->fsuid, inode->i_uid))
684 return 0; 738 return 0;
685 739
686 /* Allowed if parent directory not sticky and world-writable. */ 740 /* Allowed if parent directory not sticky and world-writable. */
@@ -689,12 +743,12 @@ static inline int may_follow_link(struct path *link, struct nameidata *nd)
689 return 0; 743 return 0;
690 744
691 /* Allowed if parent directory and link owner match. */ 745 /* Allowed if parent directory and link owner match. */
692 if (parent->i_uid == inode->i_uid) 746 if (uid_eq(parent->i_uid, inode->i_uid))
693 return 0; 747 return 0;
694 748
749 audit_log_link_denied("follow_link", link);
695 path_put_conditional(link, nd); 750 path_put_conditional(link, nd);
696 path_put(&nd->path); 751 path_put(&nd->path);
697 audit_log_link_denied("follow_link", link);
698 return -EACCES; 752 return -EACCES;
699} 753}
700 754
@@ -759,7 +813,7 @@ static int may_linkat(struct path *link)
759 /* Source inode owner (or CAP_FOWNER) can hardlink all they like, 813 /* Source inode owner (or CAP_FOWNER) can hardlink all they like,
760 * otherwise, it must be a safe source. 814 * otherwise, it must be a safe source.
761 */ 815 */
762 if (cred->fsuid == inode->i_uid || safe_hardlink_source(inode) || 816 if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) ||
763 capable(CAP_FOWNER)) 817 capable(CAP_FOWNER))
764 return 0; 818 return 0;
765 819
@@ -810,6 +864,7 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
810 return error; 864 return error;
811 865
812out_put_nd_path: 866out_put_nd_path:
867 *p = NULL;
813 path_put(&nd->path); 868 path_put(&nd->path);
814 path_put(link); 869 path_put(link);
815 return error; 870 return error;
@@ -1797,8 +1852,6 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1797 struct nameidata *nd, struct file **fp) 1852 struct nameidata *nd, struct file **fp)
1798{ 1853{
1799 int retval = 0; 1854 int retval = 0;
1800 int fput_needed;
1801 struct file *file;
1802 1855
1803 nd->last_type = LAST_ROOT; /* if there are only slashes... */ 1856 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1804 nd->flags = flags | LOOKUP_JUMPED; 1857 nd->flags = flags | LOOKUP_JUMPED;
@@ -1850,44 +1903,41 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1850 get_fs_pwd(current->fs, &nd->path); 1903 get_fs_pwd(current->fs, &nd->path);
1851 } 1904 }
1852 } else { 1905 } else {
1906 struct fd f = fdget_raw(dfd);
1853 struct dentry *dentry; 1907 struct dentry *dentry;
1854 1908
1855 file = fget_raw_light(dfd, &fput_needed); 1909 if (!f.file)
1856 retval = -EBADF; 1910 return -EBADF;
1857 if (!file)
1858 goto out_fail;
1859 1911
1860 dentry = file->f_path.dentry; 1912 dentry = f.file->f_path.dentry;
1861 1913
1862 if (*name) { 1914 if (*name) {
1863 retval = -ENOTDIR; 1915 if (!S_ISDIR(dentry->d_inode->i_mode)) {
1864 if (!S_ISDIR(dentry->d_inode->i_mode)) 1916 fdput(f);
1865 goto fput_fail; 1917 return -ENOTDIR;
1918 }
1866 1919
1867 retval = inode_permission(dentry->d_inode, MAY_EXEC); 1920 retval = inode_permission(dentry->d_inode, MAY_EXEC);
1868 if (retval) 1921 if (retval) {
1869 goto fput_fail; 1922 fdput(f);
1923 return retval;
1924 }
1870 } 1925 }
1871 1926
1872 nd->path = file->f_path; 1927 nd->path = f.file->f_path;
1873 if (flags & LOOKUP_RCU) { 1928 if (flags & LOOKUP_RCU) {
1874 if (fput_needed) 1929 if (f.need_put)
1875 *fp = file; 1930 *fp = f.file;
1876 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); 1931 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1877 lock_rcu_walk(); 1932 lock_rcu_walk();
1878 } else { 1933 } else {
1879 path_get(&file->f_path); 1934 path_get(&nd->path);
1880 fput_light(file, fput_needed); 1935 fdput(f);
1881 } 1936 }
1882 } 1937 }
1883 1938
1884 nd->inode = nd->path.dentry->d_inode; 1939 nd->inode = nd->path.dentry->d_inode;
1885 return 0; 1940 return 0;
1886
1887fput_fail:
1888 fput_light(file, fput_needed);
1889out_fail:
1890 return retval;
1891} 1941}
1892 1942
1893static inline int lookup_last(struct nameidata *nd, struct path *path) 1943static inline int lookup_last(struct nameidata *nd, struct path *path)
@@ -1967,24 +2017,29 @@ static int path_lookupat(int dfd, const char *name,
1967 return err; 2017 return err;
1968} 2018}
1969 2019
1970static int do_path_lookup(int dfd, const char *name, 2020static int filename_lookup(int dfd, struct filename *name,
1971 unsigned int flags, struct nameidata *nd) 2021 unsigned int flags, struct nameidata *nd)
1972{ 2022{
1973 int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd); 2023 int retval = path_lookupat(dfd, name->name, flags | LOOKUP_RCU, nd);
1974 if (unlikely(retval == -ECHILD)) 2024 if (unlikely(retval == -ECHILD))
1975 retval = path_lookupat(dfd, name, flags, nd); 2025 retval = path_lookupat(dfd, name->name, flags, nd);
1976 if (unlikely(retval == -ESTALE)) 2026 if (unlikely(retval == -ESTALE))
1977 retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd); 2027 retval = path_lookupat(dfd, name->name,
2028 flags | LOOKUP_REVAL, nd);
1978 2029
1979 if (likely(!retval)) { 2030 if (likely(!retval))
1980 if (unlikely(!audit_dummy_context())) { 2031 audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT);
1981 if (nd->path.dentry && nd->inode)
1982 audit_inode(name, nd->path.dentry);
1983 }
1984 }
1985 return retval; 2032 return retval;
1986} 2033}
1987 2034
2035static int do_path_lookup(int dfd, const char *name,
2036 unsigned int flags, struct nameidata *nd)
2037{
2038 struct filename filename = { .name = name };
2039
2040 return filename_lookup(dfd, &filename, flags, nd);
2041}
2042
1988/* does lookup, returns the object with parent locked */ 2043/* does lookup, returns the object with parent locked */
1989struct dentry *kern_path_locked(const char *name, struct path *path) 2044struct dentry *kern_path_locked(const char *name, struct path *path)
1990{ 2045{
@@ -2102,13 +2157,13 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
2102 struct path *path, int *empty) 2157 struct path *path, int *empty)
2103{ 2158{
2104 struct nameidata nd; 2159 struct nameidata nd;
2105 char *tmp = getname_flags(name, flags, empty); 2160 struct filename *tmp = getname_flags(name, flags, empty);
2106 int err = PTR_ERR(tmp); 2161 int err = PTR_ERR(tmp);
2107 if (!IS_ERR(tmp)) { 2162 if (!IS_ERR(tmp)) {
2108 2163
2109 BUG_ON(flags & LOOKUP_PARENT); 2164 BUG_ON(flags & LOOKUP_PARENT);
2110 2165
2111 err = do_path_lookup(dfd, tmp, flags, &nd); 2166 err = filename_lookup(dfd, tmp, flags, &nd);
2112 putname(tmp); 2167 putname(tmp);
2113 if (!err) 2168 if (!err)
2114 *path = nd.path; 2169 *path = nd.path;
@@ -2122,22 +2177,28 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
2122 return user_path_at_empty(dfd, name, flags, path, NULL); 2177 return user_path_at_empty(dfd, name, flags, path, NULL);
2123} 2178}
2124 2179
2125static int user_path_parent(int dfd, const char __user *path, 2180/*
2126 struct nameidata *nd, char **name) 2181 * NB: most callers don't do anything directly with the reference to the
2182 * to struct filename, but the nd->last pointer points into the name string
2183 * allocated by getname. So we must hold the reference to it until all
2184 * path-walking is complete.
2185 */
2186static struct filename *
2187user_path_parent(int dfd, const char __user *path, struct nameidata *nd)
2127{ 2188{
2128 char *s = getname(path); 2189 struct filename *s = getname(path);
2129 int error; 2190 int error;
2130 2191
2131 if (IS_ERR(s)) 2192 if (IS_ERR(s))
2132 return PTR_ERR(s); 2193 return s;
2133 2194
2134 error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd); 2195 error = filename_lookup(dfd, s, LOOKUP_PARENT, nd);
2135 if (error) 2196 if (error) {
2136 putname(s); 2197 putname(s);
2137 else 2198 return ERR_PTR(error);
2138 *name = s; 2199 }
2139 2200
2140 return error; 2201 return s;
2141} 2202}
2142 2203
2143/* 2204/*
@@ -2184,7 +2245,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
2184 return -ENOENT; 2245 return -ENOENT;
2185 2246
2186 BUG_ON(victim->d_parent->d_inode != dir); 2247 BUG_ON(victim->d_parent->d_inode != dir);
2187 audit_inode_child(victim, dir); 2248 audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
2188 2249
2189 error = inode_permission(dir, MAY_WRITE | MAY_EXEC); 2250 error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
2190 if (error) 2251 if (error)
@@ -2629,7 +2690,7 @@ out_dput:
2629 */ 2690 */
2630static int do_last(struct nameidata *nd, struct path *path, 2691static int do_last(struct nameidata *nd, struct path *path,
2631 struct file *file, const struct open_flags *op, 2692 struct file *file, const struct open_flags *op,
2632 int *opened, const char *pathname) 2693 int *opened, struct filename *name)
2633{ 2694{
2634 struct dentry *dir = nd->path.dentry; 2695 struct dentry *dir = nd->path.dentry;
2635 int open_flag = op->open_flag; 2696 int open_flag = op->open_flag;
@@ -2656,7 +2717,7 @@ static int do_last(struct nameidata *nd, struct path *path,
2656 error = complete_walk(nd); 2717 error = complete_walk(nd);
2657 if (error) 2718 if (error)
2658 return error; 2719 return error;
2659 audit_inode(pathname, nd->path.dentry); 2720 audit_inode(name, nd->path.dentry, 0);
2660 if (open_flag & O_CREAT) { 2721 if (open_flag & O_CREAT) {
2661 error = -EISDIR; 2722 error = -EISDIR;
2662 goto out; 2723 goto out;
@@ -2666,7 +2727,7 @@ static int do_last(struct nameidata *nd, struct path *path,
2666 error = complete_walk(nd); 2727 error = complete_walk(nd);
2667 if (error) 2728 if (error)
2668 return error; 2729 return error;
2669 audit_inode(pathname, dir); 2730 audit_inode(name, dir, 0);
2670 goto finish_open; 2731 goto finish_open;
2671 } 2732 }
2672 2733
@@ -2695,7 +2756,7 @@ static int do_last(struct nameidata *nd, struct path *path,
2695 if (error) 2756 if (error)
2696 return error; 2757 return error;
2697 2758
2698 audit_inode(pathname, dir); 2759 audit_inode(name, dir, 0);
2699 error = -EISDIR; 2760 error = -EISDIR;
2700 /* trailing slashes? */ 2761 /* trailing slashes? */
2701 if (nd->last.name[nd->last.len]) 2762 if (nd->last.name[nd->last.len])
@@ -2725,7 +2786,7 @@ retry_lookup:
2725 !S_ISREG(file->f_path.dentry->d_inode->i_mode)) 2786 !S_ISREG(file->f_path.dentry->d_inode->i_mode))
2726 will_truncate = false; 2787 will_truncate = false;
2727 2788
2728 audit_inode(pathname, file->f_path.dentry); 2789 audit_inode(name, file->f_path.dentry, 0);
2729 goto opened; 2790 goto opened;
2730 } 2791 }
2731 2792
@@ -2742,7 +2803,7 @@ retry_lookup:
2742 * create/update audit record if it already exists. 2803 * create/update audit record if it already exists.
2743 */ 2804 */
2744 if (path->dentry->d_inode) 2805 if (path->dentry->d_inode)
2745 audit_inode(pathname, path->dentry); 2806 audit_inode(name, path->dentry, 0);
2746 2807
2747 /* 2808 /*
2748 * If atomic_open() acquired write access it is dropped now due to 2809 * If atomic_open() acquired write access it is dropped now due to
@@ -2807,7 +2868,7 @@ finish_lookup:
2807 error = -ENOTDIR; 2868 error = -ENOTDIR;
2808 if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup) 2869 if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup)
2809 goto out; 2870 goto out;
2810 audit_inode(pathname, nd->path.dentry); 2871 audit_inode(name, nd->path.dentry, 0);
2811finish_open: 2872finish_open:
2812 if (!S_ISREG(nd->inode->i_mode)) 2873 if (!S_ISREG(nd->inode->i_mode))
2813 will_truncate = false; 2874 will_truncate = false;
@@ -2875,7 +2936,7 @@ stale_open:
2875 goto retry_lookup; 2936 goto retry_lookup;
2876} 2937}
2877 2938
2878static struct file *path_openat(int dfd, const char *pathname, 2939static struct file *path_openat(int dfd, struct filename *pathname,
2879 struct nameidata *nd, const struct open_flags *op, int flags) 2940 struct nameidata *nd, const struct open_flags *op, int flags)
2880{ 2941{
2881 struct file *base = NULL; 2942 struct file *base = NULL;
@@ -2890,12 +2951,12 @@ static struct file *path_openat(int dfd, const char *pathname,
2890 2951
2891 file->f_flags = op->open_flag; 2952 file->f_flags = op->open_flag;
2892 2953
2893 error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base); 2954 error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base);
2894 if (unlikely(error)) 2955 if (unlikely(error))
2895 goto out; 2956 goto out;
2896 2957
2897 current->total_link_count = 0; 2958 current->total_link_count = 0;
2898 error = link_path_walk(pathname, nd); 2959 error = link_path_walk(pathname->name, nd);
2899 if (unlikely(error)) 2960 if (unlikely(error))
2900 goto out; 2961 goto out;
2901 2962
@@ -2941,7 +3002,7 @@ out:
2941 return file; 3002 return file;
2942} 3003}
2943 3004
2944struct file *do_filp_open(int dfd, const char *pathname, 3005struct file *do_filp_open(int dfd, struct filename *pathname,
2945 const struct open_flags *op, int flags) 3006 const struct open_flags *op, int flags)
2946{ 3007{
2947 struct nameidata nd; 3008 struct nameidata nd;
@@ -2960,6 +3021,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
2960{ 3021{
2961 struct nameidata nd; 3022 struct nameidata nd;
2962 struct file *file; 3023 struct file *file;
3024 struct filename filename = { .name = name };
2963 3025
2964 nd.root.mnt = mnt; 3026 nd.root.mnt = mnt;
2965 nd.root.dentry = dentry; 3027 nd.root.dentry = dentry;
@@ -2969,11 +3031,11 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
2969 if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN) 3031 if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
2970 return ERR_PTR(-ELOOP); 3032 return ERR_PTR(-ELOOP);
2971 3033
2972 file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU); 3034 file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU);
2973 if (unlikely(file == ERR_PTR(-ECHILD))) 3035 if (unlikely(file == ERR_PTR(-ECHILD)))
2974 file = path_openat(-1, name, &nd, op, flags); 3036 file = path_openat(-1, &filename, &nd, op, flags);
2975 if (unlikely(file == ERR_PTR(-ESTALE))) 3037 if (unlikely(file == ERR_PTR(-ESTALE)))
2976 file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL); 3038 file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_REVAL);
2977 return file; 3039 return file;
2978} 3040}
2979 3041
@@ -3048,11 +3110,11 @@ EXPORT_SYMBOL(done_path_create);
3048 3110
3049struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir) 3111struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir)
3050{ 3112{
3051 char *tmp = getname(pathname); 3113 struct filename *tmp = getname(pathname);
3052 struct dentry *res; 3114 struct dentry *res;
3053 if (IS_ERR(tmp)) 3115 if (IS_ERR(tmp))
3054 return ERR_CAST(tmp); 3116 return ERR_CAST(tmp);
3055 res = kern_path_create(dfd, tmp, path, is_dir); 3117 res = kern_path_create(dfd, tmp->name, path, is_dir);
3056 putname(tmp); 3118 putname(tmp);
3057 return res; 3119 return res;
3058} 3120}
@@ -3257,13 +3319,13 @@ out:
3257static long do_rmdir(int dfd, const char __user *pathname) 3319static long do_rmdir(int dfd, const char __user *pathname)
3258{ 3320{
3259 int error = 0; 3321 int error = 0;
3260 char * name; 3322 struct filename *name;
3261 struct dentry *dentry; 3323 struct dentry *dentry;
3262 struct nameidata nd; 3324 struct nameidata nd;
3263 3325
3264 error = user_path_parent(dfd, pathname, &nd, &name); 3326 name = user_path_parent(dfd, pathname, &nd);
3265 if (error) 3327 if (IS_ERR(name))
3266 return error; 3328 return PTR_ERR(name);
3267 3329
3268 switch(nd.last_type) { 3330 switch(nd.last_type) {
3269 case LAST_DOTDOT: 3331 case LAST_DOTDOT:
@@ -3352,14 +3414,14 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
3352static long do_unlinkat(int dfd, const char __user *pathname) 3414static long do_unlinkat(int dfd, const char __user *pathname)
3353{ 3415{
3354 int error; 3416 int error;
3355 char *name; 3417 struct filename *name;
3356 struct dentry *dentry; 3418 struct dentry *dentry;
3357 struct nameidata nd; 3419 struct nameidata nd;
3358 struct inode *inode = NULL; 3420 struct inode *inode = NULL;
3359 3421
3360 error = user_path_parent(dfd, pathname, &nd, &name); 3422 name = user_path_parent(dfd, pathname, &nd);
3361 if (error) 3423 if (IS_ERR(name))
3362 return error; 3424 return PTR_ERR(name);
3363 3425
3364 error = -EISDIR; 3426 error = -EISDIR;
3365 if (nd.last_type != LAST_NORM) 3427 if (nd.last_type != LAST_NORM)
@@ -3443,7 +3505,7 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
3443 int, newdfd, const char __user *, newname) 3505 int, newdfd, const char __user *, newname)
3444{ 3506{
3445 int error; 3507 int error;
3446 char *from; 3508 struct filename *from;
3447 struct dentry *dentry; 3509 struct dentry *dentry;
3448 struct path path; 3510 struct path path;
3449 3511
@@ -3456,9 +3518,9 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
3456 if (IS_ERR(dentry)) 3518 if (IS_ERR(dentry))
3457 goto out_putname; 3519 goto out_putname;
3458 3520
3459 error = security_path_symlink(&path, dentry, from); 3521 error = security_path_symlink(&path, dentry, from->name);
3460 if (!error) 3522 if (!error)
3461 error = vfs_symlink(path.dentry->d_inode, dentry, from); 3523 error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
3462 done_path_create(&path, dentry); 3524 done_path_create(&path, dentry);
3463out_putname: 3525out_putname:
3464 putname(from); 3526 putname(from);
@@ -3738,17 +3800,21 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
3738 struct dentry *old_dentry, *new_dentry; 3800 struct dentry *old_dentry, *new_dentry;
3739 struct dentry *trap; 3801 struct dentry *trap;
3740 struct nameidata oldnd, newnd; 3802 struct nameidata oldnd, newnd;
3741 char *from; 3803 struct filename *from;
3742 char *to; 3804 struct filename *to;
3743 int error; 3805 int error;
3744 3806
3745 error = user_path_parent(olddfd, oldname, &oldnd, &from); 3807 from = user_path_parent(olddfd, oldname, &oldnd);
3746 if (error) 3808 if (IS_ERR(from)) {
3809 error = PTR_ERR(from);
3747 goto exit; 3810 goto exit;
3811 }
3748 3812
3749 error = user_path_parent(newdfd, newname, &newnd, &to); 3813 to = user_path_parent(newdfd, newname, &newnd);
3750 if (error) 3814 if (IS_ERR(to)) {
3815 error = PTR_ERR(to);
3751 goto exit1; 3816 goto exit1;
3817 }
3752 3818
3753 error = -EXDEV; 3819 error = -EXDEV;
3754 if (oldnd.path.mnt != newnd.path.mnt) 3820 if (oldnd.path.mnt != newnd.path.mnt)
@@ -3971,8 +4037,7 @@ EXPORT_SYMBOL(user_path_at);
3971EXPORT_SYMBOL(follow_down_one); 4037EXPORT_SYMBOL(follow_down_one);
3972EXPORT_SYMBOL(follow_down); 4038EXPORT_SYMBOL(follow_down);
3973EXPORT_SYMBOL(follow_up); 4039EXPORT_SYMBOL(follow_up);
3974EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ 4040EXPORT_SYMBOL(get_write_access); /* nfsd */
3975EXPORT_SYMBOL(getname);
3976EXPORT_SYMBOL(lock_rename); 4041EXPORT_SYMBOL(lock_rename);
3977EXPORT_SYMBOL(lookup_one_len); 4042EXPORT_SYMBOL(lookup_one_len);
3978EXPORT_SYMBOL(page_follow_link_light); 4043EXPORT_SYMBOL(page_follow_link_light);
diff --git a/fs/namespace.c b/fs/namespace.c
index 4d31f73e2561..24960626bb6b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1640,7 +1640,7 @@ static int do_change_type(struct path *path, int flag)
1640/* 1640/*
1641 * do loopback mount. 1641 * do loopback mount.
1642 */ 1642 */
1643static int do_loopback(struct path *path, char *old_name, 1643static int do_loopback(struct path *path, const char *old_name,
1644 int recurse) 1644 int recurse)
1645{ 1645{
1646 LIST_HEAD(umount_list); 1646 LIST_HEAD(umount_list);
@@ -1764,7 +1764,7 @@ static inline int tree_contains_unbindable(struct mount *mnt)
1764 return 0; 1764 return 0;
1765} 1765}
1766 1766
1767static int do_move_mount(struct path *path, char *old_name) 1767static int do_move_mount(struct path *path, const char *old_name)
1768{ 1768{
1769 struct path old_path, parent_path; 1769 struct path old_path, parent_path;
1770 struct mount *p; 1770 struct mount *p;
@@ -1886,8 +1886,14 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
1886 return err; 1886 return err;
1887 1887
1888 err = -EINVAL; 1888 err = -EINVAL;
1889 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(real_mount(path->mnt))) 1889 if (unlikely(!check_mnt(real_mount(path->mnt)))) {
1890 goto unlock; 1890 /* that's acceptable only for automounts done in private ns */
1891 if (!(mnt_flags & MNT_SHRINKABLE))
1892 goto unlock;
1893 /* ... and for those we'd better have mountpoint still alive */
1894 if (!real_mount(path->mnt)->mnt_ns)
1895 goto unlock;
1896 }
1891 1897
1892 /* Refuse the same filesystem on the same mount point */ 1898 /* Refuse the same filesystem on the same mount point */
1893 err = -EBUSY; 1899 err = -EBUSY;
@@ -1911,8 +1917,8 @@ unlock:
1911 * create a new mount for userspace and request it to be added into the 1917 * create a new mount for userspace and request it to be added into the
1912 * namespace's tree 1918 * namespace's tree
1913 */ 1919 */
1914static int do_new_mount(struct path *path, char *type, int flags, 1920static int do_new_mount(struct path *path, const char *type, int flags,
1915 int mnt_flags, char *name, void *data) 1921 int mnt_flags, const char *name, void *data)
1916{ 1922{
1917 struct vfsmount *mnt; 1923 struct vfsmount *mnt;
1918 int err; 1924 int err;
@@ -2185,8 +2191,8 @@ int copy_mount_string(const void __user *data, char **where)
2185 * Therefore, if this magic number is present, it carries no information 2191 * Therefore, if this magic number is present, it carries no information
2186 * and must be discarded. 2192 * and must be discarded.
2187 */ 2193 */
2188long do_mount(char *dev_name, char *dir_name, char *type_page, 2194long do_mount(const char *dev_name, const char *dir_name,
2189 unsigned long flags, void *data_page) 2195 const char *type_page, unsigned long flags, void *data_page)
2190{ 2196{
2191 struct path path; 2197 struct path path;
2192 int retval = 0; 2198 int retval = 0;
@@ -2402,7 +2408,7 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2402{ 2408{
2403 int ret; 2409 int ret;
2404 char *kernel_type; 2410 char *kernel_type;
2405 char *kernel_dir; 2411 struct filename *kernel_dir;
2406 char *kernel_dev; 2412 char *kernel_dev;
2407 unsigned long data_page; 2413 unsigned long data_page;
2408 2414
@@ -2424,7 +2430,7 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2424 if (ret < 0) 2430 if (ret < 0)
2425 goto out_data; 2431 goto out_data;
2426 2432
2427 ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags, 2433 ret = do_mount(kernel_dev, kernel_dir->name, kernel_type, flags,
2428 (void *) data_page); 2434 (void *) data_page);
2429 2435
2430 free_page(data_page); 2436 free_page(data_page);
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 333df07ae3bd..d7e9fe77188a 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -89,6 +89,11 @@ static int init_inodecache(void)
89 89
90static void destroy_inodecache(void) 90static void destroy_inodecache(void)
91{ 91{
92 /*
93 * Make sure all delayed rcu free inodes are flushed before we
94 * destroy cache.
95 */
96 rcu_barrier();
92 kmem_cache_destroy(ncp_inode_cachep); 97 kmem_cache_destroy(ncp_inode_cachep);
93} 98}
94 99
@@ -314,11 +319,11 @@ static void ncp_stop_tasks(struct ncp_server *server) {
314 release_sock(sk); 319 release_sock(sk);
315 del_timer_sync(&server->timeout_tm); 320 del_timer_sync(&server->timeout_tm);
316 321
317 flush_work_sync(&server->rcv.tq); 322 flush_work(&server->rcv.tq);
318 if (sk->sk_socket->type == SOCK_STREAM) 323 if (sk->sk_socket->type == SOCK_STREAM)
319 flush_work_sync(&server->tx.tq); 324 flush_work(&server->tx.tq);
320 else 325 else
321 flush_work_sync(&server->timeout_tq); 326 flush_work(&server->timeout_tq);
322} 327}
323 328
324static int ncp_show_options(struct seq_file *seq, struct dentry *root) 329static int ncp_show_options(struct seq_file *seq, struct dentry *root)
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index db7ad719628a..13ca196385f5 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -95,8 +95,8 @@ config NFS_SWAP
95 This option enables swapon to work on files located on NFS mounts. 95 This option enables swapon to work on files located on NFS mounts.
96 96
97config NFS_V4_1 97config NFS_V4_1
98 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" 98 bool "NFS client support for NFSv4.1"
99 depends on NFS_V4 && EXPERIMENTAL 99 depends on NFS_V4
100 select SUNRPC_BACKCHANNEL 100 select SUNRPC_BACKCHANNEL
101 help 101 help
102 This option enables support for minor version 1 of the NFSv4 protocol 102 This option enables support for minor version 1 of the NFSv4 protocol
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index dd392ed5f2e2..f1027b06a1a9 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -37,6 +37,7 @@
37#include <linux/bio.h> /* struct bio */ 37#include <linux/bio.h> /* struct bio */
38#include <linux/buffer_head.h> /* various write calls */ 38#include <linux/buffer_head.h> /* various write calls */
39#include <linux/prefetch.h> 39#include <linux/prefetch.h>
40#include <linux/pagevec.h>
40 41
41#include "../pnfs.h" 42#include "../pnfs.h"
42#include "../internal.h" 43#include "../internal.h"
@@ -162,25 +163,39 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
162 return bio; 163 return bio;
163} 164}
164 165
165static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, 166static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
166 sector_t isect, struct page *page, 167 sector_t isect, struct page *page,
167 struct pnfs_block_extent *be, 168 struct pnfs_block_extent *be,
168 void (*end_io)(struct bio *, int err), 169 void (*end_io)(struct bio *, int err),
169 struct parallel_io *par) 170 struct parallel_io *par,
171 unsigned int offset, int len)
170{ 172{
173 isect = isect + (offset >> SECTOR_SHIFT);
174 dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
175 npg, rw, (unsigned long long)isect, offset, len);
171retry: 176retry:
172 if (!bio) { 177 if (!bio) {
173 bio = bl_alloc_init_bio(npg, isect, be, end_io, par); 178 bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
174 if (!bio) 179 if (!bio)
175 return ERR_PTR(-ENOMEM); 180 return ERR_PTR(-ENOMEM);
176 } 181 }
177 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { 182 if (bio_add_page(bio, page, len, offset) < len) {
178 bio = bl_submit_bio(rw, bio); 183 bio = bl_submit_bio(rw, bio);
179 goto retry; 184 goto retry;
180 } 185 }
181 return bio; 186 return bio;
182} 187}
183 188
189static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
190 sector_t isect, struct page *page,
191 struct pnfs_block_extent *be,
192 void (*end_io)(struct bio *, int err),
193 struct parallel_io *par)
194{
195 return do_add_page_to_bio(bio, npg, rw, isect, page, be,
196 end_io, par, 0, PAGE_CACHE_SIZE);
197}
198
184/* This is basically copied from mpage_end_io_read */ 199/* This is basically copied from mpage_end_io_read */
185static void bl_end_io_read(struct bio *bio, int err) 200static void bl_end_io_read(struct bio *bio, int err)
186{ 201{
@@ -228,14 +243,6 @@ bl_end_par_io_read(void *data, int unused)
228 schedule_work(&rdata->task.u.tk_work); 243 schedule_work(&rdata->task.u.tk_work);
229} 244}
230 245
231static bool
232bl_check_alignment(u64 offset, u32 len, unsigned long blkmask)
233{
234 if ((offset & blkmask) || (len & blkmask))
235 return false;
236 return true;
237}
238
239static enum pnfs_try_status 246static enum pnfs_try_status
240bl_read_pagelist(struct nfs_read_data *rdata) 247bl_read_pagelist(struct nfs_read_data *rdata)
241{ 248{
@@ -246,15 +253,15 @@ bl_read_pagelist(struct nfs_read_data *rdata)
246 sector_t isect, extent_length = 0; 253 sector_t isect, extent_length = 0;
247 struct parallel_io *par; 254 struct parallel_io *par;
248 loff_t f_offset = rdata->args.offset; 255 loff_t f_offset = rdata->args.offset;
256 size_t bytes_left = rdata->args.count;
257 unsigned int pg_offset, pg_len;
249 struct page **pages = rdata->args.pages; 258 struct page **pages = rdata->args.pages;
250 int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; 259 int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
260 const bool is_dio = (header->dreq != NULL);
251 261
252 dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__, 262 dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
253 rdata->pages.npages, f_offset, (unsigned int)rdata->args.count); 263 rdata->pages.npages, f_offset, (unsigned int)rdata->args.count);
254 264
255 if (!bl_check_alignment(f_offset, rdata->args.count, PAGE_CACHE_MASK))
256 goto use_mds;
257
258 par = alloc_parallel(rdata); 265 par = alloc_parallel(rdata);
259 if (!par) 266 if (!par)
260 goto use_mds; 267 goto use_mds;
@@ -284,36 +291,53 @@ bl_read_pagelist(struct nfs_read_data *rdata)
284 extent_length = min(extent_length, cow_length); 291 extent_length = min(extent_length, cow_length);
285 } 292 }
286 } 293 }
294
295 if (is_dio) {
296 pg_offset = f_offset & ~PAGE_CACHE_MASK;
297 if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
298 pg_len = PAGE_CACHE_SIZE - pg_offset;
299 else
300 pg_len = bytes_left;
301
302 f_offset += pg_len;
303 bytes_left -= pg_len;
304 isect += (pg_offset >> SECTOR_SHIFT);
305 } else {
306 pg_offset = 0;
307 pg_len = PAGE_CACHE_SIZE;
308 }
309
287 hole = is_hole(be, isect); 310 hole = is_hole(be, isect);
288 if (hole && !cow_read) { 311 if (hole && !cow_read) {
289 bio = bl_submit_bio(READ, bio); 312 bio = bl_submit_bio(READ, bio);
290 /* Fill hole w/ zeroes w/o accessing device */ 313 /* Fill hole w/ zeroes w/o accessing device */
291 dprintk("%s Zeroing page for hole\n", __func__); 314 dprintk("%s Zeroing page for hole\n", __func__);
292 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE); 315 zero_user_segment(pages[i], pg_offset, pg_len);
293 print_page(pages[i]); 316 print_page(pages[i]);
294 SetPageUptodate(pages[i]); 317 SetPageUptodate(pages[i]);
295 } else { 318 } else {
296 struct pnfs_block_extent *be_read; 319 struct pnfs_block_extent *be_read;
297 320
298 be_read = (hole && cow_read) ? cow_read : be; 321 be_read = (hole && cow_read) ? cow_read : be;
299 bio = bl_add_page_to_bio(bio, rdata->pages.npages - i, 322 bio = do_add_page_to_bio(bio, rdata->pages.npages - i,
300 READ, 323 READ,
301 isect, pages[i], be_read, 324 isect, pages[i], be_read,
302 bl_end_io_read, par); 325 bl_end_io_read, par,
326 pg_offset, pg_len);
303 if (IS_ERR(bio)) { 327 if (IS_ERR(bio)) {
304 header->pnfs_error = PTR_ERR(bio); 328 header->pnfs_error = PTR_ERR(bio);
305 bio = NULL; 329 bio = NULL;
306 goto out; 330 goto out;
307 } 331 }
308 } 332 }
309 isect += PAGE_CACHE_SECTORS; 333 isect += (pg_len >> SECTOR_SHIFT);
310 extent_length -= PAGE_CACHE_SECTORS; 334 extent_length -= PAGE_CACHE_SECTORS;
311 } 335 }
312 if ((isect << SECTOR_SHIFT) >= header->inode->i_size) { 336 if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
313 rdata->res.eof = 1; 337 rdata->res.eof = 1;
314 rdata->res.count = header->inode->i_size - f_offset; 338 rdata->res.count = header->inode->i_size - rdata->args.offset;
315 } else { 339 } else {
316 rdata->res.count = (isect << SECTOR_SHIFT) - f_offset; 340 rdata->res.count = (isect << SECTOR_SHIFT) - rdata->args.offset;
317 } 341 }
318out: 342out:
319 bl_put_extent(be); 343 bl_put_extent(be);
@@ -461,6 +485,106 @@ map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
461 return; 485 return;
462} 486}
463 487
488static void
489bl_read_single_end_io(struct bio *bio, int error)
490{
491 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
492 struct page *page = bvec->bv_page;
493
494 /* Only one page in bvec */
495 unlock_page(page);
496}
497
498static int
499bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
500 unsigned int offset, unsigned int len)
501{
502 struct bio *bio;
503 struct page *shadow_page;
504 sector_t isect;
505 char *kaddr, *kshadow_addr;
506 int ret = 0;
507
508 dprintk("%s: offset %u len %u\n", __func__, offset, len);
509
510 shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
511 if (shadow_page == NULL)
512 return -ENOMEM;
513
514 bio = bio_alloc(GFP_NOIO, 1);
515 if (bio == NULL)
516 return -ENOMEM;
517
518 isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
519 (offset / SECTOR_SIZE);
520
521 bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
522 bio->bi_bdev = be->be_mdev;
523 bio->bi_end_io = bl_read_single_end_io;
524
525 lock_page(shadow_page);
526 if (bio_add_page(bio, shadow_page,
527 SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
528 unlock_page(shadow_page);
529 bio_put(bio);
530 return -EIO;
531 }
532
533 submit_bio(READ, bio);
534 wait_on_page_locked(shadow_page);
535 if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
536 ret = -EIO;
537 } else {
538 kaddr = kmap_atomic(page);
539 kshadow_addr = kmap_atomic(shadow_page);
540 memcpy(kaddr + offset, kshadow_addr + offset, len);
541 kunmap_atomic(kshadow_addr);
542 kunmap_atomic(kaddr);
543 }
544 __free_page(shadow_page);
545 bio_put(bio);
546
547 return ret;
548}
549
550static int
551bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
552 unsigned int dirty_offset, unsigned int dirty_len,
553 bool full_page)
554{
555 int ret = 0;
556 unsigned int start, end;
557
558 if (full_page) {
559 start = 0;
560 end = PAGE_CACHE_SIZE;
561 } else {
562 start = round_down(dirty_offset, SECTOR_SIZE);
563 end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
564 }
565
566 dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len);
567 if (!be) {
568 zero_user_segments(page, start, dirty_offset,
569 dirty_offset + dirty_len, end);
570 if (start == 0 && end == PAGE_CACHE_SIZE &&
571 trylock_page(page)) {
572 SetPageUptodate(page);
573 unlock_page(page);
574 }
575 return ret;
576 }
577
578 if (start != dirty_offset)
579 ret = bl_do_readpage_sync(page, be, start, dirty_offset - start);
580
581 if (!ret && (dirty_offset + dirty_len < end))
582 ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
583 end - dirty_offset - dirty_len);
584
585 return ret;
586}
587
464/* Given an unmapped page, zero it or read in page for COW, page is locked 588/* Given an unmapped page, zero it or read in page for COW, page is locked
465 * by caller. 589 * by caller.
466 */ 590 */
@@ -494,7 +618,6 @@ init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
494 SetPageUptodate(page); 618 SetPageUptodate(page);
495 619
496cleanup: 620cleanup:
497 bl_put_extent(cow_read);
498 if (bh) 621 if (bh)
499 free_buffer_head(bh); 622 free_buffer_head(bh);
500 if (ret) { 623 if (ret) {
@@ -566,6 +689,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
566 struct parallel_io *par = NULL; 689 struct parallel_io *par = NULL;
567 loff_t offset = wdata->args.offset; 690 loff_t offset = wdata->args.offset;
568 size_t count = wdata->args.count; 691 size_t count = wdata->args.count;
692 unsigned int pg_offset, pg_len, saved_len;
569 struct page **pages = wdata->args.pages; 693 struct page **pages = wdata->args.pages;
570 struct page *page; 694 struct page *page;
571 pgoff_t index; 695 pgoff_t index;
@@ -574,10 +698,13 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
574 NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT; 698 NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
575 699
576 dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); 700 dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
577 /* Check for alignment first */
578 if (!bl_check_alignment(offset, count, PAGE_CACHE_MASK))
579 goto out_mds;
580 701
702 if (header->dreq != NULL &&
703 (!IS_ALIGNED(offset, NFS_SERVER(header->inode)->pnfs_blksize) ||
704 !IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) {
705 dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n");
706 goto out_mds;
707 }
581 /* At this point, wdata->pages is a (sequential) list of nfs_pages. 708 /* At this point, wdata->pages is a (sequential) list of nfs_pages.
582 * We want to write each, and if there is an error set pnfs_error 709 * We want to write each, and if there is an error set pnfs_error
583 * to have it redone using nfs. 710 * to have it redone using nfs.
@@ -674,10 +801,11 @@ next_page:
674 if (!extent_length) { 801 if (!extent_length) {
675 /* We've used up the previous extent */ 802 /* We've used up the previous extent */
676 bl_put_extent(be); 803 bl_put_extent(be);
804 bl_put_extent(cow_read);
677 bio = bl_submit_bio(WRITE, bio); 805 bio = bl_submit_bio(WRITE, bio);
678 /* Get the next one */ 806 /* Get the next one */
679 be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), 807 be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
680 isect, NULL); 808 isect, &cow_read);
681 if (!be || !is_writable(be, isect)) { 809 if (!be || !is_writable(be, isect)) {
682 header->pnfs_error = -EINVAL; 810 header->pnfs_error = -EINVAL;
683 goto out; 811 goto out;
@@ -694,7 +822,26 @@ next_page:
694 extent_length = be->be_length - 822 extent_length = be->be_length -
695 (isect - be->be_f_offset); 823 (isect - be->be_f_offset);
696 } 824 }
697 if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 825
826 dprintk("%s offset %lld count %Zu\n", __func__, offset, count);
827 pg_offset = offset & ~PAGE_CACHE_MASK;
828 if (pg_offset + count > PAGE_CACHE_SIZE)
829 pg_len = PAGE_CACHE_SIZE - pg_offset;
830 else
831 pg_len = count;
832
833 saved_len = pg_len;
834 if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
835 !bl_is_sector_init(be->be_inval, isect)) {
836 ret = bl_read_partial_page_sync(pages[i], cow_read,
837 pg_offset, pg_len, true);
838 if (ret) {
839 dprintk("%s bl_read_partial_page_sync fail %d\n",
840 __func__, ret);
841 header->pnfs_error = ret;
842 goto out;
843 }
844
698 ret = bl_mark_sectors_init(be->be_inval, isect, 845 ret = bl_mark_sectors_init(be->be_inval, isect,
699 PAGE_CACHE_SECTORS); 846 PAGE_CACHE_SECTORS);
700 if (unlikely(ret)) { 847 if (unlikely(ret)) {
@@ -703,15 +850,35 @@ next_page:
703 header->pnfs_error = ret; 850 header->pnfs_error = ret;
704 goto out; 851 goto out;
705 } 852 }
853
854 /* Expand to full page write */
855 pg_offset = 0;
856 pg_len = PAGE_CACHE_SIZE;
857 } else if ((pg_offset & (SECTOR_SIZE - 1)) ||
858 (pg_len & (SECTOR_SIZE - 1))){
859 /* ahh, nasty case. We have to do sync full sector
860 * read-modify-write cycles.
861 */
862 unsigned int saved_offset = pg_offset;
863 ret = bl_read_partial_page_sync(pages[i], be, pg_offset,
864 pg_len, false);
865 pg_offset = round_down(pg_offset, SECTOR_SIZE);
866 pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE)
867 - pg_offset;
706 } 868 }
707 bio = bl_add_page_to_bio(bio, wdata->pages.npages - i, WRITE, 869
870
871 bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
708 isect, pages[i], be, 872 isect, pages[i], be,
709 bl_end_io_write, par); 873 bl_end_io_write, par,
874 pg_offset, pg_len);
710 if (IS_ERR(bio)) { 875 if (IS_ERR(bio)) {
711 header->pnfs_error = PTR_ERR(bio); 876 header->pnfs_error = PTR_ERR(bio);
712 bio = NULL; 877 bio = NULL;
713 goto out; 878 goto out;
714 } 879 }
880 offset += saved_len;
881 count -= saved_len;
715 isect += PAGE_CACHE_SECTORS; 882 isect += PAGE_CACHE_SECTORS;
716 last_isect = isect; 883 last_isect = isect;
717 extent_length -= PAGE_CACHE_SECTORS; 884 extent_length -= PAGE_CACHE_SECTORS;
@@ -729,17 +896,16 @@ next_page:
729 } 896 }
730 897
731write_done: 898write_done:
732 wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset); 899 wdata->res.count = wdata->args.count;
733 if (count < wdata->res.count) {
734 wdata->res.count = count;
735 }
736out: 900out:
737 bl_put_extent(be); 901 bl_put_extent(be);
902 bl_put_extent(cow_read);
738 bl_submit_bio(WRITE, bio); 903 bl_submit_bio(WRITE, bio);
739 put_parallel(par); 904 put_parallel(par);
740 return PNFS_ATTEMPTED; 905 return PNFS_ATTEMPTED;
741out_mds: 906out_mds:
742 bl_put_extent(be); 907 bl_put_extent(be);
908 bl_put_extent(cow_read);
743 kfree(par); 909 kfree(par);
744 return PNFS_NOT_ATTEMPTED; 910 return PNFS_NOT_ATTEMPTED;
745} 911}
@@ -874,7 +1040,7 @@ static void free_blk_mountid(struct block_mount_id *mid)
874 } 1040 }
875} 1041}
876 1042
877/* This is mostly copied from the filelayout's get_device_info function. 1043/* This is mostly copied from the filelayout_get_device_info function.
878 * It seems much of this should be at the generic pnfs level. 1044 * It seems much of this should be at the generic pnfs level.
879 */ 1045 */
880static struct pnfs_block_dev * 1046static struct pnfs_block_dev *
@@ -1011,33 +1177,95 @@ bl_clear_layoutdriver(struct nfs_server *server)
1011 return 0; 1177 return 0;
1012} 1178}
1013 1179
1180static bool
1181is_aligned_req(struct nfs_page *req, unsigned int alignment)
1182{
1183 return IS_ALIGNED(req->wb_offset, alignment) &&
1184 IS_ALIGNED(req->wb_bytes, alignment);
1185}
1186
1014static void 1187static void
1015bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 1188bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1016{ 1189{
1017 if (!bl_check_alignment(req->wb_offset, req->wb_bytes, PAGE_CACHE_MASK)) 1190 if (pgio->pg_dreq != NULL &&
1191 !is_aligned_req(req, SECTOR_SIZE))
1018 nfs_pageio_reset_read_mds(pgio); 1192 nfs_pageio_reset_read_mds(pgio);
1019 else 1193 else
1020 pnfs_generic_pg_init_read(pgio, req); 1194 pnfs_generic_pg_init_read(pgio, req);
1021} 1195}
1022 1196
1197static bool
1198bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1199 struct nfs_page *req)
1200{
1201 if (pgio->pg_dreq != NULL &&
1202 !is_aligned_req(req, SECTOR_SIZE))
1203 return false;
1204
1205 return pnfs_generic_pg_test(pgio, prev, req);
1206}
1207
1208/*
1209 * Return the number of contiguous bytes for a given inode
1210 * starting at page frame idx.
1211 */
1212static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
1213{
1214 struct address_space *mapping = inode->i_mapping;
1215 pgoff_t end;
1216
1217 /* Optimize common case that writes from 0 to end of file */
1218 end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
1219 if (end != NFS_I(inode)->npages) {
1220 rcu_read_lock();
1221 end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX);
1222 rcu_read_unlock();
1223 }
1224
1225 if (!end)
1226 return i_size_read(inode) - (idx << PAGE_CACHE_SHIFT);
1227 else
1228 return (end - idx) << PAGE_CACHE_SHIFT;
1229}
1230
1023static void 1231static void
1024bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 1232bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1025{ 1233{
1026 if (!bl_check_alignment(req->wb_offset, req->wb_bytes, PAGE_CACHE_MASK)) 1234 if (pgio->pg_dreq != NULL &&
1235 !is_aligned_req(req, PAGE_CACHE_SIZE)) {
1027 nfs_pageio_reset_write_mds(pgio); 1236 nfs_pageio_reset_write_mds(pgio);
1028 else 1237 } else {
1029 pnfs_generic_pg_init_write(pgio, req); 1238 u64 wb_size;
1239 if (pgio->pg_dreq == NULL)
1240 wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
1241 req->wb_index);
1242 else
1243 wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
1244
1245 pnfs_generic_pg_init_write(pgio, req, wb_size);
1246 }
1247}
1248
1249static bool
1250bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1251 struct nfs_page *req)
1252{
1253 if (pgio->pg_dreq != NULL &&
1254 !is_aligned_req(req, PAGE_CACHE_SIZE))
1255 return false;
1256
1257 return pnfs_generic_pg_test(pgio, prev, req);
1030} 1258}
1031 1259
1032static const struct nfs_pageio_ops bl_pg_read_ops = { 1260static const struct nfs_pageio_ops bl_pg_read_ops = {
1033 .pg_init = bl_pg_init_read, 1261 .pg_init = bl_pg_init_read,
1034 .pg_test = pnfs_generic_pg_test, 1262 .pg_test = bl_pg_test_read,
1035 .pg_doio = pnfs_generic_pg_readpages, 1263 .pg_doio = pnfs_generic_pg_readpages,
1036}; 1264};
1037 1265
1038static const struct nfs_pageio_ops bl_pg_write_ops = { 1266static const struct nfs_pageio_ops bl_pg_write_ops = {
1039 .pg_init = bl_pg_init_write, 1267 .pg_init = bl_pg_init_write,
1040 .pg_test = pnfs_generic_pg_test, 1268 .pg_test = bl_pg_test_write,
1041 .pg_doio = pnfs_generic_pg_writepages, 1269 .pg_doio = pnfs_generic_pg_writepages,
1042}; 1270};
1043 1271
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 03350690118e..f4891bde8851 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -41,6 +41,7 @@
41 41
42#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT) 42#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
43#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) 43#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
44#define SECTOR_SIZE (1 << SECTOR_SHIFT)
44 45
45struct block_mount_id { 46struct block_mount_id {
46 spinlock_t bm_lock; /* protects list */ 47 spinlock_t bm_lock; /* protects list */
@@ -172,7 +173,6 @@ struct bl_msg_hdr {
172/* blocklayoutdev.c */ 173/* blocklayoutdev.c */
173ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); 174ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
174void bl_pipe_destroy_msg(struct rpc_pipe_msg *); 175void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
175struct block_device *nfs4_blkdev_get(dev_t dev);
176int nfs4_blkdev_put(struct block_device *bdev); 176int nfs4_blkdev_put(struct block_device *bdev);
177struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, 177struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
178 struct pnfs_device *dev); 178 struct pnfs_device *dev);
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index c96554245ccf..a86c5bdad9e3 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -53,22 +53,6 @@ static int decode_sector_number(__be32 **rp, sector_t *sp)
53 return 0; 53 return 0;
54} 54}
55 55
56/* Open a block_device by device number. */
57struct block_device *nfs4_blkdev_get(dev_t dev)
58{
59 struct block_device *bd;
60
61 dprintk("%s enter\n", __func__);
62 bd = blkdev_get_by_dev(dev, FMODE_READ, NULL);
63 if (IS_ERR(bd))
64 goto fail;
65 return bd;
66fail:
67 dprintk("%s failed to open device : %ld\n",
68 __func__, PTR_ERR(bd));
69 return NULL;
70}
71
72/* 56/*
73 * Release the block device 57 * Release the block device
74 */ 58 */
@@ -172,11 +156,12 @@ nfs4_blk_decode_device(struct nfs_server *server,
172 goto out; 156 goto out;
173 } 157 }
174 158
175 bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); 159 bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
160 FMODE_READ, NULL);
176 if (IS_ERR(bd)) { 161 if (IS_ERR(bd)) {
177 rc = PTR_ERR(bd); 162 dprintk("%s failed to open device : %ld\n", __func__,
178 dprintk("%s failed to open device : %d\n", __func__, rc); 163 PTR_ERR(bd));
179 rv = ERR_PTR(rc); 164 rv = ERR_CAST(bd);
180 goto out; 165 goto out;
181 } 166 }
182 167
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 1f9a6032796b..9c3e117c3ed1 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -683,8 +683,7 @@ encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
683 p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT); 683 p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
684 p = xdr_encode_hyper(p, 0LL); 684 p = xdr_encode_hyper(p, 0LL);
685 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); 685 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
686 list_del(&lce->bse_node); 686 list_move_tail(&lce->bse_node, &bl->bl_committing);
687 list_add_tail(&lce->bse_node, &bl->bl_committing);
688 bl->bl_count--; 687 bl->bl_count--;
689 count++; 688 count++;
690 } 689 }
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 4c8459e5bdee..9a521fb39869 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -12,6 +12,7 @@
12#include <linux/sunrpc/svc.h> 12#include <linux/sunrpc/svc.h>
13#include <linux/sunrpc/svcsock.h> 13#include <linux/sunrpc/svcsock.h>
14#include <linux/nfs_fs.h> 14#include <linux/nfs_fs.h>
15#include <linux/errno.h>
15#include <linux/mutex.h> 16#include <linux/mutex.h>
16#include <linux/freezer.h> 17#include <linux/freezer.h>
17#include <linux/kthread.h> 18#include <linux/kthread.h>
@@ -23,6 +24,7 @@
23#include "nfs4_fs.h" 24#include "nfs4_fs.h"
24#include "callback.h" 25#include "callback.h"
25#include "internal.h" 26#include "internal.h"
27#include "netns.h"
26 28
27#define NFSDBG_FACILITY NFSDBG_CALLBACK 29#define NFSDBG_FACILITY NFSDBG_CALLBACK
28 30
@@ -37,7 +39,32 @@ static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1];
37static DEFINE_MUTEX(nfs_callback_mutex); 39static DEFINE_MUTEX(nfs_callback_mutex);
38static struct svc_program nfs4_callback_program; 40static struct svc_program nfs4_callback_program;
39 41
40unsigned short nfs_callback_tcpport6; 42static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net)
43{
44 int ret;
45 struct nfs_net *nn = net_generic(net, nfs_net_id);
46
47 ret = svc_create_xprt(serv, "tcp", net, PF_INET,
48 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
49 if (ret <= 0)
50 goto out_err;
51 nn->nfs_callback_tcpport = ret;
52 dprintk("NFS: Callback listener port = %u (af %u, net %p)\n",
53 nn->nfs_callback_tcpport, PF_INET, net);
54
55 ret = svc_create_xprt(serv, "tcp", net, PF_INET6,
56 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
57 if (ret > 0) {
58 nn->nfs_callback_tcpport6 = ret;
59 dprintk("NFS: Callback listener port = %u (af %u, net %p)\n",
60 nn->nfs_callback_tcpport6, PF_INET6, net);
61 } else if (ret != -EAFNOSUPPORT)
62 goto out_err;
63 return 0;
64
65out_err:
66 return (ret) ? ret : -ENOMEM;
67}
41 68
42/* 69/*
43 * This is the NFSv4 callback kernel thread. 70 * This is the NFSv4 callback kernel thread.
@@ -45,7 +72,7 @@ unsigned short nfs_callback_tcpport6;
45static int 72static int
46nfs4_callback_svc(void *vrqstp) 73nfs4_callback_svc(void *vrqstp)
47{ 74{
48 int err, preverr = 0; 75 int err;
49 struct svc_rqst *rqstp = vrqstp; 76 struct svc_rqst *rqstp = vrqstp;
50 77
51 set_freezable(); 78 set_freezable();
@@ -55,20 +82,8 @@ nfs4_callback_svc(void *vrqstp)
55 * Listen for a request on the socket 82 * Listen for a request on the socket
56 */ 83 */
57 err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT); 84 err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT);
58 if (err == -EAGAIN || err == -EINTR) { 85 if (err == -EAGAIN || err == -EINTR)
59 preverr = err;
60 continue;
61 }
62 if (err < 0) {
63 if (err != preverr) {
64 printk(KERN_WARNING "NFS: %s: unexpected error "
65 "from svc_recv (%d)\n", __func__, err);
66 preverr = err;
67 }
68 schedule_timeout_uninterruptible(HZ);
69 continue; 86 continue;
70 }
71 preverr = err;
72 svc_process(rqstp); 87 svc_process(rqstp);
73 } 88 }
74 return 0; 89 return 0;
@@ -78,38 +93,23 @@ nfs4_callback_svc(void *vrqstp)
78 * Prepare to bring up the NFSv4 callback service 93 * Prepare to bring up the NFSv4 callback service
79 */ 94 */
80static struct svc_rqst * 95static struct svc_rqst *
81nfs4_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) 96nfs4_callback_up(struct svc_serv *serv)
82{ 97{
83 int ret;
84
85 ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET,
86 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
87 if (ret <= 0)
88 goto out_err;
89 nfs_callback_tcpport = ret;
90 dprintk("NFS: Callback listener port = %u (af %u)\n",
91 nfs_callback_tcpport, PF_INET);
92
93 ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6,
94 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
95 if (ret > 0) {
96 nfs_callback_tcpport6 = ret;
97 dprintk("NFS: Callback listener port = %u (af %u)\n",
98 nfs_callback_tcpport6, PF_INET6);
99 } else if (ret == -EAFNOSUPPORT)
100 ret = 0;
101 else
102 goto out_err;
103
104 return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); 98 return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
105
106out_err:
107 if (ret == 0)
108 ret = -ENOMEM;
109 return ERR_PTR(ret);
110} 99}
111 100
112#if defined(CONFIG_NFS_V4_1) 101#if defined(CONFIG_NFS_V4_1)
102static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net)
103{
104 /*
105 * Create an svc_sock for the back channel service that shares the
106 * fore channel connection.
107 * Returns the input port (0) and sets the svc_serv bc_xprt on success
108 */
109 return svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0,
110 SVC_SOCK_ANONYMOUS);
111}
112
113/* 113/*
114 * The callback service for NFSv4.1 callbacks 114 * The callback service for NFSv4.1 callbacks
115 */ 115 */
@@ -149,28 +149,9 @@ nfs41_callback_svc(void *vrqstp)
149 * Bring up the NFSv4.1 callback service 149 * Bring up the NFSv4.1 callback service
150 */ 150 */
151static struct svc_rqst * 151static struct svc_rqst *
152nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) 152nfs41_callback_up(struct svc_serv *serv)
153{ 153{
154 struct svc_rqst *rqstp; 154 struct svc_rqst *rqstp;
155 int ret;
156
157 /*
158 * Create an svc_sock for the back channel service that shares the
159 * fore channel connection.
160 * Returns the input port (0) and sets the svc_serv bc_xprt on success
161 */
162 ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0,
163 SVC_SOCK_ANONYMOUS);
164 if (ret < 0) {
165 rqstp = ERR_PTR(ret);
166 goto out;
167 }
168
169 /*
170 * Save the svc_serv in the transport so that it can
171 * be referenced when the session backchannel is initialized
172 */
173 xprt->bc_serv = serv;
174 155
175 INIT_LIST_HEAD(&serv->sv_cb_list); 156 INIT_LIST_HEAD(&serv->sv_cb_list);
176 spin_lock_init(&serv->sv_cb_lock); 157 spin_lock_init(&serv->sv_cb_lock);
@@ -180,90 +161,74 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
180 svc_xprt_put(serv->sv_bc_xprt); 161 svc_xprt_put(serv->sv_bc_xprt);
181 serv->sv_bc_xprt = NULL; 162 serv->sv_bc_xprt = NULL;
182 } 163 }
183out:
184 dprintk("--> %s return %ld\n", __func__, 164 dprintk("--> %s return %ld\n", __func__,
185 IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0); 165 IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0);
186 return rqstp; 166 return rqstp;
187} 167}
188 168
189static inline int nfs_minorversion_callback_svc_setup(u32 minorversion, 169static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,
190 struct svc_serv *serv, struct rpc_xprt *xprt,
191 struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) 170 struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
192{ 171{
193 if (minorversion) { 172 *rqstpp = nfs41_callback_up(serv);
194 *rqstpp = nfs41_callback_up(serv, xprt); 173 *callback_svc = nfs41_callback_svc;
195 *callback_svc = nfs41_callback_svc;
196 }
197 return minorversion;
198} 174}
199 175
200static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, 176static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
201 struct nfs_callback_data *cb_info) 177 struct svc_serv *serv)
202{ 178{
203 if (minorversion) 179 if (minorversion)
204 xprt->bc_serv = cb_info->serv; 180 /*
181 * Save the svc_serv in the transport so that it can
182 * be referenced when the session backchannel is initialized
183 */
184 xprt->bc_serv = serv;
205} 185}
206#else 186#else
207static inline int nfs_minorversion_callback_svc_setup(u32 minorversion, 187static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net)
208 struct svc_serv *serv, struct rpc_xprt *xprt,
209 struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
210{ 188{
211 return 0; 189 return 0;
212} 190}
213 191
192static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,
193 struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
194{
195 *rqstpp = ERR_PTR(-ENOTSUPP);
196 *callback_svc = ERR_PTR(-ENOTSUPP);
197}
198
214static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, 199static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
215 struct nfs_callback_data *cb_info) 200 struct svc_serv *serv)
216{ 201{
217} 202}
218#endif /* CONFIG_NFS_V4_1 */ 203#endif /* CONFIG_NFS_V4_1 */
219 204
220/* 205static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
221 * Bring up the callback thread if it is not already up. 206 struct svc_serv *serv)
222 */
223int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
224{ 207{
225 struct svc_serv *serv = NULL;
226 struct svc_rqst *rqstp; 208 struct svc_rqst *rqstp;
227 int (*callback_svc)(void *vrqstp); 209 int (*callback_svc)(void *vrqstp);
228 struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; 210 struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
229 char svc_name[12]; 211 char svc_name[12];
230 int ret = 0; 212 int ret;
231 int minorversion_setup;
232 struct net *net = &init_net;
233 213
234 mutex_lock(&nfs_callback_mutex); 214 nfs_callback_bc_serv(minorversion, xprt, serv);
235 if (cb_info->users++ || cb_info->task != NULL) {
236 nfs_callback_bc_serv(minorversion, xprt, cb_info);
237 goto out;
238 }
239 serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
240 if (!serv) {
241 ret = -ENOMEM;
242 goto out_err;
243 }
244 /* As there is only one thread we need to over-ride the
245 * default maximum of 80 connections
246 */
247 serv->sv_maxconn = 1024;
248 215
249 ret = svc_bind(serv, net); 216 if (cb_info->task)
250 if (ret < 0) { 217 return 0;
251 printk(KERN_WARNING "NFS: bind callback service failed\n");
252 goto out_err;
253 }
254 218
255 minorversion_setup = nfs_minorversion_callback_svc_setup(minorversion, 219 switch (minorversion) {
256 serv, xprt, &rqstp, &callback_svc); 220 case 0:
257 if (!minorversion_setup) {
258 /* v4.0 callback setup */ 221 /* v4.0 callback setup */
259 rqstp = nfs4_callback_up(serv, xprt); 222 rqstp = nfs4_callback_up(serv);
260 callback_svc = nfs4_callback_svc; 223 callback_svc = nfs4_callback_svc;
224 break;
225 default:
226 nfs_minorversion_callback_svc_setup(serv,
227 &rqstp, &callback_svc);
261 } 228 }
262 229
263 if (IS_ERR(rqstp)) { 230 if (IS_ERR(rqstp))
264 ret = PTR_ERR(rqstp); 231 return PTR_ERR(rqstp);
265 goto out_err;
266 }
267 232
268 svc_sock_update_bufs(serv); 233 svc_sock_update_bufs(serv);
269 234
@@ -276,41 +241,165 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
276 svc_exit_thread(cb_info->rqst); 241 svc_exit_thread(cb_info->rqst);
277 cb_info->rqst = NULL; 242 cb_info->rqst = NULL;
278 cb_info->task = NULL; 243 cb_info->task = NULL;
279 goto out_err; 244 return PTR_ERR(cb_info->task);
245 }
246 dprintk("nfs_callback_up: service started\n");
247 return 0;
248}
249
250static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struct net *net)
251{
252 struct nfs_net *nn = net_generic(net, nfs_net_id);
253
254 if (--nn->cb_users[minorversion])
255 return;
256
257 dprintk("NFS: destroy per-net callback data; net=%p\n", net);
258 svc_shutdown_net(serv, net);
259}
260
261static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct net *net)
262{
263 struct nfs_net *nn = net_generic(net, nfs_net_id);
264 int ret;
265
266 if (nn->cb_users[minorversion]++)
267 return 0;
268
269 dprintk("NFS: create per-net callback data; net=%p\n", net);
270
271 ret = svc_bind(serv, net);
272 if (ret < 0) {
273 printk(KERN_WARNING "NFS: bind callback service failed\n");
274 goto err_bind;
275 }
276
277 switch (minorversion) {
278 case 0:
279 ret = nfs4_callback_up_net(serv, net);
280 break;
281 case 1:
282 ret = nfs41_callback_up_net(serv, net);
283 break;
284 default:
285 printk(KERN_ERR "NFS: unknown callback version: %d\n",
286 minorversion);
287 ret = -EINVAL;
288 break;
289 }
290
291 if (ret < 0) {
292 printk(KERN_ERR "NFS: callback service start failed\n");
293 goto err_socks;
294 }
295 return 0;
296
297err_socks:
298 svc_rpcb_cleanup(serv, net);
299err_bind:
300 dprintk("NFS: Couldn't create callback socket: err = %d; "
301 "net = %p\n", ret, net);
302 return ret;
303}
304
305static struct svc_serv *nfs_callback_create_svc(int minorversion)
306{
307 struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
308 struct svc_serv *serv;
309
310 /*
311 * Check whether we're already up and running.
312 */
313 if (cb_info->task) {
314 /*
315 * Note: increase service usage, because later in case of error
316 * svc_destroy() will be called.
317 */
318 svc_get(cb_info->serv);
319 return cb_info->serv;
320 }
321
322 /*
323 * Sanity check: if there's no task,
324 * we should be the first user ...
325 */
326 if (cb_info->users)
327 printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
328 cb_info->users);
329
330 serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
331 if (!serv) {
332 printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
333 return ERR_PTR(-ENOMEM);
334 }
335 /* As there is only one thread we need to over-ride the
336 * default maximum of 80 connections
337 */
338 serv->sv_maxconn = 1024;
339 dprintk("nfs_callback_create_svc: service created\n");
340 return serv;
341}
342
343/*
344 * Bring up the callback thread if it is not already up.
345 */
346int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
347{
348 struct svc_serv *serv;
349 struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
350 int ret;
351 struct net *net = xprt->xprt_net;
352
353 mutex_lock(&nfs_callback_mutex);
354
355 serv = nfs_callback_create_svc(minorversion);
356 if (IS_ERR(serv)) {
357 ret = PTR_ERR(serv);
358 goto err_create;
280 } 359 }
281out: 360
361 ret = nfs_callback_up_net(minorversion, serv, net);
362 if (ret < 0)
363 goto err_net;
364
365 ret = nfs_callback_start_svc(minorversion, xprt, serv);
366 if (ret < 0)
367 goto err_start;
368
369 cb_info->users++;
282 /* 370 /*
283 * svc_create creates the svc_serv with sv_nrthreads == 1, and then 371 * svc_create creates the svc_serv with sv_nrthreads == 1, and then
284 * svc_prepare_thread increments that. So we need to call svc_destroy 372 * svc_prepare_thread increments that. So we need to call svc_destroy
285 * on both success and failure so that the refcount is 1 when the 373 * on both success and failure so that the refcount is 1 when the
286 * thread exits. 374 * thread exits.
287 */ 375 */
288 if (serv) 376err_net:
289 svc_destroy(serv); 377 svc_destroy(serv);
378err_create:
290 mutex_unlock(&nfs_callback_mutex); 379 mutex_unlock(&nfs_callback_mutex);
291 return ret; 380 return ret;
292out_err: 381
293 dprintk("NFS: Couldn't create callback socket or server thread; " 382err_start:
294 "err = %d\n", ret); 383 nfs_callback_down_net(minorversion, serv, net);
295 cb_info->users--; 384 dprintk("NFS: Couldn't create server thread; err = %d\n", ret);
296 if (serv) 385 goto err_net;
297 svc_shutdown_net(serv, net);
298 goto out;
299} 386}
300 387
301/* 388/*
302 * Kill the callback thread if it's no longer being used. 389 * Kill the callback thread if it's no longer being used.
303 */ 390 */
304void nfs_callback_down(int minorversion) 391void nfs_callback_down(int minorversion, struct net *net)
305{ 392{
306 struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; 393 struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
307 394
308 mutex_lock(&nfs_callback_mutex); 395 mutex_lock(&nfs_callback_mutex);
396 nfs_callback_down_net(minorversion, cb_info->serv, net);
309 cb_info->users--; 397 cb_info->users--;
310 if (cb_info->users == 0 && cb_info->task != NULL) { 398 if (cb_info->users == 0 && cb_info->task != NULL) {
311 kthread_stop(cb_info->task); 399 kthread_stop(cb_info->task);
312 svc_shutdown_net(cb_info->serv, &init_net); 400 dprintk("nfs_callback_down: service stopped\n");
313 svc_exit_thread(cb_info->rqst); 401 svc_exit_thread(cb_info->rqst);
402 dprintk("nfs_callback_down: service destroyed\n");
314 cb_info->serv = NULL; 403 cb_info->serv = NULL;
315 cb_info->rqst = NULL; 404 cb_info->rqst = NULL;
316 cb_info->task = NULL; 405 cb_info->task = NULL;
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index b44d7b128b71..4251c2ae06ad 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -194,7 +194,7 @@ extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
194 struct cb_process_state *cps); 194 struct cb_process_state *cps);
195#if IS_ENABLED(CONFIG_NFS_V4) 195#if IS_ENABLED(CONFIG_NFS_V4)
196extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); 196extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
197extern void nfs_callback_down(int minorversion); 197extern void nfs_callback_down(int minorversion, struct net *net);
198extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, 198extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
199 const nfs4_stateid *stateid); 199 const nfs4_stateid *stateid);
200extern int nfs4_set_callback_sessionid(struct nfs_client *clp); 200extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
@@ -209,6 +209,5 @@ extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
209 209
210extern unsigned int nfs_callback_set_tcpport; 210extern unsigned int nfs_callback_set_tcpport;
211extern unsigned short nfs_callback_tcpport; 211extern unsigned short nfs_callback_tcpport;
212extern unsigned short nfs_callback_tcpport6;
213 212
214#endif /* __LINUX_FS_NFS_CALLBACK_H */ 213#endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 1b5d809a105e..76b4a7a3e559 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -122,7 +122,15 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
122 ino = igrab(lo->plh_inode); 122 ino = igrab(lo->plh_inode);
123 if (!ino) 123 if (!ino)
124 continue; 124 continue;
125 get_layout_hdr(lo); 125 spin_lock(&ino->i_lock);
126 /* Is this layout in the process of being freed? */
127 if (NFS_I(ino)->layout != lo) {
128 spin_unlock(&ino->i_lock);
129 iput(ino);
130 continue;
131 }
132 pnfs_get_layout_hdr(lo);
133 spin_unlock(&ino->i_lock);
126 return lo; 134 return lo;
127 } 135 }
128 } 136 }
@@ -158,7 +166,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
158 ino = lo->plh_inode; 166 ino = lo->plh_inode;
159 spin_lock(&ino->i_lock); 167 spin_lock(&ino->i_lock);
160 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 168 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
161 mark_matching_lsegs_invalid(lo, &free_me_list, 169 pnfs_mark_matching_lsegs_invalid(lo, &free_me_list,
162 &args->cbl_range)) 170 &args->cbl_range))
163 rv = NFS4ERR_DELAY; 171 rv = NFS4ERR_DELAY;
164 else 172 else
@@ -166,7 +174,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
166 pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); 174 pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
167 spin_unlock(&ino->i_lock); 175 spin_unlock(&ino->i_lock);
168 pnfs_free_lseg_list(&free_me_list); 176 pnfs_free_lseg_list(&free_me_list);
169 put_layout_hdr(lo); 177 pnfs_put_layout_hdr(lo);
170 iput(ino); 178 iput(ino);
171 return rv; 179 return rv;
172} 180}
@@ -196,9 +204,18 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
196 continue; 204 continue;
197 205
198 list_for_each_entry(lo, &server->layouts, plh_layouts) { 206 list_for_each_entry(lo, &server->layouts, plh_layouts) {
199 if (!igrab(lo->plh_inode)) 207 ino = igrab(lo->plh_inode);
208 if (ino)
209 continue;
210 spin_lock(&ino->i_lock);
211 /* Is this layout in the process of being freed? */
212 if (NFS_I(ino)->layout != lo) {
213 spin_unlock(&ino->i_lock);
214 iput(ino);
200 continue; 215 continue;
201 get_layout_hdr(lo); 216 }
217 pnfs_get_layout_hdr(lo);
218 spin_unlock(&ino->i_lock);
202 BUG_ON(!list_empty(&lo->plh_bulk_recall)); 219 BUG_ON(!list_empty(&lo->plh_bulk_recall));
203 list_add(&lo->plh_bulk_recall, &recall_list); 220 list_add(&lo->plh_bulk_recall, &recall_list);
204 } 221 }
@@ -211,12 +228,12 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
211 ino = lo->plh_inode; 228 ino = lo->plh_inode;
212 spin_lock(&ino->i_lock); 229 spin_lock(&ino->i_lock);
213 set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); 230 set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
214 if (mark_matching_lsegs_invalid(lo, &free_me_list, &range)) 231 if (pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, &range))
215 rv = NFS4ERR_DELAY; 232 rv = NFS4ERR_DELAY;
216 list_del_init(&lo->plh_bulk_recall); 233 list_del_init(&lo->plh_bulk_recall);
217 spin_unlock(&ino->i_lock); 234 spin_unlock(&ino->i_lock);
218 pnfs_free_lseg_list(&free_me_list); 235 pnfs_free_lseg_list(&free_me_list);
219 put_layout_hdr(lo); 236 pnfs_put_layout_hdr(lo);
220 iput(ino); 237 iput(ino);
221 } 238 }
222 return rv; 239 return rv;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 99694442b93f..8b39a42ac35e 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -93,10 +93,10 @@ static struct nfs_subversion *find_nfs_version(unsigned int version)
93 spin_unlock(&nfs_version_lock); 93 spin_unlock(&nfs_version_lock);
94 return nfs; 94 return nfs;
95 } 95 }
96 }; 96 }
97 97
98 spin_unlock(&nfs_version_lock); 98 spin_unlock(&nfs_version_lock);
99 return ERR_PTR(-EPROTONOSUPPORT);; 99 return ERR_PTR(-EPROTONOSUPPORT);
100} 100}
101 101
102struct nfs_subversion *get_nfs_version(unsigned int version) 102struct nfs_subversion *get_nfs_version(unsigned int version)
@@ -498,7 +498,8 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
498 return nfs_found_client(cl_init, clp); 498 return nfs_found_client(cl_init, clp);
499 } 499 }
500 if (new) { 500 if (new) {
501 list_add(&new->cl_share_link, &nn->nfs_client_list); 501 list_add_tail(&new->cl_share_link,
502 &nn->nfs_client_list);
502 spin_unlock(&nn->nfs_client_lock); 503 spin_unlock(&nn->nfs_client_lock);
503 new->cl_flags = cl_init->init_flags; 504 new->cl_flags = cl_init->init_flags;
504 return rpc_ops->init_client(new, timeparms, ip_addr, 505 return rpc_ops->init_client(new, timeparms, ip_addr,
@@ -668,7 +669,8 @@ int nfs_init_server_rpcclient(struct nfs_server *server,
668{ 669{
669 struct nfs_client *clp = server->nfs_client; 670 struct nfs_client *clp = server->nfs_client;
670 671
671 server->client = rpc_clone_client(clp->cl_rpcclient); 672 server->client = rpc_clone_client_set_auth(clp->cl_rpcclient,
673 pseudoflavour);
672 if (IS_ERR(server->client)) { 674 if (IS_ERR(server->client)) {
673 dprintk("%s: couldn't create rpc_client!\n", __func__); 675 dprintk("%s: couldn't create rpc_client!\n", __func__);
674 return PTR_ERR(server->client); 676 return PTR_ERR(server->client);
@@ -678,16 +680,6 @@ int nfs_init_server_rpcclient(struct nfs_server *server,
678 timeo, 680 timeo,
679 sizeof(server->client->cl_timeout_default)); 681 sizeof(server->client->cl_timeout_default));
680 server->client->cl_timeout = &server->client->cl_timeout_default; 682 server->client->cl_timeout = &server->client->cl_timeout_default;
681
682 if (pseudoflavour != clp->cl_rpcclient->cl_auth->au_flavor) {
683 struct rpc_auth *auth;
684
685 auth = rpcauth_create(pseudoflavour, server->client);
686 if (IS_ERR(auth)) {
687 dprintk("%s: couldn't create credcache!\n", __func__);
688 return PTR_ERR(auth);
689 }
690 }
691 server->client->cl_softrtry = 0; 683 server->client->cl_softrtry = 0;
692 if (server->flags & NFS_MOUNT_SOFT) 684 if (server->flags & NFS_MOUNT_SOFT)
693 server->client->cl_softrtry = 1; 685 server->client->cl_softrtry = 1;
@@ -761,6 +753,8 @@ static int nfs_init_server(struct nfs_server *server,
761 data->timeo, data->retrans); 753 data->timeo, data->retrans);
762 if (data->flags & NFS_MOUNT_NORESVPORT) 754 if (data->flags & NFS_MOUNT_NORESVPORT)
763 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); 755 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
756 if (server->options & NFS_OPTION_MIGRATION)
757 set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
764 758
765 /* Allocate or find a client reference we can use */ 759 /* Allocate or find a client reference we can use */
766 clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX); 760 clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX);
@@ -855,7 +849,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
855 if (server->wsize > NFS_MAX_FILE_IO_SIZE) 849 if (server->wsize > NFS_MAX_FILE_IO_SIZE)
856 server->wsize = NFS_MAX_FILE_IO_SIZE; 850 server->wsize = NFS_MAX_FILE_IO_SIZE;
857 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 851 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
858 server->pnfs_blksize = fsinfo->blksize;
859 852
860 server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); 853 server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
861 854
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 627f108ede23..ce8cb926526b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2072,7 +2072,7 @@ found:
2072 nfs_access_free_entry(entry); 2072 nfs_access_free_entry(entry);
2073} 2073}
2074 2074
2075static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) 2075void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
2076{ 2076{
2077 struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL); 2077 struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL);
2078 if (cache == NULL) 2078 if (cache == NULL)
@@ -2098,6 +2098,20 @@ static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *s
2098 spin_unlock(&nfs_access_lru_lock); 2098 spin_unlock(&nfs_access_lru_lock);
2099 } 2099 }
2100} 2100}
2101EXPORT_SYMBOL_GPL(nfs_access_add_cache);
2102
2103void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result)
2104{
2105 entry->mask = 0;
2106 if (access_result & NFS4_ACCESS_READ)
2107 entry->mask |= MAY_READ;
2108 if (access_result &
2109 (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE))
2110 entry->mask |= MAY_WRITE;
2111 if (access_result & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
2112 entry->mask |= MAY_EXEC;
2113}
2114EXPORT_SYMBOL_GPL(nfs_access_set_mask);
2101 2115
2102static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) 2116static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
2103{ 2117{
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 1ba385b7c90d..cae26cbd59ee 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -46,6 +46,7 @@
46#include <linux/kref.h> 46#include <linux/kref.h>
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/task_io_accounting_ops.h> 48#include <linux/task_io_accounting_ops.h>
49#include <linux/module.h>
49 50
50#include <linux/nfs_fs.h> 51#include <linux/nfs_fs.h>
51#include <linux/nfs_page.h> 52#include <linux/nfs_page.h>
@@ -78,6 +79,7 @@ struct nfs_direct_req {
78 atomic_t io_count; /* i/os we're waiting for */ 79 atomic_t io_count; /* i/os we're waiting for */
79 spinlock_t lock; /* protect completion state */ 80 spinlock_t lock; /* protect completion state */
80 ssize_t count, /* bytes actually processed */ 81 ssize_t count, /* bytes actually processed */
82 bytes_left, /* bytes left to be sent */
81 error; /* any reported error */ 83 error; /* any reported error */
82 struct completion completion; /* wait for i/o completion */ 84 struct completion completion; /* wait for i/o completion */
83 85
@@ -190,6 +192,12 @@ static void nfs_direct_req_release(struct nfs_direct_req *dreq)
190 kref_put(&dreq->kref, nfs_direct_req_free); 192 kref_put(&dreq->kref, nfs_direct_req_free);
191} 193}
192 194
195ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq)
196{
197 return dreq->bytes_left;
198}
199EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);
200
193/* 201/*
194 * Collects and returns the final error value/byte-count. 202 * Collects and returns the final error value/byte-count.
195 */ 203 */
@@ -390,6 +398,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
390 user_addr += req_len; 398 user_addr += req_len;
391 pos += req_len; 399 pos += req_len;
392 count -= req_len; 400 count -= req_len;
401 dreq->bytes_left -= req_len;
393 } 402 }
394 /* The nfs_page now hold references to these pages */ 403 /* The nfs_page now hold references to these pages */
395 nfs_direct_release_pages(pagevec, npages); 404 nfs_direct_release_pages(pagevec, npages);
@@ -450,23 +459,28 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
450 ssize_t result = -ENOMEM; 459 ssize_t result = -ENOMEM;
451 struct inode *inode = iocb->ki_filp->f_mapping->host; 460 struct inode *inode = iocb->ki_filp->f_mapping->host;
452 struct nfs_direct_req *dreq; 461 struct nfs_direct_req *dreq;
462 struct nfs_lock_context *l_ctx;
453 463
454 dreq = nfs_direct_req_alloc(); 464 dreq = nfs_direct_req_alloc();
455 if (dreq == NULL) 465 if (dreq == NULL)
456 goto out; 466 goto out;
457 467
458 dreq->inode = inode; 468 dreq->inode = inode;
469 dreq->bytes_left = iov_length(iov, nr_segs);
459 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 470 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
460 dreq->l_ctx = nfs_get_lock_context(dreq->ctx); 471 l_ctx = nfs_get_lock_context(dreq->ctx);
461 if (dreq->l_ctx == NULL) 472 if (IS_ERR(l_ctx)) {
473 result = PTR_ERR(l_ctx);
462 goto out_release; 474 goto out_release;
475 }
476 dreq->l_ctx = l_ctx;
463 if (!is_sync_kiocb(iocb)) 477 if (!is_sync_kiocb(iocb))
464 dreq->iocb = iocb; 478 dreq->iocb = iocb;
465 479
480 NFS_I(inode)->read_io += iov_length(iov, nr_segs);
466 result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio); 481 result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio);
467 if (!result) 482 if (!result)
468 result = nfs_direct_wait(dreq); 483 result = nfs_direct_wait(dreq);
469 NFS_I(inode)->read_io += result;
470out_release: 484out_release:
471 nfs_direct_req_release(dreq); 485 nfs_direct_req_release(dreq);
472out: 486out:
@@ -706,6 +720,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
706 user_addr += req_len; 720 user_addr += req_len;
707 pos += req_len; 721 pos += req_len;
708 count -= req_len; 722 count -= req_len;
723 dreq->bytes_left -= req_len;
709 } 724 }
710 /* The nfs_page now hold references to these pages */ 725 /* The nfs_page now hold references to these pages */
711 nfs_direct_release_pages(pagevec, npages); 726 nfs_direct_release_pages(pagevec, npages);
@@ -814,6 +829,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
814 get_dreq(dreq); 829 get_dreq(dreq);
815 atomic_inc(&inode->i_dio_count); 830 atomic_inc(&inode->i_dio_count);
816 831
832 NFS_I(dreq->inode)->write_io += iov_length(iov, nr_segs);
817 for (seg = 0; seg < nr_segs; seg++) { 833 for (seg = 0; seg < nr_segs; seg++) {
818 const struct iovec *vec = &iov[seg]; 834 const struct iovec *vec = &iov[seg];
819 result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio); 835 result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio);
@@ -825,7 +841,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
825 pos += vec->iov_len; 841 pos += vec->iov_len;
826 } 842 }
827 nfs_pageio_complete(&desc); 843 nfs_pageio_complete(&desc);
828 NFS_I(dreq->inode)->write_io += desc.pg_bytes_written;
829 844
830 /* 845 /*
831 * If no bytes were started, return the error, and let the 846 * If no bytes were started, return the error, and let the
@@ -849,16 +864,21 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
849 ssize_t result = -ENOMEM; 864 ssize_t result = -ENOMEM;
850 struct inode *inode = iocb->ki_filp->f_mapping->host; 865 struct inode *inode = iocb->ki_filp->f_mapping->host;
851 struct nfs_direct_req *dreq; 866 struct nfs_direct_req *dreq;
867 struct nfs_lock_context *l_ctx;
852 868
853 dreq = nfs_direct_req_alloc(); 869 dreq = nfs_direct_req_alloc();
854 if (!dreq) 870 if (!dreq)
855 goto out; 871 goto out;
856 872
857 dreq->inode = inode; 873 dreq->inode = inode;
874 dreq->bytes_left = count;
858 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 875 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
859 dreq->l_ctx = nfs_get_lock_context(dreq->ctx); 876 l_ctx = nfs_get_lock_context(dreq->ctx);
860 if (dreq->l_ctx == NULL) 877 if (IS_ERR(l_ctx)) {
878 result = PTR_ERR(l_ctx);
861 goto out_release; 879 goto out_release;
880 }
881 dreq->l_ctx = l_ctx;
862 if (!is_sync_kiocb(iocb)) 882 if (!is_sync_kiocb(iocb))
863 dreq->iocb = iocb; 883 dreq->iocb = iocb;
864 884
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 75d6d0a3d32e..582bb8866131 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -259,7 +259,7 @@ nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
259 struct dentry *dentry = file->f_path.dentry; 259 struct dentry *dentry = file->f_path.dentry;
260 struct nfs_open_context *ctx = nfs_file_open_context(file); 260 struct nfs_open_context *ctx = nfs_file_open_context(file);
261 struct inode *inode = dentry->d_inode; 261 struct inode *inode = dentry->d_inode;
262 int have_error, status; 262 int have_error, do_resend, status;
263 int ret = 0; 263 int ret = 0;
264 264
265 dprintk("NFS: fsync file(%s/%s) datasync %d\n", 265 dprintk("NFS: fsync file(%s/%s) datasync %d\n",
@@ -267,15 +267,23 @@ nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
267 datasync); 267 datasync);
268 268
269 nfs_inc_stats(inode, NFSIOS_VFSFSYNC); 269 nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
270 do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
270 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 271 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
271 status = nfs_commit_inode(inode, FLUSH_SYNC); 272 status = nfs_commit_inode(inode, FLUSH_SYNC);
272 if (status >= 0 && ret < 0)
273 status = ret;
274 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 273 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
275 if (have_error) 274 if (have_error) {
276 ret = xchg(&ctx->error, 0); 275 ret = xchg(&ctx->error, 0);
277 if (!ret && status < 0) 276 if (ret)
277 goto out;
278 }
279 if (status < 0) {
278 ret = status; 280 ret = status;
281 goto out;
282 }
283 do_resend |= test_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
284 if (do_resend)
285 ret = -EAGAIN;
286out:
279 return ret; 287 return ret;
280} 288}
281EXPORT_SYMBOL_GPL(nfs_file_fsync_commit); 289EXPORT_SYMBOL_GPL(nfs_file_fsync_commit);
@@ -286,10 +294,21 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
286 int ret; 294 int ret;
287 struct inode *inode = file->f_path.dentry->d_inode; 295 struct inode *inode = file->f_path.dentry->d_inode;
288 296
289 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 297 do {
290 mutex_lock(&inode->i_mutex); 298 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
291 ret = nfs_file_fsync_commit(file, start, end, datasync); 299 if (ret != 0)
292 mutex_unlock(&inode->i_mutex); 300 break;
301 mutex_lock(&inode->i_mutex);
302 ret = nfs_file_fsync_commit(file, start, end, datasync);
303 mutex_unlock(&inode->i_mutex);
304 /*
305 * If nfs_file_fsync_commit detected a server reboot, then
306 * resend all dirty pages that might have been covered by
307 * the NFS_CONTEXT_RESEND_WRITES flag
308 */
309 start = 0;
310 end = LLONG_MAX;
311 } while (ret == -EAGAIN);
293 312
294 return ret; 313 return ret;
295} 314}
@@ -576,6 +595,7 @@ out:
576static const struct vm_operations_struct nfs_file_vm_ops = { 595static const struct vm_operations_struct nfs_file_vm_ops = {
577 .fault = filemap_fault, 596 .fault = filemap_fault,
578 .page_mkwrite = nfs_vm_page_mkwrite, 597 .page_mkwrite = nfs_vm_page_mkwrite,
598 .remap_pages = generic_file_remap_pages,
579}; 599};
580 600
581static int nfs_need_sync_write(struct file *filp, struct inode *inode) 601static int nfs_need_sync_write(struct file *filp, struct inode *inode)
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 4654ced096a6..033803c36644 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -32,6 +32,8 @@
32 32
33#include <asm/uaccess.h> 33#include <asm/uaccess.h>
34 34
35#include "internal.h"
36
35#define NFSDBG_FACILITY NFSDBG_CLIENT 37#define NFSDBG_FACILITY NFSDBG_CLIENT
36 38
37/* 39/*
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index a850079467d8..9cc4a3fbf4b0 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -55,18 +55,19 @@
55static const struct cred *id_resolver_cache; 55static const struct cred *id_resolver_cache;
56static struct key_type key_type_id_resolver_legacy; 56static struct key_type key_type_id_resolver_legacy;
57 57
58struct idmap {
59 struct rpc_pipe *idmap_pipe;
60 struct key_construction *idmap_key_cons;
61 struct mutex idmap_mutex;
62};
63
64struct idmap_legacy_upcalldata { 58struct idmap_legacy_upcalldata {
65 struct rpc_pipe_msg pipe_msg; 59 struct rpc_pipe_msg pipe_msg;
66 struct idmap_msg idmap_msg; 60 struct idmap_msg idmap_msg;
61 struct key_construction *key_cons;
67 struct idmap *idmap; 62 struct idmap *idmap;
68}; 63};
69 64
65struct idmap {
66 struct rpc_pipe *idmap_pipe;
67 struct idmap_legacy_upcalldata *idmap_upcall_data;
68 struct mutex idmap_mutex;
69};
70
70/** 71/**
71 * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields 72 * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields
72 * @fattr: fully initialised struct nfs_fattr 73 * @fattr: fully initialised struct nfs_fattr
@@ -158,7 +159,7 @@ static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *re
158 return 0; 159 return 0;
159 memcpy(buf, name, namelen); 160 memcpy(buf, name, namelen);
160 buf[namelen] = '\0'; 161 buf[namelen] = '\0';
161 if (strict_strtoul(buf, 0, &val) != 0) 162 if (kstrtoul(buf, 0, &val) != 0)
162 return 0; 163 return 0;
163 *res = val; 164 *res = val;
164 return 1; 165 return 1;
@@ -330,7 +331,6 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
330 ret = nfs_idmap_request_key(&key_type_id_resolver_legacy, 331 ret = nfs_idmap_request_key(&key_type_id_resolver_legacy,
331 name, namelen, type, data, 332 name, namelen, type, data,
332 data_size, idmap); 333 data_size, idmap);
333 idmap->idmap_key_cons = NULL;
334 mutex_unlock(&idmap->idmap_mutex); 334 mutex_unlock(&idmap->idmap_mutex);
335 } 335 }
336 return ret; 336 return ret;
@@ -364,7 +364,7 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *typ
364 if (data_size <= 0) { 364 if (data_size <= 0) {
365 ret = -EINVAL; 365 ret = -EINVAL;
366 } else { 366 } else {
367 ret = strict_strtol(id_str, 10, &id_long); 367 ret = kstrtol(id_str, 10, &id_long);
368 *id = (__u32)id_long; 368 *id = (__u32)id_long;
369 } 369 }
370 return ret; 370 return ret;
@@ -465,8 +465,6 @@ nfs_idmap_new(struct nfs_client *clp)
465 struct rpc_pipe *pipe; 465 struct rpc_pipe *pipe;
466 int error; 466 int error;
467 467
468 BUG_ON(clp->cl_idmap != NULL);
469
470 idmap = kzalloc(sizeof(*idmap), GFP_KERNEL); 468 idmap = kzalloc(sizeof(*idmap), GFP_KERNEL);
471 if (idmap == NULL) 469 if (idmap == NULL)
472 return -ENOMEM; 470 return -ENOMEM;
@@ -510,7 +508,6 @@ static int __rpc_pipefs_event(struct nfs_client *clp, unsigned long event,
510 508
511 switch (event) { 509 switch (event) {
512 case RPC_PIPEFS_MOUNT: 510 case RPC_PIPEFS_MOUNT:
513 BUG_ON(clp->cl_rpcclient->cl_dentry == NULL);
514 err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry, 511 err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry,
515 clp->cl_idmap, 512 clp->cl_idmap,
516 clp->cl_idmap->idmap_pipe); 513 clp->cl_idmap->idmap_pipe);
@@ -632,9 +629,6 @@ static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap,
632 substring_t substr; 629 substring_t substr;
633 int token, ret; 630 int token, ret;
634 631
635 memset(im, 0, sizeof(*im));
636 memset(msg, 0, sizeof(*msg));
637
638 im->im_type = IDMAP_TYPE_GROUP; 632 im->im_type = IDMAP_TYPE_GROUP;
639 token = match_token(desc, nfs_idmap_tokens, &substr); 633 token = match_token(desc, nfs_idmap_tokens, &substr);
640 634
@@ -665,6 +659,35 @@ out:
665 return ret; 659 return ret;
666} 660}
667 661
662static bool
663nfs_idmap_prepare_pipe_upcall(struct idmap *idmap,
664 struct idmap_legacy_upcalldata *data)
665{
666 if (idmap->idmap_upcall_data != NULL) {
667 WARN_ON_ONCE(1);
668 return false;
669 }
670 idmap->idmap_upcall_data = data;
671 return true;
672}
673
674static void
675nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret)
676{
677 struct key_construction *cons = idmap->idmap_upcall_data->key_cons;
678
679 kfree(idmap->idmap_upcall_data);
680 idmap->idmap_upcall_data = NULL;
681 complete_request_key(cons, ret);
682}
683
684static void
685nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret)
686{
687 if (idmap->idmap_upcall_data != NULL)
688 nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
689}
690
668static int nfs_idmap_legacy_upcall(struct key_construction *cons, 691static int nfs_idmap_legacy_upcall(struct key_construction *cons,
669 const char *op, 692 const char *op,
670 void *aux) 693 void *aux)
@@ -677,29 +700,28 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
677 int ret = -ENOMEM; 700 int ret = -ENOMEM;
678 701
679 /* msg and im are freed in idmap_pipe_destroy_msg */ 702 /* msg and im are freed in idmap_pipe_destroy_msg */
680 data = kmalloc(sizeof(*data), GFP_KERNEL); 703 data = kzalloc(sizeof(*data), GFP_KERNEL);
681 if (!data) 704 if (!data)
682 goto out1; 705 goto out1;
683 706
684 msg = &data->pipe_msg; 707 msg = &data->pipe_msg;
685 im = &data->idmap_msg; 708 im = &data->idmap_msg;
686 data->idmap = idmap; 709 data->idmap = idmap;
710 data->key_cons = cons;
687 711
688 ret = nfs_idmap_prepare_message(key->description, idmap, im, msg); 712 ret = nfs_idmap_prepare_message(key->description, idmap, im, msg);
689 if (ret < 0) 713 if (ret < 0)
690 goto out2; 714 goto out2;
691 715
692 BUG_ON(idmap->idmap_key_cons != NULL); 716 ret = -EAGAIN;
693 idmap->idmap_key_cons = cons; 717 if (!nfs_idmap_prepare_pipe_upcall(idmap, data))
718 goto out2;
694 719
695 ret = rpc_queue_upcall(idmap->idmap_pipe, msg); 720 ret = rpc_queue_upcall(idmap->idmap_pipe, msg);
696 if (ret < 0) 721 if (ret < 0)
697 goto out3; 722 nfs_idmap_abort_pipe_upcall(idmap, ret);
698 723
699 return ret; 724 return ret;
700
701out3:
702 idmap->idmap_key_cons = NULL;
703out2: 725out2:
704 kfree(data); 726 kfree(data);
705out1: 727out1:
@@ -714,21 +736,32 @@ static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *dat
714 authkey); 736 authkey);
715} 737}
716 738
717static int nfs_idmap_read_message(struct idmap_msg *im, struct key *key, struct key *authkey) 739static int nfs_idmap_read_and_verify_message(struct idmap_msg *im,
740 struct idmap_msg *upcall,
741 struct key *key, struct key *authkey)
718{ 742{
719 char id_str[NFS_UINT_MAXLEN]; 743 char id_str[NFS_UINT_MAXLEN];
720 int ret = -EINVAL; 744 int ret = -ENOKEY;
721 745
746 /* ret = -ENOKEY */
747 if (upcall->im_type != im->im_type || upcall->im_conv != im->im_conv)
748 goto out;
722 switch (im->im_conv) { 749 switch (im->im_conv) {
723 case IDMAP_CONV_NAMETOID: 750 case IDMAP_CONV_NAMETOID:
751 if (strcmp(upcall->im_name, im->im_name) != 0)
752 break;
724 sprintf(id_str, "%d", im->im_id); 753 sprintf(id_str, "%d", im->im_id);
725 ret = nfs_idmap_instantiate(key, authkey, id_str); 754 ret = nfs_idmap_instantiate(key, authkey, id_str);
726 break; 755 break;
727 case IDMAP_CONV_IDTONAME: 756 case IDMAP_CONV_IDTONAME:
757 if (upcall->im_id != im->im_id)
758 break;
728 ret = nfs_idmap_instantiate(key, authkey, im->im_name); 759 ret = nfs_idmap_instantiate(key, authkey, im->im_name);
729 break; 760 break;
761 default:
762 ret = -EINVAL;
730 } 763 }
731 764out:
732 return ret; 765 return ret;
733} 766}
734 767
@@ -740,14 +773,16 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
740 struct key_construction *cons; 773 struct key_construction *cons;
741 struct idmap_msg im; 774 struct idmap_msg im;
742 size_t namelen_in; 775 size_t namelen_in;
743 int ret; 776 int ret = -ENOKEY;
744 777
745 /* If instantiation is successful, anyone waiting for key construction 778 /* If instantiation is successful, anyone waiting for key construction
746 * will have been woken up and someone else may now have used 779 * will have been woken up and someone else may now have used
747 * idmap_key_cons - so after this point we may no longer touch it. 780 * idmap_key_cons - so after this point we may no longer touch it.
748 */ 781 */
749 cons = ACCESS_ONCE(idmap->idmap_key_cons); 782 if (idmap->idmap_upcall_data == NULL)
750 idmap->idmap_key_cons = NULL; 783 goto out_noupcall;
784
785 cons = idmap->idmap_upcall_data->key_cons;
751 786
752 if (mlen != sizeof(im)) { 787 if (mlen != sizeof(im)) {
753 ret = -ENOSPC; 788 ret = -ENOSPC;
@@ -768,16 +803,19 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
768 if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) { 803 if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) {
769 ret = -EINVAL; 804 ret = -EINVAL;
770 goto out; 805 goto out;
771 } 806}
772 807
773 ret = nfs_idmap_read_message(&im, cons->key, cons->authkey); 808 ret = nfs_idmap_read_and_verify_message(&im,
809 &idmap->idmap_upcall_data->idmap_msg,
810 cons->key, cons->authkey);
774 if (ret >= 0) { 811 if (ret >= 0) {
775 key_set_timeout(cons->key, nfs_idmap_cache_timeout); 812 key_set_timeout(cons->key, nfs_idmap_cache_timeout);
776 ret = mlen; 813 ret = mlen;
777 } 814 }
778 815
779out: 816out:
780 complete_request_key(cons, ret); 817 nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
818out_noupcall:
781 return ret; 819 return ret;
782} 820}
783 821
@@ -788,14 +826,9 @@ idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)
788 struct idmap_legacy_upcalldata, 826 struct idmap_legacy_upcalldata,
789 pipe_msg); 827 pipe_msg);
790 struct idmap *idmap = data->idmap; 828 struct idmap *idmap = data->idmap;
791 struct key_construction *cons; 829
792 if (msg->errno) { 830 if (msg->errno)
793 cons = ACCESS_ONCE(idmap->idmap_key_cons); 831 nfs_idmap_abort_pipe_upcall(idmap, msg->errno);
794 idmap->idmap_key_cons = NULL;
795 complete_request_key(cons, msg->errno);
796 }
797 /* Free memory allocated in nfs_idmap_legacy_upcall() */
798 kfree(data);
799} 832}
800 833
801static void 834static void
@@ -803,7 +836,8 @@ idmap_release_pipe(struct inode *inode)
803{ 836{
804 struct rpc_inode *rpci = RPC_I(inode); 837 struct rpc_inode *rpci = RPC_I(inode);
805 struct idmap *idmap = (struct idmap *)rpci->private; 838 struct idmap *idmap = (struct idmap *)rpci->private;
806 idmap->idmap_key_cons = NULL; 839
840 nfs_idmap_abort_pipe_upcall(idmap, -EPIPE);
807} 841}
808 842
809int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) 843int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c6e895f0fbf3..5c7325c5c5e6 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -154,7 +154,7 @@ static void nfs_zap_caches_locked(struct inode *inode)
154 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); 154 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
155 nfsi->attrtimeo_timestamp = jiffies; 155 nfsi->attrtimeo_timestamp = jiffies;
156 156
157 memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); 157 memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
158 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) 158 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
159 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; 159 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
160 else 160 else
@@ -547,8 +547,8 @@ EXPORT_SYMBOL_GPL(nfs_getattr);
547static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) 547static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
548{ 548{
549 atomic_set(&l_ctx->count, 1); 549 atomic_set(&l_ctx->count, 1);
550 l_ctx->lockowner = current->files; 550 l_ctx->lockowner.l_owner = current->files;
551 l_ctx->pid = current->tgid; 551 l_ctx->lockowner.l_pid = current->tgid;
552 INIT_LIST_HEAD(&l_ctx->list); 552 INIT_LIST_HEAD(&l_ctx->list);
553} 553}
554 554
@@ -557,9 +557,9 @@ static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context
557 struct nfs_lock_context *pos; 557 struct nfs_lock_context *pos;
558 558
559 list_for_each_entry(pos, &ctx->lock_context.list, list) { 559 list_for_each_entry(pos, &ctx->lock_context.list, list) {
560 if (pos->lockowner != current->files) 560 if (pos->lockowner.l_owner != current->files)
561 continue; 561 continue;
562 if (pos->pid != current->tgid) 562 if (pos->lockowner.l_pid != current->tgid)
563 continue; 563 continue;
564 atomic_inc(&pos->count); 564 atomic_inc(&pos->count);
565 return pos; 565 return pos;
@@ -578,7 +578,7 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
578 spin_unlock(&inode->i_lock); 578 spin_unlock(&inode->i_lock);
579 new = kmalloc(sizeof(*new), GFP_KERNEL); 579 new = kmalloc(sizeof(*new), GFP_KERNEL);
580 if (new == NULL) 580 if (new == NULL)
581 return NULL; 581 return ERR_PTR(-ENOMEM);
582 nfs_init_lock_context(new); 582 nfs_init_lock_context(new);
583 spin_lock(&inode->i_lock); 583 spin_lock(&inode->i_lock);
584 res = __nfs_find_lock_context(ctx); 584 res = __nfs_find_lock_context(ctx);
@@ -1571,6 +1571,11 @@ static int __init nfs_init_inodecache(void)
1571 1571
1572static void nfs_destroy_inodecache(void) 1572static void nfs_destroy_inodecache(void)
1573{ 1573{
1574 /*
1575 * Make sure all delayed rcu free inodes are flushed before we
1576 * destroy cache.
1577 */
1578 rcu_barrier();
1574 kmem_cache_destroy(nfs_inode_cachep); 1579 kmem_cache_destroy(nfs_inode_cachep);
1575} 1580}
1576 1581
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 31fdb03225cd..59b133c5d652 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -101,11 +101,11 @@ struct nfs_client_initdata {
101 */ 101 */
102struct nfs_parsed_mount_data { 102struct nfs_parsed_mount_data {
103 int flags; 103 int flags;
104 int rsize, wsize; 104 unsigned int rsize, wsize;
105 int timeo, retrans; 105 unsigned int timeo, retrans;
106 int acregmin, acregmax, 106 unsigned int acregmin, acregmax,
107 acdirmin, acdirmax; 107 acdirmin, acdirmax;
108 int namlen; 108 unsigned int namlen;
109 unsigned int options; 109 unsigned int options;
110 unsigned int bsize; 110 unsigned int bsize;
111 unsigned int auth_flavor_len; 111 unsigned int auth_flavor_len;
@@ -464,6 +464,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
464{ 464{
465 inode_dio_wait(inode); 465 inode_dio_wait(inode);
466} 466}
467extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
467 468
468/* nfs4proc.c */ 469/* nfs4proc.c */
469extern void __nfs4_read_done_cb(struct nfs_read_data *); 470extern void __nfs4_read_done_cb(struct nfs_read_data *);
@@ -483,6 +484,12 @@ extern int _nfs4_call_sync_session(struct rpc_clnt *clnt,
483 struct nfs4_sequence_args *args, 484 struct nfs4_sequence_args *args,
484 struct nfs4_sequence_res *res, 485 struct nfs4_sequence_res *res,
485 int cache_reply); 486 int cache_reply);
487extern int nfs40_walk_client_list(struct nfs_client *clp,
488 struct nfs_client **result,
489 struct rpc_cred *cred);
490extern int nfs41_walk_client_list(struct nfs_client *clp,
491 struct nfs_client **result,
492 struct rpc_cred *cred);
486 493
487/* 494/*
488 * Determine the device name as a string 495 * Determine the device name as a string
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index 0539de1b8d1f..8ee1fab83268 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -5,6 +5,7 @@
5#ifndef __NFS_NETNS_H__ 5#ifndef __NFS_NETNS_H__
6#define __NFS_NETNS_H__ 6#define __NFS_NETNS_H__
7 7
8#include <linux/nfs4.h>
8#include <net/net_namespace.h> 9#include <net/net_namespace.h>
9#include <net/netns/generic.h> 10#include <net/netns/generic.h>
10 11
@@ -22,6 +23,9 @@ struct nfs_net {
22 struct list_head nfs_volume_list; 23 struct list_head nfs_volume_list;
23#if IS_ENABLED(CONFIG_NFS_V4) 24#if IS_ENABLED(CONFIG_NFS_V4)
24 struct idr cb_ident_idr; /* Protected by nfs_client_lock */ 25 struct idr cb_ident_idr; /* Protected by nfs_client_lock */
26 unsigned short nfs_callback_tcpport;
27 unsigned short nfs_callback_tcpport6;
28 int cb_users[NFS4_MAX_MINOR_VERSION + 1];
25#endif 29#endif
26 spinlock_t nfs_client_lock; 30 spinlock_t nfs_client_lock;
27 struct timespec boot_time; 31 struct timespec boot_time;
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index e4498dc351a8..4a1aafba6a20 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -70,7 +70,7 @@ ssize_t nfs3_getxattr(struct dentry *dentry, const char *name,
70 if (type == ACL_TYPE_ACCESS && acl->a_count == 0) 70 if (type == ACL_TYPE_ACCESS && acl->a_count == 0)
71 error = -ENODATA; 71 error = -ENODATA;
72 else 72 else
73 error = posix_acl_to_xattr(acl, buffer, size); 73 error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
74 posix_acl_release(acl); 74 posix_acl_release(acl);
75 } else 75 } else
76 error = -ENODATA; 76 error = -ENODATA;
@@ -92,7 +92,7 @@ int nfs3_setxattr(struct dentry *dentry, const char *name,
92 else 92 else
93 return -EOPNOTSUPP; 93 return -EOPNOTSUPP;
94 94
95 acl = posix_acl_from_xattr(value, size); 95 acl = posix_acl_from_xattr(&init_user_ns, value, size);
96 if (IS_ERR(acl)) 96 if (IS_ERR(acl))
97 return PTR_ERR(acl); 97 return PTR_ERR(acl);
98 error = nfs3_proc_setacl(inode, type, acl); 98 error = nfs3_proc_setacl(inode, type, acl);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index d6b3b5f2d779..69322096c325 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -643,7 +643,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
643 u64 cookie, struct page **pages, unsigned int count, int plus) 643 u64 cookie, struct page **pages, unsigned int count, int plus)
644{ 644{
645 struct inode *dir = dentry->d_inode; 645 struct inode *dir = dentry->d_inode;
646 __be32 *verf = NFS_COOKIEVERF(dir); 646 __be32 *verf = NFS_I(dir)->cookieverf;
647 struct nfs3_readdirargs arg = { 647 struct nfs3_readdirargs arg = {
648 .fh = NFS_FH(dir), 648 .fh = NFS_FH(dir),
649 .cookie = cookie, 649 .cookie = cookie,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index da0618aeeadb..a525fdefccde 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -132,8 +132,8 @@ struct nfs4_lock_owner {
132struct nfs4_lock_state { 132struct nfs4_lock_state {
133 struct list_head ls_locks; /* Other lock stateids */ 133 struct list_head ls_locks; /* Other lock stateids */
134 struct nfs4_state * ls_state; /* Pointer to open state */ 134 struct nfs4_state * ls_state; /* Pointer to open state */
135#define NFS_LOCK_INITIALIZED 1 135#define NFS_LOCK_INITIALIZED 0
136 int ls_flags; 136 unsigned long ls_flags;
137 struct nfs_seqid_counter ls_seqid; 137 struct nfs_seqid_counter ls_seqid;
138 nfs4_stateid ls_stateid; 138 nfs4_stateid ls_stateid;
139 atomic_t ls_count; 139 atomic_t ls_count;
@@ -191,6 +191,8 @@ struct nfs4_state_recovery_ops {
191 int (*establish_clid)(struct nfs_client *, struct rpc_cred *); 191 int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
192 struct rpc_cred * (*get_clid_cred)(struct nfs_client *); 192 struct rpc_cred * (*get_clid_cred)(struct nfs_client *);
193 int (*reclaim_complete)(struct nfs_client *); 193 int (*reclaim_complete)(struct nfs_client *);
194 int (*detect_trunking)(struct nfs_client *, struct nfs_client **,
195 struct rpc_cred *);
194}; 196};
195 197
196struct nfs4_state_maintenance_ops { 198struct nfs4_state_maintenance_ops {
@@ -223,7 +225,7 @@ extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
223extern int nfs4_destroy_clientid(struct nfs_client *clp); 225extern int nfs4_destroy_clientid(struct nfs_client *clp);
224extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); 226extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
225extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); 227extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
226extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc); 228extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait);
227extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 229extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
228extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *, 230extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *,
229 struct nfs4_fs_locations *, struct page *); 231 struct nfs4_fs_locations *, struct page *);
@@ -320,9 +322,15 @@ extern void nfs4_renew_state(struct work_struct *);
320/* nfs4state.c */ 322/* nfs4state.c */
321struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp); 323struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp);
322struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); 324struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
325int nfs4_discover_server_trunking(struct nfs_client *clp,
326 struct nfs_client **);
327int nfs40_discover_server_trunking(struct nfs_client *clp,
328 struct nfs_client **, struct rpc_cred *);
323#if defined(CONFIG_NFS_V4_1) 329#if defined(CONFIG_NFS_V4_1)
324struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); 330struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
325struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); 331struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
332int nfs41_discover_server_trunking(struct nfs_client *clp,
333 struct nfs_client **, struct rpc_cred *);
326extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); 334extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
327#else 335#else
328static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) 336static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
@@ -351,7 +359,7 @@ extern void nfs41_handle_server_scope(struct nfs_client *,
351extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 359extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
352extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 360extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
353extern void nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, 361extern void nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *,
354 fmode_t, fl_owner_t, pid_t); 362 fmode_t, const struct nfs_lockowner *);
355 363
356extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); 364extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
357extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); 365extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
@@ -372,6 +380,9 @@ extern bool nfs4_disable_idmapping;
372extern unsigned short max_session_slots; 380extern unsigned short max_session_slots;
373extern unsigned short send_implementation_id; 381extern unsigned short send_implementation_id;
374 382
383#define NFS4_CLIENT_ID_UNIQ_LEN (64)
384extern char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN];
385
375/* nfs4sysctl.c */ 386/* nfs4sysctl.c */
376#ifdef CONFIG_SYSCTL 387#ifdef CONFIG_SYSCTL
377int nfs4_register_sysctl(void); 388int nfs4_register_sysctl(void);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 24eb663f8ed5..6bacfde1319a 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -84,7 +84,7 @@ error:
84static void nfs4_destroy_callback(struct nfs_client *clp) 84static void nfs4_destroy_callback(struct nfs_client *clp)
85{ 85{
86 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) 86 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
87 nfs_callback_down(clp->cl_mvops->minor_version); 87 nfs_callback_down(clp->cl_mvops->minor_version, clp->cl_net);
88} 88}
89 89
90static void nfs4_shutdown_client(struct nfs_client *clp) 90static void nfs4_shutdown_client(struct nfs_client *clp)
@@ -185,6 +185,7 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
185 rpc_authflavor_t authflavour) 185 rpc_authflavor_t authflavour)
186{ 186{
187 char buf[INET6_ADDRSTRLEN + 1]; 187 char buf[INET6_ADDRSTRLEN + 1];
188 struct nfs_client *old;
188 int error; 189 int error;
189 190
190 if (clp->cl_cons_state == NFS_CS_READY) { 191 if (clp->cl_cons_state == NFS_CS_READY) {
@@ -230,6 +231,17 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
230 231
231 if (!nfs4_has_session(clp)) 232 if (!nfs4_has_session(clp))
232 nfs_mark_client_ready(clp, NFS_CS_READY); 233 nfs_mark_client_ready(clp, NFS_CS_READY);
234
235 error = nfs4_discover_server_trunking(clp, &old);
236 if (error < 0)
237 goto error;
238 if (clp != old) {
239 clp->cl_preserve_clid = true;
240 nfs_put_client(clp);
241 clp = old;
242 atomic_inc(&clp->cl_count);
243 }
244
233 return clp; 245 return clp;
234 246
235error: 247error:
@@ -239,6 +251,248 @@ error:
239 return ERR_PTR(error); 251 return ERR_PTR(error);
240} 252}
241 253
254/*
255 * SETCLIENTID just did a callback update with the callback ident in
256 * "drop," but server trunking discovery claims "drop" and "keep" are
257 * actually the same server. Swap the callback IDs so that "keep"
258 * will continue to use the callback ident the server now knows about,
259 * and so that "keep"'s original callback ident is destroyed when
260 * "drop" is freed.
261 */
262static void nfs4_swap_callback_idents(struct nfs_client *keep,
263 struct nfs_client *drop)
264{
265 struct nfs_net *nn = net_generic(keep->cl_net, nfs_net_id);
266 unsigned int save = keep->cl_cb_ident;
267
268 if (keep->cl_cb_ident == drop->cl_cb_ident)
269 return;
270
271 dprintk("%s: keeping callback ident %u and dropping ident %u\n",
272 __func__, keep->cl_cb_ident, drop->cl_cb_ident);
273
274 spin_lock(&nn->nfs_client_lock);
275
276 idr_replace(&nn->cb_ident_idr, keep, drop->cl_cb_ident);
277 keep->cl_cb_ident = drop->cl_cb_ident;
278
279 idr_replace(&nn->cb_ident_idr, drop, save);
280 drop->cl_cb_ident = save;
281
282 spin_unlock(&nn->nfs_client_lock);
283}
284
285/**
286 * nfs40_walk_client_list - Find server that recognizes a client ID
287 *
288 * @new: nfs_client with client ID to test
289 * @result: OUT: found nfs_client, or new
290 * @cred: credential to use for trunking test
291 *
292 * Returns zero, a negative errno, or a negative NFS4ERR status.
293 * If zero is returned, an nfs_client pointer is planted in "result."
294 *
295 * NB: nfs40_walk_client_list() relies on the new nfs_client being
296 * the last nfs_client on the list.
297 */
298int nfs40_walk_client_list(struct nfs_client *new,
299 struct nfs_client **result,
300 struct rpc_cred *cred)
301{
302 struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
303 struct nfs_client *pos, *n, *prev = NULL;
304 struct nfs4_setclientid_res clid = {
305 .clientid = new->cl_clientid,
306 .confirm = new->cl_confirm,
307 };
308 int status;
309
310 spin_lock(&nn->nfs_client_lock);
311 list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) {
312 /* If "pos" isn't marked ready, we can't trust the
313 * remaining fields in "pos" */
314 if (pos->cl_cons_state < NFS_CS_READY)
315 continue;
316
317 if (pos->rpc_ops != new->rpc_ops)
318 continue;
319
320 if (pos->cl_proto != new->cl_proto)
321 continue;
322
323 if (pos->cl_minorversion != new->cl_minorversion)
324 continue;
325
326 if (pos->cl_clientid != new->cl_clientid)
327 continue;
328
329 atomic_inc(&pos->cl_count);
330 spin_unlock(&nn->nfs_client_lock);
331
332 if (prev)
333 nfs_put_client(prev);
334
335 status = nfs4_proc_setclientid_confirm(pos, &clid, cred);
336 if (status == 0) {
337 nfs4_swap_callback_idents(pos, new);
338
339 nfs_put_client(pos);
340 *result = pos;
341 dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
342 __func__, pos, atomic_read(&pos->cl_count));
343 return 0;
344 }
345 if (status != -NFS4ERR_STALE_CLIENTID) {
346 nfs_put_client(pos);
347 dprintk("NFS: <-- %s status = %d, no result\n",
348 __func__, status);
349 return status;
350 }
351
352 spin_lock(&nn->nfs_client_lock);
353 prev = pos;
354 }
355
356 /*
357 * No matching nfs_client found. This should be impossible,
358 * because the new nfs_client has already been added to
359 * nfs_client_list by nfs_get_client().
360 *
361 * Don't BUG(), since the caller is holding a mutex.
362 */
363 if (prev)
364 nfs_put_client(prev);
365 spin_unlock(&nn->nfs_client_lock);
366 pr_err("NFS: %s Error: no matching nfs_client found\n", __func__);
367 return -NFS4ERR_STALE_CLIENTID;
368}
369
370#ifdef CONFIG_NFS_V4_1
371/*
372 * Returns true if the client IDs match
373 */
374static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b)
375{
376 if (a->cl_clientid != b->cl_clientid) {
377 dprintk("NFS: --> %s client ID %llx does not match %llx\n",
378 __func__, a->cl_clientid, b->cl_clientid);
379 return false;
380 }
381 dprintk("NFS: --> %s client ID %llx matches %llx\n",
382 __func__, a->cl_clientid, b->cl_clientid);
383 return true;
384}
385
386/*
387 * Returns true if the server owners match
388 */
389static bool
390nfs4_match_serverowners(struct nfs_client *a, struct nfs_client *b)
391{
392 struct nfs41_server_owner *o1 = a->cl_serverowner;
393 struct nfs41_server_owner *o2 = b->cl_serverowner;
394
395 if (o1->minor_id != o2->minor_id) {
396 dprintk("NFS: --> %s server owner minor IDs do not match\n",
397 __func__);
398 return false;
399 }
400
401 if (o1->major_id_sz != o2->major_id_sz)
402 goto out_major_mismatch;
403 if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0)
404 goto out_major_mismatch;
405
406 dprintk("NFS: --> %s server owners match\n", __func__);
407 return true;
408
409out_major_mismatch:
410 dprintk("NFS: --> %s server owner major IDs do not match\n",
411 __func__);
412 return false;
413}
414
415/**
416 * nfs41_walk_client_list - Find nfs_client that matches a client/server owner
417 *
418 * @new: nfs_client with client ID to test
419 * @result: OUT: found nfs_client, or new
420 * @cred: credential to use for trunking test
421 *
422 * Returns zero, a negative errno, or a negative NFS4ERR status.
423 * If zero is returned, an nfs_client pointer is planted in "result."
424 *
425 * NB: nfs41_walk_client_list() relies on the new nfs_client being
426 * the last nfs_client on the list.
427 */
428int nfs41_walk_client_list(struct nfs_client *new,
429 struct nfs_client **result,
430 struct rpc_cred *cred)
431{
432 struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
433 struct nfs_client *pos, *n, *prev = NULL;
434 int error;
435
436 spin_lock(&nn->nfs_client_lock);
437 list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) {
438 /* If "pos" isn't marked ready, we can't trust the
439 * remaining fields in "pos", especially the client
440 * ID and serverowner fields. Wait for CREATE_SESSION
441 * to finish. */
442 if (pos->cl_cons_state < NFS_CS_READY) {
443 atomic_inc(&pos->cl_count);
444 spin_unlock(&nn->nfs_client_lock);
445
446 if (prev)
447 nfs_put_client(prev);
448 prev = pos;
449
450 error = nfs_wait_client_init_complete(pos);
451 if (error < 0) {
452 nfs_put_client(pos);
453 spin_lock(&nn->nfs_client_lock);
454 continue;
455 }
456
457 spin_lock(&nn->nfs_client_lock);
458 }
459
460 if (pos->rpc_ops != new->rpc_ops)
461 continue;
462
463 if (pos->cl_proto != new->cl_proto)
464 continue;
465
466 if (pos->cl_minorversion != new->cl_minorversion)
467 continue;
468
469 if (!nfs4_match_clientids(pos, new))
470 continue;
471
472 if (!nfs4_match_serverowners(pos, new))
473 continue;
474
475 spin_unlock(&nn->nfs_client_lock);
476 dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
477 __func__, pos, atomic_read(&pos->cl_count));
478
479 *result = pos;
480 return 0;
481 }
482
483 /*
484 * No matching nfs_client found. This should be impossible,
485 * because the new nfs_client has already been added to
486 * nfs_client_list by nfs_get_client().
487 *
488 * Don't BUG(), since the caller is holding a mutex.
489 */
490 spin_unlock(&nn->nfs_client_lock);
491 pr_err("NFS: %s Error: no matching nfs_client found\n", __func__);
492 return -NFS4ERR_STALE_CLIENTID;
493}
494#endif /* CONFIG_NFS_V4_1 */
495
242static void nfs4_destroy_server(struct nfs_server *server) 496static void nfs4_destroy_server(struct nfs_server *server)
243{ 497{
244 nfs_server_return_all_delegations(server); 498 nfs_server_return_all_delegations(server);
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index acb65e7887f8..afddd6639afb 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -95,13 +95,24 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
95 int ret; 95 int ret;
96 struct inode *inode = file->f_path.dentry->d_inode; 96 struct inode *inode = file->f_path.dentry->d_inode;
97 97
98 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 98 do {
99 mutex_lock(&inode->i_mutex); 99 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
100 ret = nfs_file_fsync_commit(file, start, end, datasync); 100 if (ret != 0)
101 if (!ret && !datasync) 101 break;
102 /* application has asked for meta-data sync */ 102 mutex_lock(&inode->i_mutex);
103 ret = pnfs_layoutcommit_inode(inode, true); 103 ret = nfs_file_fsync_commit(file, start, end, datasync);
104 mutex_unlock(&inode->i_mutex); 104 if (!ret && !datasync)
105 /* application has asked for meta-data sync */
106 ret = pnfs_layoutcommit_inode(inode, true);
107 mutex_unlock(&inode->i_mutex);
108 /*
109 * If nfs_file_fsync_commit detected a server reboot, then
110 * resend all dirty pages that might have been covered by
111 * the NFS_CONTEXT_RESEND_WRITES flag
112 */
113 start = 0;
114 end = LLONG_MAX;
115 } while (ret == -EAGAIN);
105 116
106 return ret; 117 return ret;
107} 118}
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 53f94d915bd1..52d847212066 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -190,8 +190,6 @@ static int filelayout_async_handle_error(struct rpc_task *task,
190 * i/o and all i/o waiting on the slot table to the MDS until 190 * i/o and all i/o waiting on the slot table to the MDS until
191 * layout is destroyed and a new valid layout is obtained. 191 * layout is destroyed and a new valid layout is obtained.
192 */ 192 */
193 set_bit(NFS_LAYOUT_INVALID,
194 &NFS_I(inode)->layout->plh_flags);
195 pnfs_destroy_layout(NFS_I(inode)); 193 pnfs_destroy_layout(NFS_I(inode));
196 rpc_wake_up(&tbl->slot_tbl_waitq); 194 rpc_wake_up(&tbl->slot_tbl_waitq);
197 goto reset; 195 goto reset;
@@ -205,7 +203,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
205 case -EPIPE: 203 case -EPIPE:
206 dprintk("%s DS connection error %d\n", __func__, 204 dprintk("%s DS connection error %d\n", __func__,
207 task->tk_status); 205 task->tk_status);
208 filelayout_mark_devid_invalid(devid); 206 nfs4_mark_deviceid_unavailable(devid);
209 clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags); 207 clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags);
210 _pnfs_return_layout(inode); 208 _pnfs_return_layout(inode);
211 rpc_wake_up(&tbl->slot_tbl_waitq); 209 rpc_wake_up(&tbl->slot_tbl_waitq);
@@ -269,6 +267,21 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata)
269 (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); 267 (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
270} 268}
271 269
270bool
271filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node)
272{
273 return filelayout_test_devid_invalid(node) ||
274 nfs4_test_deviceid_unavailable(node);
275}
276
277static bool
278filelayout_reset_to_mds(struct pnfs_layout_segment *lseg)
279{
280 struct nfs4_deviceid_node *node = FILELAYOUT_DEVID_NODE(lseg);
281
282 return filelayout_test_devid_unavailable(node);
283}
284
272/* 285/*
273 * Call ops for the async read/write cases 286 * Call ops for the async read/write cases
274 * In the case of dense layouts, the offset needs to be reset to its 287 * In the case of dense layouts, the offset needs to be reset to its
@@ -453,7 +466,7 @@ static void filelayout_commit_release(void *calldata)
453 struct nfs_commit_data *data = calldata; 466 struct nfs_commit_data *data = calldata;
454 467
455 data->completion_ops->completion(data); 468 data->completion_ops->completion(data);
456 put_lseg(data->lseg); 469 pnfs_put_lseg(data->lseg);
457 nfs_put_client(data->ds_clp); 470 nfs_put_client(data->ds_clp);
458 nfs_commitdata_release(data); 471 nfs_commitdata_release(data);
459} 472}
@@ -608,13 +621,13 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
608 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, 621 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
609 NFS_SERVER(lo->plh_inode)->nfs_client, id); 622 NFS_SERVER(lo->plh_inode)->nfs_client, id);
610 if (d == NULL) { 623 if (d == NULL) {
611 dsaddr = get_device_info(lo->plh_inode, id, gfp_flags); 624 dsaddr = filelayout_get_device_info(lo->plh_inode, id, gfp_flags);
612 if (dsaddr == NULL) 625 if (dsaddr == NULL)
613 goto out; 626 goto out;
614 } else 627 } else
615 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); 628 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
616 /* Found deviceid is being reaped */ 629 /* Found deviceid is unavailable */
617 if (test_bit(NFS_DEVICEID_INVALID, &dsaddr->id_node.flags)) 630 if (filelayout_test_devid_unavailable(&dsaddr->id_node))
618 goto out_put; 631 goto out_put;
619 632
620 fl->dsaddr = dsaddr; 633 fl->dsaddr = dsaddr;
@@ -931,7 +944,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
931 nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); 944 nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
932 status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); 945 status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
933 if (status < 0) { 946 if (status < 0) {
934 put_lseg(pgio->pg_lseg); 947 pnfs_put_lseg(pgio->pg_lseg);
935 pgio->pg_lseg = NULL; 948 pgio->pg_lseg = NULL;
936 goto out_mds; 949 goto out_mds;
937 } 950 }
@@ -985,7 +998,7 @@ filelayout_clear_request_commit(struct nfs_page *req,
985out: 998out:
986 nfs_request_remove_commit_list(req, cinfo); 999 nfs_request_remove_commit_list(req, cinfo);
987 spin_unlock(cinfo->lock); 1000 spin_unlock(cinfo->lock);
988 put_lseg(freeme); 1001 pnfs_put_lseg(freeme);
989} 1002}
990 1003
991static struct list_head * 1004static struct list_head *
@@ -1018,7 +1031,7 @@ filelayout_choose_commit_list(struct nfs_page *req,
1018 * off due to a rewrite, in which case it will be done in 1031 * off due to a rewrite, in which case it will be done in
1019 * filelayout_clear_request_commit 1032 * filelayout_clear_request_commit
1020 */ 1033 */
1021 buckets[i].wlseg = get_lseg(lseg); 1034 buckets[i].wlseg = pnfs_get_lseg(lseg);
1022 } 1035 }
1023 set_bit(PG_COMMIT_TO_DS, &req->wb_flags); 1036 set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
1024 cinfo->ds->nwritten++; 1037 cinfo->ds->nwritten++;
@@ -1128,7 +1141,7 @@ filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
1128 if (list_empty(src)) 1141 if (list_empty(src))
1129 bucket->wlseg = NULL; 1142 bucket->wlseg = NULL;
1130 else 1143 else
1131 get_lseg(bucket->clseg); 1144 pnfs_get_lseg(bucket->clseg);
1132 } 1145 }
1133 return ret; 1146 return ret;
1134} 1147}
@@ -1159,12 +1172,12 @@ static void filelayout_recover_commit_reqs(struct list_head *dst,
1159 1172
1160 /* NOTE cinfo->lock is NOT held, relying on fact that this is 1173 /* NOTE cinfo->lock is NOT held, relying on fact that this is
1161 * only called on single thread per dreq. 1174 * only called on single thread per dreq.
1162 * Can't take the lock because need to do put_lseg 1175 * Can't take the lock because need to do pnfs_put_lseg
1163 */ 1176 */
1164 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { 1177 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
1165 if (transfer_commit_list(&b->written, dst, cinfo, 0)) { 1178 if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
1166 BUG_ON(!list_empty(&b->written)); 1179 BUG_ON(!list_empty(&b->written));
1167 put_lseg(b->wlseg); 1180 pnfs_put_lseg(b->wlseg);
1168 b->wlseg = NULL; 1181 b->wlseg = NULL;
1169 } 1182 }
1170 } 1183 }
@@ -1200,7 +1213,7 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
1200 if (list_empty(&bucket->committing)) 1213 if (list_empty(&bucket->committing))
1201 continue; 1214 continue;
1202 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); 1215 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
1203 put_lseg(bucket->clseg); 1216 pnfs_put_lseg(bucket->clseg);
1204 bucket->clseg = NULL; 1217 bucket->clseg = NULL;
1205 } 1218 }
1206 /* Caller will clean up entries put on list */ 1219 /* Caller will clean up entries put on list */
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 43fe802dd678..dca47d786710 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -129,23 +129,13 @@ filelayout_mark_devid_invalid(struct nfs4_deviceid_node *node)
129} 129}
130 130
131static inline bool 131static inline bool
132filelayout_test_layout_invalid(struct pnfs_layout_hdr *lo)
133{
134 return test_bit(NFS_LAYOUT_INVALID, &lo->plh_flags);
135}
136
137static inline bool
138filelayout_test_devid_invalid(struct nfs4_deviceid_node *node) 132filelayout_test_devid_invalid(struct nfs4_deviceid_node *node)
139{ 133{
140 return test_bit(NFS_DEVICEID_INVALID, &node->flags); 134 return test_bit(NFS_DEVICEID_INVALID, &node->flags);
141} 135}
142 136
143static inline bool 137extern bool
144filelayout_reset_to_mds(struct pnfs_layout_segment *lseg) 138filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node);
145{
146 return filelayout_test_devid_invalid(FILELAYOUT_DEVID_NODE(lseg)) ||
147 filelayout_test_layout_invalid(lseg->pls_layout);
148}
149 139
150extern struct nfs_fh * 140extern struct nfs_fh *
151nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); 141nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
@@ -158,7 +148,7 @@ struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
158extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 148extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
159extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 149extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
160struct nfs4_file_layout_dsaddr * 150struct nfs4_file_layout_dsaddr *
161get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); 151filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags);
162void nfs4_ds_disconnect(struct nfs_client *clp); 152void nfs4_ds_disconnect(struct nfs_client *clp);
163 153
164#endif /* FS_NFS_NFS4FILELAYOUT_H */ 154#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index f81231f30d94..3336d5eaf879 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -690,7 +690,7 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
690 * of available devices, and return it. 690 * of available devices, and return it.
691 */ 691 */
692struct nfs4_file_layout_dsaddr * 692struct nfs4_file_layout_dsaddr *
693get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags) 693filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags)
694{ 694{
695 struct pnfs_device *pdev = NULL; 695 struct pnfs_device *pdev = NULL;
696 u32 max_resp_sz; 696 u32 max_resp_sz;
@@ -804,13 +804,14 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
804 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx]; 804 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
805 struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); 805 struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
806 806
807 if (filelayout_test_devid_invalid(devid)) 807 if (filelayout_test_devid_unavailable(devid))
808 return NULL; 808 return NULL;
809 809
810 if (ds == NULL) { 810 if (ds == NULL) {
811 printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", 811 printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
812 __func__, ds_idx); 812 __func__, ds_idx);
813 goto mark_dev_invalid; 813 filelayout_mark_devid_invalid(devid);
814 return NULL;
814 } 815 }
815 816
816 if (!ds->ds_clp) { 817 if (!ds->ds_clp) {
@@ -818,14 +819,12 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
818 int err; 819 int err;
819 820
820 err = nfs4_ds_connect(s, ds); 821 err = nfs4_ds_connect(s, ds);
821 if (err) 822 if (err) {
822 goto mark_dev_invalid; 823 nfs4_mark_deviceid_unavailable(devid);
824 return NULL;
825 }
823 } 826 }
824 return ds; 827 return ds;
825
826mark_dev_invalid:
827 filelayout_mark_devid_invalid(devid);
828 return NULL;
829} 828}
830 829
831module_param(dataserver_retrans, uint, 0644); 830module_param(dataserver_retrans, uint, 0644);
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 017b4b01a69c..79fbb61ce202 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -192,25 +192,13 @@ out:
192struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *clnt, struct inode *inode, 192struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *clnt, struct inode *inode,
193 struct qstr *name) 193 struct qstr *name)
194{ 194{
195 struct rpc_clnt *clone;
196 struct rpc_auth *auth;
197 rpc_authflavor_t flavor; 195 rpc_authflavor_t flavor;
198 196
199 flavor = nfs4_negotiate_security(inode, name); 197 flavor = nfs4_negotiate_security(inode, name);
200 if ((int)flavor < 0) 198 if ((int)flavor < 0)
201 return ERR_PTR(flavor); 199 return ERR_PTR((int)flavor);
202 200
203 clone = rpc_clone_client(clnt); 201 return rpc_clone_client_set_auth(clnt, flavor);
204 if (IS_ERR(clone))
205 return clone;
206
207 auth = rpcauth_create(flavor, clone);
208 if (!auth) {
209 rpc_shutdown_client(clone);
210 clone = ERR_PTR(-EIO);
211 }
212
213 return clone;
214} 202}
215 203
216static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, 204static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 635274140b18..68b21d81b7ac 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -104,6 +104,8 @@ static int nfs4_map_errors(int err)
104 return -EACCES; 104 return -EACCES;
105 case -NFS4ERR_MINOR_VERS_MISMATCH: 105 case -NFS4ERR_MINOR_VERS_MISMATCH:
106 return -EPROTONOSUPPORT; 106 return -EPROTONOSUPPORT;
107 case -NFS4ERR_ACCESS:
108 return -EACCES;
107 default: 109 default:
108 dprintk("%s could not handle NFSv4 error %d\n", 110 dprintk("%s could not handle NFSv4 error %d\n",
109 __func__, -err); 111 __func__, -err);
@@ -150,6 +152,12 @@ static const u32 nfs4_pnfs_open_bitmap[3] = {
150 FATTR4_WORD2_MDSTHRESHOLD 152 FATTR4_WORD2_MDSTHRESHOLD
151}; 153};
152 154
155static const u32 nfs4_open_noattr_bitmap[3] = {
156 FATTR4_WORD0_TYPE
157 | FATTR4_WORD0_CHANGE
158 | FATTR4_WORD0_FILEID,
159};
160
153const u32 nfs4_statfs_bitmap[2] = { 161const u32 nfs4_statfs_bitmap[2] = {
154 FATTR4_WORD0_FILES_AVAIL 162 FATTR4_WORD0_FILES_AVAIL
155 | FATTR4_WORD0_FILES_FREE 163 | FATTR4_WORD0_FILES_FREE
@@ -832,6 +840,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
832 p->o_res.seqid = p->o_arg.seqid; 840 p->o_res.seqid = p->o_arg.seqid;
833 p->c_res.seqid = p->c_arg.seqid; 841 p->c_res.seqid = p->c_arg.seqid;
834 p->o_res.server = p->o_arg.server; 842 p->o_res.server = p->o_arg.server;
843 p->o_res.access_request = p->o_arg.access;
835 nfs_fattr_init(&p->f_attr); 844 nfs_fattr_init(&p->f_attr);
836 nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name); 845 nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name);
837} 846}
@@ -860,6 +869,14 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
860 p->o_arg.fh = NFS_FH(dir); 869 p->o_arg.fh = NFS_FH(dir);
861 p->o_arg.open_flags = flags; 870 p->o_arg.open_flags = flags;
862 p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); 871 p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
872 /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS
873 * will return permission denied for all bits until close */
874 if (!(flags & O_EXCL)) {
875 /* ask server to check for all possible rights as results
876 * are cached */
877 p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY |
878 NFS4_ACCESS_EXTEND | NFS4_ACCESS_EXECUTE;
879 }
863 p->o_arg.clientid = server->nfs_client->cl_clientid; 880 p->o_arg.clientid = server->nfs_client->cl_clientid;
864 p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time); 881 p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time);
865 p->o_arg.id.uniquifier = sp->so_seqid.owner_id; 882 p->o_arg.id.uniquifier = sp->so_seqid.owner_id;
@@ -1115,11 +1132,80 @@ out_return_state:
1115 return state; 1132 return state;
1116} 1133}
1117 1134
1118static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) 1135static void
1136nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state)
1137{
1138 struct nfs_client *clp = NFS_SERVER(state->inode)->nfs_client;
1139 struct nfs_delegation *delegation;
1140 int delegation_flags = 0;
1141
1142 rcu_read_lock();
1143 delegation = rcu_dereference(NFS_I(state->inode)->delegation);
1144 if (delegation)
1145 delegation_flags = delegation->flags;
1146 rcu_read_unlock();
1147 if (data->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) {
1148 pr_err_ratelimited("NFS: Broken NFSv4 server %s is "
1149 "returning a delegation for "
1150 "OPEN(CLAIM_DELEGATE_CUR)\n",
1151 clp->cl_hostname);
1152 } else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
1153 nfs_inode_set_delegation(state->inode,
1154 data->owner->so_cred,
1155 &data->o_res);
1156 else
1157 nfs_inode_reclaim_delegation(state->inode,
1158 data->owner->so_cred,
1159 &data->o_res);
1160}
1161
1162/*
1163 * Check the inode attributes against the CLAIM_PREVIOUS returned attributes
1164 * and update the nfs4_state.
1165 */
1166static struct nfs4_state *
1167_nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
1168{
1169 struct inode *inode = data->state->inode;
1170 struct nfs4_state *state = data->state;
1171 int ret;
1172
1173 if (!data->rpc_done) {
1174 ret = data->rpc_status;
1175 goto err;
1176 }
1177
1178 ret = -ESTALE;
1179 if (!(data->f_attr.valid & NFS_ATTR_FATTR_TYPE) ||
1180 !(data->f_attr.valid & NFS_ATTR_FATTR_FILEID) ||
1181 !(data->f_attr.valid & NFS_ATTR_FATTR_CHANGE))
1182 goto err;
1183
1184 ret = -ENOMEM;
1185 state = nfs4_get_open_state(inode, data->owner);
1186 if (state == NULL)
1187 goto err;
1188
1189 ret = nfs_refresh_inode(inode, &data->f_attr);
1190 if (ret)
1191 goto err;
1192
1193 if (data->o_res.delegation_type != 0)
1194 nfs4_opendata_check_deleg(data, state);
1195 update_open_stateid(state, &data->o_res.stateid, NULL,
1196 data->o_arg.fmode);
1197
1198 return state;
1199err:
1200 return ERR_PTR(ret);
1201
1202}
1203
1204static struct nfs4_state *
1205_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
1119{ 1206{
1120 struct inode *inode; 1207 struct inode *inode;
1121 struct nfs4_state *state = NULL; 1208 struct nfs4_state *state = NULL;
1122 struct nfs_delegation *delegation;
1123 int ret; 1209 int ret;
1124 1210
1125 if (!data->rpc_done) { 1211 if (!data->rpc_done) {
@@ -1138,30 +1224,8 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
1138 state = nfs4_get_open_state(inode, data->owner); 1224 state = nfs4_get_open_state(inode, data->owner);
1139 if (state == NULL) 1225 if (state == NULL)
1140 goto err_put_inode; 1226 goto err_put_inode;
1141 if (data->o_res.delegation_type != 0) { 1227 if (data->o_res.delegation_type != 0)
1142 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 1228 nfs4_opendata_check_deleg(data, state);
1143 int delegation_flags = 0;
1144
1145 rcu_read_lock();
1146 delegation = rcu_dereference(NFS_I(inode)->delegation);
1147 if (delegation)
1148 delegation_flags = delegation->flags;
1149 rcu_read_unlock();
1150 if (data->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) {
1151 pr_err_ratelimited("NFS: Broken NFSv4 server %s is "
1152 "returning a delegation for "
1153 "OPEN(CLAIM_DELEGATE_CUR)\n",
1154 clp->cl_hostname);
1155 } else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
1156 nfs_inode_set_delegation(state->inode,
1157 data->owner->so_cred,
1158 &data->o_res);
1159 else
1160 nfs_inode_reclaim_delegation(state->inode,
1161 data->owner->so_cred,
1162 &data->o_res);
1163 }
1164
1165 update_open_stateid(state, &data->o_res.stateid, NULL, 1229 update_open_stateid(state, &data->o_res.stateid, NULL,
1166 data->o_arg.fmode); 1230 data->o_arg.fmode);
1167 iput(inode); 1231 iput(inode);
@@ -1173,6 +1237,14 @@ err:
1173 return ERR_PTR(ret); 1237 return ERR_PTR(ret);
1174} 1238}
1175 1239
1240static struct nfs4_state *
1241nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
1242{
1243 if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS)
1244 return _nfs4_opendata_reclaim_to_nfs4_state(data);
1245 return _nfs4_opendata_to_nfs4_state(data);
1246}
1247
1176static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state) 1248static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state)
1177{ 1249{
1178 struct nfs_inode *nfsi = NFS_I(state->inode); 1250 struct nfs_inode *nfsi = NFS_I(state->inode);
@@ -1494,6 +1566,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
1494 data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid; 1566 data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid;
1495 if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { 1567 if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) {
1496 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; 1568 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
1569 data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0];
1497 nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); 1570 nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
1498 } 1571 }
1499 data->timestamp = jiffies; 1572 data->timestamp = jiffies;
@@ -1526,7 +1599,8 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
1526 return; 1599 return;
1527 1600
1528 if (task->tk_status == 0) { 1601 if (task->tk_status == 0) {
1529 switch (data->o_res.f_attr->mode & S_IFMT) { 1602 if (data->o_res.f_attr->valid & NFS_ATTR_FATTR_TYPE) {
1603 switch (data->o_res.f_attr->mode & S_IFMT) {
1530 case S_IFREG: 1604 case S_IFREG:
1531 break; 1605 break;
1532 case S_IFLNK: 1606 case S_IFLNK:
@@ -1537,6 +1611,7 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
1537 break; 1611 break;
1538 default: 1612 default:
1539 data->rpc_status = -ENOTDIR; 1613 data->rpc_status = -ENOTDIR;
1614 }
1540 } 1615 }
1541 renew_lease(data->o_res.server, data->timestamp); 1616 renew_lease(data->o_res.server, data->timestamp);
1542 if (!(data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM)) 1617 if (!(data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM))
@@ -1643,6 +1718,39 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
1643 return status; 1718 return status;
1644} 1719}
1645 1720
1721static int nfs4_opendata_access(struct rpc_cred *cred,
1722 struct nfs4_opendata *opendata,
1723 struct nfs4_state *state, fmode_t fmode)
1724{
1725 struct nfs_access_entry cache;
1726 u32 mask;
1727
1728 /* access call failed or for some reason the server doesn't
1729 * support any access modes -- defer access call until later */
1730 if (opendata->o_res.access_supported == 0)
1731 return 0;
1732
1733 mask = 0;
1734 /* don't check MAY_WRITE - a newly created file may not have
1735 * write mode bits, but POSIX allows the creating process to write */
1736 if (fmode & FMODE_READ)
1737 mask |= MAY_READ;
1738 if (fmode & FMODE_EXEC)
1739 mask |= MAY_EXEC;
1740
1741 cache.cred = cred;
1742 cache.jiffies = jiffies;
1743 nfs_access_set_mask(&cache, opendata->o_res.access_result);
1744 nfs_access_add_cache(state->inode, &cache);
1745
1746 if ((mask & ~cache.mask & (MAY_READ | MAY_EXEC)) == 0)
1747 return 0;
1748
1749 /* even though OPEN succeeded, access is denied. Close the file */
1750 nfs4_close_state(state, fmode);
1751 return -NFS4ERR_ACCESS;
1752}
1753
1646/* 1754/*
1647 * Note: On error, nfs4_proc_open will free the struct nfs4_opendata 1755 * Note: On error, nfs4_proc_open will free the struct nfs4_opendata
1648 */ 1756 */
@@ -1774,7 +1882,11 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
1774 * informs us the stateid is unrecognized. */ 1882 * informs us the stateid is unrecognized. */
1775 if (status != -NFS4ERR_BAD_STATEID) 1883 if (status != -NFS4ERR_BAD_STATEID)
1776 nfs41_free_stateid(server, stateid); 1884 nfs41_free_stateid(server, stateid);
1885 nfs_remove_bad_delegation(state->inode);
1777 1886
1887 write_seqlock(&state->seqlock);
1888 nfs4_stateid_copy(&state->stateid, &state->open_stateid);
1889 write_sequnlock(&state->seqlock);
1778 clear_bit(NFS_DELEGATED_STATE, &state->flags); 1890 clear_bit(NFS_DELEGATED_STATE, &state->flags);
1779 } 1891 }
1780} 1892}
@@ -1790,7 +1902,7 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
1790static int nfs41_check_open_stateid(struct nfs4_state *state) 1902static int nfs41_check_open_stateid(struct nfs4_state *state)
1791{ 1903{
1792 struct nfs_server *server = NFS_SERVER(state->inode); 1904 struct nfs_server *server = NFS_SERVER(state->inode);
1793 nfs4_stateid *stateid = &state->stateid; 1905 nfs4_stateid *stateid = &state->open_stateid;
1794 int status; 1906 int status;
1795 1907
1796 /* If a state reset has been done, test_stateid is unneeded */ 1908 /* If a state reset has been done, test_stateid is unneeded */
@@ -1896,6 +2008,10 @@ static int _nfs4_do_open(struct inode *dir,
1896 if (server->caps & NFS_CAP_POSIX_LOCK) 2008 if (server->caps & NFS_CAP_POSIX_LOCK)
1897 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); 2009 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
1898 2010
2011 status = nfs4_opendata_access(cred, opendata, state, fmode);
2012 if (status != 0)
2013 goto err_opendata_put;
2014
1899 if (opendata->o_arg.open_flags & O_EXCL) { 2015 if (opendata->o_arg.open_flags & O_EXCL) {
1900 nfs4_exclusive_attrset(opendata, sattr); 2016 nfs4_exclusive_attrset(opendata, sattr);
1901 2017
@@ -1941,7 +2057,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
1941 struct nfs4_state *res; 2057 struct nfs4_state *res;
1942 int status; 2058 int status;
1943 2059
1944 fmode &= FMODE_READ|FMODE_WRITE; 2060 fmode &= FMODE_READ|FMODE_WRITE|FMODE_EXEC;
1945 do { 2061 do {
1946 status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred, 2062 status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred,
1947 &res, ctx_th); 2063 &res, ctx_th);
@@ -2013,8 +2129,12 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2013 nfs_fattr_init(fattr); 2129 nfs_fattr_init(fattr);
2014 2130
2015 if (state != NULL) { 2131 if (state != NULL) {
2132 struct nfs_lockowner lockowner = {
2133 .l_owner = current->files,
2134 .l_pid = current->tgid,
2135 };
2016 nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, 2136 nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE,
2017 current->files, current->tgid); 2137 &lockowner);
2018 } else if (nfs4_copy_delegation_stateid(&arg.stateid, inode, 2138 } else if (nfs4_copy_delegation_stateid(&arg.stateid, inode,
2019 FMODE_WRITE)) { 2139 FMODE_WRITE)) {
2020 /* Use that stateid */ 2140 /* Use that stateid */
@@ -2133,6 +2253,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
2133{ 2253{
2134 struct nfs4_closedata *calldata = data; 2254 struct nfs4_closedata *calldata = data;
2135 struct nfs4_state *state = calldata->state; 2255 struct nfs4_state *state = calldata->state;
2256 struct inode *inode = calldata->inode;
2136 int call_close = 0; 2257 int call_close = 0;
2137 2258
2138 dprintk("%s: begin!\n", __func__); 2259 dprintk("%s: begin!\n", __func__);
@@ -2166,16 +2287,13 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
2166 if (calldata->arg.fmode == 0) { 2287 if (calldata->arg.fmode == 0) {
2167 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; 2288 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
2168 if (calldata->roc && 2289 if (calldata->roc &&
2169 pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) { 2290 pnfs_roc_drain(inode, &calldata->roc_barrier, task))
2170 rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq,
2171 task, NULL);
2172 goto out; 2291 goto out;
2173 }
2174 } 2292 }
2175 2293
2176 nfs_fattr_init(calldata->res.fattr); 2294 nfs_fattr_init(calldata->res.fattr);
2177 calldata->timestamp = jiffies; 2295 calldata->timestamp = jiffies;
2178 if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), 2296 if (nfs4_setup_sequence(NFS_SERVER(inode),
2179 &calldata->arg.seq_args, 2297 &calldata->arg.seq_args,
2180 &calldata->res.seq_res, 2298 &calldata->res.seq_res,
2181 task)) 2299 task))
@@ -2202,7 +2320,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
2202 * 2320 *
2203 * NOTE: Caller must be holding the sp->so_owner semaphore! 2321 * NOTE: Caller must be holding the sp->so_owner semaphore!
2204 */ 2322 */
2205int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc) 2323int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
2206{ 2324{
2207 struct nfs_server *server = NFS_SERVER(state->inode); 2325 struct nfs_server *server = NFS_SERVER(state->inode);
2208 struct nfs4_closedata *calldata; 2326 struct nfs4_closedata *calldata;
@@ -2238,7 +2356,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc)
2238 calldata->res.fattr = &calldata->fattr; 2356 calldata->res.fattr = &calldata->fattr;
2239 calldata->res.seqid = calldata->arg.seqid; 2357 calldata->res.seqid = calldata->arg.seqid;
2240 calldata->res.server = server; 2358 calldata->res.server = server;
2241 calldata->roc = roc; 2359 calldata->roc = pnfs_roc(state->inode);
2242 nfs_sb_active(calldata->inode->i_sb); 2360 nfs_sb_active(calldata->inode->i_sb);
2243 2361
2244 msg.rpc_argp = &calldata->arg; 2362 msg.rpc_argp = &calldata->arg;
@@ -2255,8 +2373,6 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc)
2255out_free_calldata: 2373out_free_calldata:
2256 kfree(calldata); 2374 kfree(calldata);
2257out: 2375out:
2258 if (roc)
2259 pnfs_roc_release(state->inode);
2260 nfs4_put_open_state(state); 2376 nfs4_put_open_state(state);
2261 nfs4_put_state_owner(sp); 2377 nfs4_put_state_owner(sp);
2262 return status; 2378 return status;
@@ -2399,7 +2515,7 @@ static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandl
2399 int ret; 2515 int ret;
2400 2516
2401 auth = rpcauth_create(flavor, server->client); 2517 auth = rpcauth_create(flavor, server->client);
2402 if (!auth) { 2518 if (IS_ERR(auth)) {
2403 ret = -EIO; 2519 ret = -EIO;
2404 goto out; 2520 goto out;
2405 } 2521 }
@@ -2767,13 +2883,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
2767 2883
2768 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); 2884 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
2769 if (!status) { 2885 if (!status) {
2770 entry->mask = 0; 2886 nfs_access_set_mask(entry, res.access);
2771 if (res.access & NFS4_ACCESS_READ)
2772 entry->mask |= MAY_READ;
2773 if (res.access & (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE))
2774 entry->mask |= MAY_WRITE;
2775 if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
2776 entry->mask |= MAY_EXEC;
2777 nfs_refresh_inode(inode, res.fattr); 2887 nfs_refresh_inode(inode, res.fattr);
2778 } 2888 }
2779 nfs_free_fattr(res.fattr); 2889 nfs_free_fattr(res.fattr);
@@ -3215,11 +3325,11 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
3215 dentry->d_parent->d_name.name, 3325 dentry->d_parent->d_name.name,
3216 dentry->d_name.name, 3326 dentry->d_name.name,
3217 (unsigned long long)cookie); 3327 (unsigned long long)cookie);
3218 nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); 3328 nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args);
3219 res.pgbase = args.pgbase; 3329 res.pgbase = args.pgbase;
3220 status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); 3330 status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0);
3221 if (status >= 0) { 3331 if (status >= 0) {
3222 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); 3332 memcpy(NFS_I(dir)->cookieverf, res.verifier.data, NFS4_VERIFIER_SIZE);
3223 status += args.pgbase; 3333 status += args.pgbase;
3224 } 3334 }
3225 3335
@@ -3362,8 +3472,11 @@ static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, s
3362 3472
3363 nfs_fattr_init(fsinfo->fattr); 3473 nfs_fattr_init(fsinfo->fattr);
3364 error = nfs4_do_fsinfo(server, fhandle, fsinfo); 3474 error = nfs4_do_fsinfo(server, fhandle, fsinfo);
3365 if (error == 0) 3475 if (error == 0) {
3476 /* block layout checks this! */
3477 server->pnfs_blksize = fsinfo->blksize;
3366 set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype); 3478 set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype);
3479 }
3367 3480
3368 return error; 3481 return error;
3369} 3482}
@@ -3653,11 +3766,11 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server)
3653 && (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL); 3766 && (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL);
3654} 3767}
3655 3768
3656/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_CACHE_SIZE, and that 3769/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that
3657 * it's OK to put sizeof(void) * (XATTR_SIZE_MAX/PAGE_CACHE_SIZE) bytes on 3770 * it's OK to put sizeof(void) * (XATTR_SIZE_MAX/PAGE_SIZE) bytes on
3658 * the stack. 3771 * the stack.
3659 */ 3772 */
3660#define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT) 3773#define NFS4ACL_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE)
3661 3774
3662static int buf_to_pages_noslab(const void *buf, size_t buflen, 3775static int buf_to_pages_noslab(const void *buf, size_t buflen,
3663 struct page **pages, unsigned int *pgbase) 3776 struct page **pages, unsigned int *pgbase)
@@ -3668,7 +3781,7 @@ static int buf_to_pages_noslab(const void *buf, size_t buflen,
3668 spages = pages; 3781 spages = pages;
3669 3782
3670 do { 3783 do {
3671 len = min_t(size_t, PAGE_CACHE_SIZE, buflen); 3784 len = min_t(size_t, PAGE_SIZE, buflen);
3672 newpage = alloc_page(GFP_KERNEL); 3785 newpage = alloc_page(GFP_KERNEL);
3673 3786
3674 if (newpage == NULL) 3787 if (newpage == NULL)
@@ -3739,7 +3852,7 @@ static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size
3739 struct nfs4_cached_acl *acl; 3852 struct nfs4_cached_acl *acl;
3740 size_t buflen = sizeof(*acl) + acl_len; 3853 size_t buflen = sizeof(*acl) + acl_len;
3741 3854
3742 if (pages && buflen <= PAGE_SIZE) { 3855 if (buflen <= PAGE_SIZE) {
3743 acl = kmalloc(buflen, GFP_KERNEL); 3856 acl = kmalloc(buflen, GFP_KERNEL);
3744 if (acl == NULL) 3857 if (acl == NULL)
3745 goto out; 3858 goto out;
@@ -3782,17 +3895,15 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
3782 .rpc_argp = &args, 3895 .rpc_argp = &args,
3783 .rpc_resp = &res, 3896 .rpc_resp = &res,
3784 }; 3897 };
3785 int ret = -ENOMEM, npages, i; 3898 unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE);
3786 size_t acl_len = 0; 3899 int ret = -ENOMEM, i;
3787 3900
3788 npages = (buflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
3789 /* As long as we're doing a round trip to the server anyway, 3901 /* As long as we're doing a round trip to the server anyway,
3790 * let's be prepared for a page of acl data. */ 3902 * let's be prepared for a page of acl data. */
3791 if (npages == 0) 3903 if (npages == 0)
3792 npages = 1; 3904 npages = 1;
3793 3905 if (npages > ARRAY_SIZE(pages))
3794 /* Add an extra page to handle the bitmap returned */ 3906 return -ERANGE;
3795 npages++;
3796 3907
3797 for (i = 0; i < npages; i++) { 3908 for (i = 0; i < npages; i++) {
3798 pages[i] = alloc_page(GFP_KERNEL); 3909 pages[i] = alloc_page(GFP_KERNEL);
@@ -3808,11 +3919,6 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
3808 args.acl_len = npages * PAGE_SIZE; 3919 args.acl_len = npages * PAGE_SIZE;
3809 args.acl_pgbase = 0; 3920 args.acl_pgbase = 0;
3810 3921
3811 /* Let decode_getfacl know not to fail if the ACL data is larger than
3812 * the page we send as a guess */
3813 if (buf == NULL)
3814 res.acl_flags |= NFS4_ACL_LEN_REQUEST;
3815
3816 dprintk("%s buf %p buflen %zu npages %d args.acl_len %zu\n", 3922 dprintk("%s buf %p buflen %zu npages %d args.acl_len %zu\n",
3817 __func__, buf, buflen, npages, args.acl_len); 3923 __func__, buf, buflen, npages, args.acl_len);
3818 ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), 3924 ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode),
@@ -3820,20 +3926,19 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
3820 if (ret) 3926 if (ret)
3821 goto out_free; 3927 goto out_free;
3822 3928
3823 acl_len = res.acl_len; 3929 /* Handle the case where the passed-in buffer is too short */
3824 if (acl_len > args.acl_len) 3930 if (res.acl_flags & NFS4_ACL_TRUNC) {
3825 nfs4_write_cached_acl(inode, NULL, 0, acl_len); 3931 /* Did the user only issue a request for the acl length? */
3826 else 3932 if (buf == NULL)
3827 nfs4_write_cached_acl(inode, pages, res.acl_data_offset, 3933 goto out_ok;
3828 acl_len);
3829 if (buf) {
3830 ret = -ERANGE; 3934 ret = -ERANGE;
3831 if (acl_len > buflen) 3935 goto out_free;
3832 goto out_free;
3833 _copy_from_pages(buf, pages, res.acl_data_offset,
3834 acl_len);
3835 } 3936 }
3836 ret = acl_len; 3937 nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len);
3938 if (buf)
3939 _copy_from_pages(buf, pages, res.acl_data_offset, res.acl_len);
3940out_ok:
3941 ret = res.acl_len;
3837out_free: 3942out_free:
3838 for (i = 0; i < npages; i++) 3943 for (i = 0; i < npages; i++)
3839 if (pages[i]) 3944 if (pages[i])
@@ -3891,10 +3996,13 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
3891 .rpc_argp = &arg, 3996 .rpc_argp = &arg,
3892 .rpc_resp = &res, 3997 .rpc_resp = &res,
3893 }; 3998 };
3999 unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE);
3894 int ret, i; 4000 int ret, i;
3895 4001
3896 if (!nfs4_server_supports_acls(server)) 4002 if (!nfs4_server_supports_acls(server))
3897 return -EOPNOTSUPP; 4003 return -EOPNOTSUPP;
4004 if (npages > ARRAY_SIZE(pages))
4005 return -ERANGE;
3898 i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase); 4006 i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
3899 if (i < 0) 4007 if (i < 0)
3900 return i; 4008 return i;
@@ -4012,6 +4120,36 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
4012 memcpy(bootverf->data, verf, sizeof(bootverf->data)); 4120 memcpy(bootverf->data, verf, sizeof(bootverf->data));
4013} 4121}
4014 4122
4123static unsigned int
4124nfs4_init_nonuniform_client_string(const struct nfs_client *clp,
4125 char *buf, size_t len)
4126{
4127 unsigned int result;
4128
4129 rcu_read_lock();
4130 result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s",
4131 clp->cl_ipaddr,
4132 rpc_peeraddr2str(clp->cl_rpcclient,
4133 RPC_DISPLAY_ADDR),
4134 rpc_peeraddr2str(clp->cl_rpcclient,
4135 RPC_DISPLAY_PROTO));
4136 rcu_read_unlock();
4137 return result;
4138}
4139
4140static unsigned int
4141nfs4_init_uniform_client_string(const struct nfs_client *clp,
4142 char *buf, size_t len)
4143{
4144 char *nodename = clp->cl_rpcclient->cl_nodename;
4145
4146 if (nfs4_client_id_uniquifier[0] != '\0')
4147 nodename = nfs4_client_id_uniquifier;
4148 return scnprintf(buf, len, "Linux NFSv%u.%u %s",
4149 clp->rpc_ops->version, clp->cl_minorversion,
4150 nodename);
4151}
4152
4015/** 4153/**
4016 * nfs4_proc_setclientid - Negotiate client ID 4154 * nfs4_proc_setclientid - Negotiate client ID
4017 * @clp: state data structure 4155 * @clp: state data structure
@@ -4042,15 +4180,18 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
4042 4180
4043 /* nfs_client_id4 */ 4181 /* nfs_client_id4 */
4044 nfs4_init_boot_verifier(clp, &sc_verifier); 4182 nfs4_init_boot_verifier(clp, &sc_verifier);
4045 rcu_read_lock(); 4183 if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags))
4046 setclientid.sc_name_len = scnprintf(setclientid.sc_name, 4184 setclientid.sc_name_len =
4047 sizeof(setclientid.sc_name), "%s/%s %s", 4185 nfs4_init_uniform_client_string(clp,
4048 clp->cl_ipaddr, 4186 setclientid.sc_name,
4049 rpc_peeraddr2str(clp->cl_rpcclient, 4187 sizeof(setclientid.sc_name));
4050 RPC_DISPLAY_ADDR), 4188 else
4051 rpc_peeraddr2str(clp->cl_rpcclient, 4189 setclientid.sc_name_len =
4052 RPC_DISPLAY_PROTO)); 4190 nfs4_init_nonuniform_client_string(clp,
4191 setclientid.sc_name,
4192 sizeof(setclientid.sc_name));
4053 /* cb_client4 */ 4193 /* cb_client4 */
4194 rcu_read_lock();
4054 setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, 4195 setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
4055 sizeof(setclientid.sc_netid), 4196 sizeof(setclientid.sc_netid),
4056 rpc_peeraddr2str(clp->cl_rpcclient, 4197 rpc_peeraddr2str(clp->cl_rpcclient,
@@ -4396,7 +4537,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
4396 4537
4397 if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) 4538 if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
4398 return; 4539 return;
4399 if ((calldata->lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) { 4540 if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {
4400 /* Note: exit _without_ running nfs4_locku_done */ 4541 /* Note: exit _without_ running nfs4_locku_done */
4401 task->tk_action = NULL; 4542 task->tk_action = NULL;
4402 return; 4543 return;
@@ -4590,7 +4731,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
4590 } 4731 }
4591 if (data->rpc_status == 0) { 4732 if (data->rpc_status == 0) {
4592 nfs4_stateid_copy(&data->lsp->ls_stateid, &data->res.stateid); 4733 nfs4_stateid_copy(&data->lsp->ls_stateid, &data->res.stateid);
4593 data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; 4734 set_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags);
4594 renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp); 4735 renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp);
4595 } 4736 }
4596out: 4737out:
@@ -4637,7 +4778,7 @@ static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_
4637 case -NFS4ERR_BAD_STATEID: 4778 case -NFS4ERR_BAD_STATEID:
4638 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; 4779 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
4639 if (new_lock_owner != 0 || 4780 if (new_lock_owner != 0 ||
4640 (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) 4781 test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0)
4641 nfs4_schedule_stateid_recovery(server, lsp->ls_state); 4782 nfs4_schedule_stateid_recovery(server, lsp->ls_state);
4642 break; 4783 break;
4643 case -NFS4ERR_STALE_STATEID: 4784 case -NFS4ERR_STALE_STATEID:
@@ -4761,7 +4902,7 @@ static int nfs41_check_expired_locks(struct nfs4_state *state)
4761 struct nfs_server *server = NFS_SERVER(state->inode); 4902 struct nfs_server *server = NFS_SERVER(state->inode);
4762 4903
4763 list_for_each_entry(lsp, &state->lock_states, ls_locks) { 4904 list_for_each_entry(lsp, &state->lock_states, ls_locks) {
4764 if (lsp->ls_flags & NFS_LOCK_INITIALIZED) { 4905 if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
4765 status = nfs41_test_stateid(server, &lsp->ls_stateid); 4906 status = nfs41_test_stateid(server, &lsp->ls_stateid);
4766 if (status != NFS_OK) { 4907 if (status != NFS_OK) {
4767 /* Free the stateid unless the server 4908 /* Free the stateid unless the server
@@ -4769,7 +4910,7 @@ static int nfs41_check_expired_locks(struct nfs4_state *state)
4769 if (status != -NFS4ERR_BAD_STATEID) 4910 if (status != -NFS4ERR_BAD_STATEID)
4770 nfs41_free_stateid(server, 4911 nfs41_free_stateid(server,
4771 &lsp->ls_stateid); 4912 &lsp->ls_stateid);
4772 lsp->ls_flags &= ~NFS_LOCK_INITIALIZED; 4913 clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
4773 ret = status; 4914 ret = status;
4774 } 4915 }
4775 } 4916 }
@@ -5272,10 +5413,8 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
5272 }; 5413 };
5273 5414
5274 nfs4_init_boot_verifier(clp, &verifier); 5415 nfs4_init_boot_verifier(clp, &verifier);
5275 args.id_len = scnprintf(args.id, sizeof(args.id), 5416 args.id_len = nfs4_init_uniform_client_string(clp, args.id,
5276 "%s/%s", 5417 sizeof(args.id));
5277 clp->cl_ipaddr,
5278 clp->cl_rpcclient->cl_nodename);
5279 dprintk("NFS call exchange_id auth=%s, '%.*s'\n", 5418 dprintk("NFS call exchange_id auth=%s, '%.*s'\n",
5280 clp->cl_rpcclient->cl_auth->au_ops->au_name, 5419 clp->cl_rpcclient->cl_auth->au_ops->au_name,
5281 args.id_len, args.id); 5420 args.id_len, args.id);
@@ -5396,6 +5535,8 @@ int nfs4_destroy_clientid(struct nfs_client *clp)
5396 goto out; 5535 goto out;
5397 if (clp->cl_exchange_flags == 0) 5536 if (clp->cl_exchange_flags == 0)
5398 goto out; 5537 goto out;
5538 if (clp->cl_preserve_clid)
5539 goto out;
5399 cred = nfs4_get_exchange_id_cred(clp); 5540 cred = nfs4_get_exchange_id_cred(clp);
5400 ret = nfs4_proc_destroy_clientid(clp, cred); 5541 ret = nfs4_proc_destroy_clientid(clp, cred);
5401 if (cred) 5542 if (cred)
@@ -6201,26 +6342,44 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
6201static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) 6342static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
6202{ 6343{
6203 struct nfs4_layoutget *lgp = calldata; 6344 struct nfs4_layoutget *lgp = calldata;
6204 struct nfs_server *server = NFS_SERVER(lgp->args.inode); 6345 struct inode *inode = lgp->args.inode;
6346 struct nfs_server *server = NFS_SERVER(inode);
6347 struct pnfs_layout_hdr *lo;
6348 struct nfs4_state *state = NULL;
6205 6349
6206 dprintk("--> %s\n", __func__); 6350 dprintk("--> %s\n", __func__);
6207 6351
6208 if (!nfs4_sequence_done(task, &lgp->res.seq_res)) 6352 if (!nfs4_sequence_done(task, &lgp->res.seq_res))
6209 return; 6353 goto out;
6210 6354
6211 switch (task->tk_status) { 6355 switch (task->tk_status) {
6212 case 0: 6356 case 0:
6213 break; 6357 goto out;
6214 case -NFS4ERR_LAYOUTTRYLATER: 6358 case -NFS4ERR_LAYOUTTRYLATER:
6215 case -NFS4ERR_RECALLCONFLICT: 6359 case -NFS4ERR_RECALLCONFLICT:
6216 task->tk_status = -NFS4ERR_DELAY; 6360 task->tk_status = -NFS4ERR_DELAY;
6217 /* Fall through */ 6361 break;
6218 default: 6362 case -NFS4ERR_EXPIRED:
6219 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { 6363 case -NFS4ERR_BAD_STATEID:
6220 rpc_restart_call_prepare(task); 6364 spin_lock(&inode->i_lock);
6221 return; 6365 lo = NFS_I(inode)->layout;
6366 if (!lo || list_empty(&lo->plh_segs)) {
6367 spin_unlock(&inode->i_lock);
6368 /* If the open stateid was bad, then recover it. */
6369 state = lgp->args.ctx->state;
6370 } else {
6371 LIST_HEAD(head);
6372
6373 pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
6374 spin_unlock(&inode->i_lock);
6375 /* Mark the bad layout state as invalid, then
6376 * retry using the open stateid. */
6377 pnfs_free_lseg_list(&head);
6222 } 6378 }
6223 } 6379 }
6380 if (nfs4_async_handle_error(task, server, state) == -EAGAIN)
6381 rpc_restart_call_prepare(task);
6382out:
6224 dprintk("<-- %s\n", __func__); 6383 dprintk("<-- %s\n", __func__);
6225} 6384}
6226 6385
@@ -6287,7 +6446,8 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = {
6287 .rpc_release = nfs4_layoutget_release, 6446 .rpc_release = nfs4_layoutget_release,
6288}; 6447};
6289 6448
6290void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) 6449struct pnfs_layout_segment *
6450nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
6291{ 6451{
6292 struct nfs_server *server = NFS_SERVER(lgp->args.inode); 6452 struct nfs_server *server = NFS_SERVER(lgp->args.inode);
6293 size_t max_pages = max_response_pages(server); 6453 size_t max_pages = max_response_pages(server);
@@ -6304,6 +6464,7 @@ void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
6304 .callback_data = lgp, 6464 .callback_data = lgp,
6305 .flags = RPC_TASK_ASYNC, 6465 .flags = RPC_TASK_ASYNC,
6306 }; 6466 };
6467 struct pnfs_layout_segment *lseg = NULL;
6307 int status = 0; 6468 int status = 0;
6308 6469
6309 dprintk("--> %s\n", __func__); 6470 dprintk("--> %s\n", __func__);
@@ -6311,7 +6472,7 @@ void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
6311 lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags); 6472 lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags);
6312 if (!lgp->args.layout.pages) { 6473 if (!lgp->args.layout.pages) {
6313 nfs4_layoutget_release(lgp); 6474 nfs4_layoutget_release(lgp);
6314 return; 6475 return ERR_PTR(-ENOMEM);
6315 } 6476 }
6316 lgp->args.layout.pglen = max_pages * PAGE_SIZE; 6477 lgp->args.layout.pglen = max_pages * PAGE_SIZE;
6317 6478
@@ -6320,15 +6481,17 @@ void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
6320 nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0); 6481 nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0);
6321 task = rpc_run_task(&task_setup_data); 6482 task = rpc_run_task(&task_setup_data);
6322 if (IS_ERR(task)) 6483 if (IS_ERR(task))
6323 return; 6484 return ERR_CAST(task);
6324 status = nfs4_wait_for_completion_rpc_task(task); 6485 status = nfs4_wait_for_completion_rpc_task(task);
6325 if (status == 0) 6486 if (status == 0)
6326 status = task->tk_status; 6487 status = task->tk_status;
6327 if (status == 0) 6488 if (status == 0)
6328 status = pnfs_layout_process(lgp); 6489 lseg = pnfs_layout_process(lgp);
6329 rpc_put_task(task); 6490 rpc_put_task(task);
6330 dprintk("<-- %s status=%d\n", __func__, status); 6491 dprintk("<-- %s status=%d\n", __func__, status);
6331 return; 6492 if (status)
6493 return ERR_PTR(status);
6494 return lseg;
6332} 6495}
6333 6496
6334static void 6497static void
@@ -6347,7 +6510,6 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
6347{ 6510{
6348 struct nfs4_layoutreturn *lrp = calldata; 6511 struct nfs4_layoutreturn *lrp = calldata;
6349 struct nfs_server *server; 6512 struct nfs_server *server;
6350 struct pnfs_layout_hdr *lo = lrp->args.layout;
6351 6513
6352 dprintk("--> %s\n", __func__); 6514 dprintk("--> %s\n", __func__);
6353 6515
@@ -6359,20 +6521,21 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
6359 rpc_restart_call_prepare(task); 6521 rpc_restart_call_prepare(task);
6360 return; 6522 return;
6361 } 6523 }
6362 spin_lock(&lo->plh_inode->i_lock);
6363 if (task->tk_status == 0 && lrp->res.lrs_present)
6364 pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
6365 lo->plh_block_lgets--;
6366 spin_unlock(&lo->plh_inode->i_lock);
6367 dprintk("<-- %s\n", __func__); 6524 dprintk("<-- %s\n", __func__);
6368} 6525}
6369 6526
6370static void nfs4_layoutreturn_release(void *calldata) 6527static void nfs4_layoutreturn_release(void *calldata)
6371{ 6528{
6372 struct nfs4_layoutreturn *lrp = calldata; 6529 struct nfs4_layoutreturn *lrp = calldata;
6530 struct pnfs_layout_hdr *lo = lrp->args.layout;
6373 6531
6374 dprintk("--> %s\n", __func__); 6532 dprintk("--> %s\n", __func__);
6375 put_layout_hdr(lrp->args.layout); 6533 spin_lock(&lo->plh_inode->i_lock);
6534 if (lrp->res.lrs_present)
6535 pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
6536 lo->plh_block_lgets--;
6537 spin_unlock(&lo->plh_inode->i_lock);
6538 pnfs_put_layout_hdr(lrp->args.layout);
6376 kfree(calldata); 6539 kfree(calldata);
6377 dprintk("<-- %s\n", __func__); 6540 dprintk("<-- %s\n", __func__);
6378} 6541}
@@ -6546,7 +6709,7 @@ static void nfs4_layoutcommit_release(void *calldata)
6546 list_del_init(&lseg->pls_lc_list); 6709 list_del_init(&lseg->pls_lc_list);
6547 if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, 6710 if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT,
6548 &lseg->pls_flags)) 6711 &lseg->pls_flags))
6549 put_lseg(lseg); 6712 pnfs_put_lseg(lseg);
6550 } 6713 }
6551 6714
6552 clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); 6715 clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
@@ -6805,6 +6968,7 @@ static const struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
6805 .recover_lock = nfs4_lock_reclaim, 6968 .recover_lock = nfs4_lock_reclaim,
6806 .establish_clid = nfs4_init_clientid, 6969 .establish_clid = nfs4_init_clientid,
6807 .get_clid_cred = nfs4_get_setclientid_cred, 6970 .get_clid_cred = nfs4_get_setclientid_cred,
6971 .detect_trunking = nfs40_discover_server_trunking,
6808}; 6972};
6809 6973
6810#if defined(CONFIG_NFS_V4_1) 6974#if defined(CONFIG_NFS_V4_1)
@@ -6816,6 +6980,7 @@ static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
6816 .establish_clid = nfs41_init_clientid, 6980 .establish_clid = nfs41_init_clientid,
6817 .get_clid_cred = nfs4_get_exchange_id_cred, 6981 .get_clid_cred = nfs4_get_exchange_id_cred,
6818 .reclaim_complete = nfs41_proc_reclaim_complete, 6982 .reclaim_complete = nfs41_proc_reclaim_complete,
6983 .detect_trunking = nfs41_discover_server_trunking,
6819}; 6984};
6820#endif /* CONFIG_NFS_V4_1 */ 6985#endif /* CONFIG_NFS_V4_1 */
6821 6986
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 6930bec91bca..1720d32ffa54 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -117,8 +117,7 @@ nfs4_schedule_state_renewal(struct nfs_client *clp)
117 timeout = 5 * HZ; 117 timeout = 5 * HZ;
118 dprintk("%s: requeueing work. Lease period = %ld\n", 118 dprintk("%s: requeueing work. Lease period = %ld\n",
119 __func__, (timeout + HZ - 1) / HZ); 119 __func__, (timeout + HZ - 1) / HZ);
120 cancel_delayed_work(&clp->cl_renewd); 120 mod_delayed_work(system_wq, &clp->cl_renewd, timeout);
121 schedule_delayed_work(&clp->cl_renewd, timeout);
122 set_bit(NFS_CS_RENEWD, &clp->cl_res_state); 121 set_bit(NFS_CS_RENEWD, &clp->cl_res_state);
123 spin_unlock(&clp->cl_lock); 122 spin_unlock(&clp->cl_lock);
124} 123}
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 55148def5540..c351e6b39838 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -51,18 +51,21 @@
51#include <linux/bitops.h> 51#include <linux/bitops.h>
52#include <linux/jiffies.h> 52#include <linux/jiffies.h>
53 53
54#include <linux/sunrpc/clnt.h>
55
54#include "nfs4_fs.h" 56#include "nfs4_fs.h"
55#include "callback.h" 57#include "callback.h"
56#include "delegation.h" 58#include "delegation.h"
57#include "internal.h" 59#include "internal.h"
58#include "pnfs.h" 60#include "pnfs.h"
61#include "netns.h"
59 62
60#define NFSDBG_FACILITY NFSDBG_STATE 63#define NFSDBG_FACILITY NFSDBG_STATE
61 64
62#define OPENOWNER_POOL_SIZE 8 65#define OPENOWNER_POOL_SIZE 8
63 66
64const nfs4_stateid zero_stateid; 67const nfs4_stateid zero_stateid;
65 68static DEFINE_MUTEX(nfs_clid_init_mutex);
66static LIST_HEAD(nfs4_clientid_list); 69static LIST_HEAD(nfs4_clientid_list);
67 70
68int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) 71int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
@@ -73,12 +76,13 @@ int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
73 }; 76 };
74 unsigned short port; 77 unsigned short port;
75 int status; 78 int status;
79 struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
76 80
77 if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state)) 81 if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state))
78 goto do_confirm; 82 goto do_confirm;
79 port = nfs_callback_tcpport; 83 port = nn->nfs_callback_tcpport;
80 if (clp->cl_addr.ss_family == AF_INET6) 84 if (clp->cl_addr.ss_family == AF_INET6)
81 port = nfs_callback_tcpport6; 85 port = nn->nfs_callback_tcpport6;
82 86
83 status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid); 87 status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
84 if (status != 0) 88 if (status != 0)
@@ -96,6 +100,56 @@ out:
96 return status; 100 return status;
97} 101}
98 102
103/**
104 * nfs40_discover_server_trunking - Detect server IP address trunking (mv0)
105 *
106 * @clp: nfs_client under test
107 * @result: OUT: found nfs_client, or clp
108 * @cred: credential to use for trunking test
109 *
110 * Returns zero, a negative errno, or a negative NFS4ERR status.
111 * If zero is returned, an nfs_client pointer is planted in
112 * "result".
113 *
114 * Note: The returned client may not yet be marked ready.
115 */
116int nfs40_discover_server_trunking(struct nfs_client *clp,
117 struct nfs_client **result,
118 struct rpc_cred *cred)
119{
120 struct nfs4_setclientid_res clid = {
121 .clientid = clp->cl_clientid,
122 .confirm = clp->cl_confirm,
123 };
124 struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
125 unsigned short port;
126 int status;
127
128 port = nn->nfs_callback_tcpport;
129 if (clp->cl_addr.ss_family == AF_INET6)
130 port = nn->nfs_callback_tcpport6;
131
132 status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
133 if (status != 0)
134 goto out;
135 clp->cl_clientid = clid.clientid;
136 clp->cl_confirm = clid.confirm;
137
138 status = nfs40_walk_client_list(clp, result, cred);
139 switch (status) {
140 case -NFS4ERR_STALE_CLIENTID:
141 set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
142 case 0:
143 /* Sustain the lease, even if it's empty. If the clientid4
144 * goes stale it's of no use for trunking discovery. */
145 nfs4_schedule_state_renewal(*result);
146 break;
147 }
148
149out:
150 return status;
151}
152
99struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp) 153struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)
100{ 154{
101 struct rpc_cred *cred = NULL; 155 struct rpc_cred *cred = NULL;
@@ -275,6 +329,33 @@ out:
275 return status; 329 return status;
276} 330}
277 331
332/**
333 * nfs41_discover_server_trunking - Detect server IP address trunking (mv1)
334 *
335 * @clp: nfs_client under test
336 * @result: OUT: found nfs_client, or clp
337 * @cred: credential to use for trunking test
338 *
339 * Returns NFS4_OK, a negative errno, or a negative NFS4ERR status.
340 * If NFS4_OK is returned, an nfs_client pointer is planted in
341 * "result".
342 *
343 * Note: The returned client may not yet be marked ready.
344 */
345int nfs41_discover_server_trunking(struct nfs_client *clp,
346 struct nfs_client **result,
347 struct rpc_cred *cred)
348{
349 int status;
350
351 status = nfs4_proc_exchange_id(clp, cred);
352 if (status != NFS4_OK)
353 return status;
354 set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
355
356 return nfs41_walk_client_list(clp, result, cred);
357}
358
278struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp) 359struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp)
279{ 360{
280 struct rpc_cred *cred; 361 struct rpc_cred *cred;
@@ -729,11 +810,8 @@ static void __nfs4_close(struct nfs4_state *state,
729 if (!call_close) { 810 if (!call_close) {
730 nfs4_put_open_state(state); 811 nfs4_put_open_state(state);
731 nfs4_put_state_owner(owner); 812 nfs4_put_state_owner(owner);
732 } else { 813 } else
733 bool roc = pnfs_roc(state->inode); 814 nfs4_do_close(state, gfp_mask, wait);
734
735 nfs4_do_close(state, gfp_mask, wait, roc);
736 }
737} 815}
738 816
739void nfs4_close_state(struct nfs4_state *state, fmode_t fmode) 817void nfs4_close_state(struct nfs4_state *state, fmode_t fmode)
@@ -865,7 +943,7 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
865 if (list_empty(&state->lock_states)) 943 if (list_empty(&state->lock_states))
866 clear_bit(LK_STATE_IN_USE, &state->flags); 944 clear_bit(LK_STATE_IN_USE, &state->flags);
867 spin_unlock(&state->state_lock); 945 spin_unlock(&state->state_lock);
868 if (lsp->ls_flags & NFS_LOCK_INITIALIZED) { 946 if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
869 if (nfs4_release_lockowner(lsp) == 0) 947 if (nfs4_release_lockowner(lsp) == 0)
870 return; 948 return;
871 } 949 }
@@ -911,17 +989,25 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
911} 989}
912 990
913static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state, 991static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state,
914 fl_owner_t fl_owner, pid_t fl_pid) 992 const struct nfs_lockowner *lockowner)
915{ 993{
916 struct nfs4_lock_state *lsp; 994 struct nfs4_lock_state *lsp;
995 fl_owner_t fl_owner;
996 pid_t fl_pid;
917 bool ret = false; 997 bool ret = false;
918 998
999
1000 if (lockowner == NULL)
1001 goto out;
1002
919 if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) 1003 if (test_bit(LK_STATE_IN_USE, &state->flags) == 0)
920 goto out; 1004 goto out;
921 1005
1006 fl_owner = lockowner->l_owner;
1007 fl_pid = lockowner->l_pid;
922 spin_lock(&state->state_lock); 1008 spin_lock(&state->state_lock);
923 lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); 1009 lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
924 if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) { 1010 if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
925 nfs4_stateid_copy(dst, &lsp->ls_stateid); 1011 nfs4_stateid_copy(dst, &lsp->ls_stateid);
926 ret = true; 1012 ret = true;
927 } 1013 }
@@ -946,11 +1032,11 @@ static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
946 * requests. 1032 * requests.
947 */ 1033 */
948void nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, 1034void nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,
949 fmode_t fmode, fl_owner_t fl_owner, pid_t fl_pid) 1035 fmode_t fmode, const struct nfs_lockowner *lockowner)
950{ 1036{
951 if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) 1037 if (nfs4_copy_delegation_stateid(dst, state->inode, fmode))
952 return; 1038 return;
953 if (nfs4_copy_lock_stateid(dst, state, fl_owner, fl_pid)) 1039 if (nfs4_copy_lock_stateid(dst, state, lockowner))
954 return; 1040 return;
955 nfs4_copy_open_stateid(dst, state); 1041 nfs4_copy_open_stateid(dst, state);
956} 1042}
@@ -1289,7 +1375,7 @@ restart:
1289 if (status >= 0) { 1375 if (status >= 0) {
1290 spin_lock(&state->state_lock); 1376 spin_lock(&state->state_lock);
1291 list_for_each_entry(lock, &state->lock_states, ls_locks) { 1377 list_for_each_entry(lock, &state->lock_states, ls_locks) {
1292 if (!(lock->ls_flags & NFS_LOCK_INITIALIZED)) 1378 if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
1293 pr_warn_ratelimited("NFS: " 1379 pr_warn_ratelimited("NFS: "
1294 "%s: Lock reclaim " 1380 "%s: Lock reclaim "
1295 "failed!\n", __func__); 1381 "failed!\n", __func__);
@@ -1361,7 +1447,7 @@ static void nfs4_clear_open_state(struct nfs4_state *state)
1361 spin_lock(&state->state_lock); 1447 spin_lock(&state->state_lock);
1362 list_for_each_entry(lock, &state->lock_states, ls_locks) { 1448 list_for_each_entry(lock, &state->lock_states, ls_locks) {
1363 lock->ls_seqid.flags = 0; 1449 lock->ls_seqid.flags = 0;
1364 lock->ls_flags &= ~NFS_LOCK_INITIALIZED; 1450 clear_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags);
1365 } 1451 }
1366 spin_unlock(&state->state_lock); 1452 spin_unlock(&state->state_lock);
1367} 1453}
@@ -1595,8 +1681,8 @@ out:
1595 return nfs4_recovery_handle_error(clp, status); 1681 return nfs4_recovery_handle_error(clp, status);
1596} 1682}
1597 1683
1598/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors 1684/* Set NFS4CLNT_LEASE_EXPIRED and reclaim reboot state for all v4.0 errors
1599 * on EXCHANGE_ID for v4.1 1685 * and for recoverable errors on EXCHANGE_ID for v4.1
1600 */ 1686 */
1601static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) 1687static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
1602{ 1688{
@@ -1606,8 +1692,12 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
1606 return -ESERVERFAULT; 1692 return -ESERVERFAULT;
1607 /* Lease confirmation error: retry after purging the lease */ 1693 /* Lease confirmation error: retry after purging the lease */
1608 ssleep(1); 1694 ssleep(1);
1695 clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
1696 break;
1609 case -NFS4ERR_STALE_CLIENTID: 1697 case -NFS4ERR_STALE_CLIENTID:
1610 clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); 1698 clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
1699 nfs4_state_clear_reclaim_reboot(clp);
1700 nfs4_state_start_reclaim_reboot(clp);
1611 break; 1701 break;
1612 case -NFS4ERR_CLID_INUSE: 1702 case -NFS4ERR_CLID_INUSE:
1613 pr_err("NFS: Server %s reports our clientid is in use\n", 1703 pr_err("NFS: Server %s reports our clientid is in use\n",
@@ -1698,6 +1788,109 @@ static int nfs4_purge_lease(struct nfs_client *clp)
1698 return 0; 1788 return 0;
1699} 1789}
1700 1790
1791/**
1792 * nfs4_discover_server_trunking - Detect server IP address trunking
1793 *
1794 * @clp: nfs_client under test
1795 * @result: OUT: found nfs_client, or clp
1796 *
1797 * Returns zero or a negative errno. If zero is returned,
1798 * an nfs_client pointer is planted in "result".
1799 *
1800 * Note: since we are invoked in process context, and
1801 * not from inside the state manager, we cannot use
1802 * nfs4_handle_reclaim_lease_error().
1803 */
1804int nfs4_discover_server_trunking(struct nfs_client *clp,
1805 struct nfs_client **result)
1806{
1807 const struct nfs4_state_recovery_ops *ops =
1808 clp->cl_mvops->reboot_recovery_ops;
1809 rpc_authflavor_t *flavors, flav, save;
1810 struct rpc_clnt *clnt;
1811 struct rpc_cred *cred;
1812 int i, len, status;
1813
1814 dprintk("NFS: %s: testing '%s'\n", __func__, clp->cl_hostname);
1815
1816 len = NFS_MAX_SECFLAVORS;
1817 flavors = kcalloc(len, sizeof(*flavors), GFP_KERNEL);
1818 if (flavors == NULL) {
1819 status = -ENOMEM;
1820 goto out;
1821 }
1822 len = rpcauth_list_flavors(flavors, len);
1823 if (len < 0) {
1824 status = len;
1825 goto out_free;
1826 }
1827 clnt = clp->cl_rpcclient;
1828 save = clnt->cl_auth->au_flavor;
1829 i = 0;
1830
1831 mutex_lock(&nfs_clid_init_mutex);
1832 status = -ENOENT;
1833again:
1834 cred = ops->get_clid_cred(clp);
1835 if (cred == NULL)
1836 goto out_unlock;
1837
1838 status = ops->detect_trunking(clp, result, cred);
1839 put_rpccred(cred);
1840 switch (status) {
1841 case 0:
1842 break;
1843
1844 case -EACCES:
1845 if (clp->cl_machine_cred == NULL)
1846 break;
1847 /* Handle case where the user hasn't set up machine creds */
1848 nfs4_clear_machine_cred(clp);
1849 case -NFS4ERR_DELAY:
1850 case -ETIMEDOUT:
1851 case -EAGAIN:
1852 ssleep(1);
1853 dprintk("NFS: %s after status %d, retrying\n",
1854 __func__, status);
1855 goto again;
1856
1857 case -NFS4ERR_CLID_INUSE:
1858 case -NFS4ERR_WRONGSEC:
1859 status = -EPERM;
1860 if (i >= len)
1861 break;
1862
1863 flav = flavors[i++];
1864 if (flav == save)
1865 flav = flavors[i++];
1866 clnt = rpc_clone_client_set_auth(clnt, flav);
1867 if (IS_ERR(clnt)) {
1868 status = PTR_ERR(clnt);
1869 break;
1870 }
1871 clp->cl_rpcclient = clnt;
1872 goto again;
1873
1874 case -NFS4ERR_MINOR_VERS_MISMATCH:
1875 status = -EPROTONOSUPPORT;
1876 break;
1877
1878 case -EKEYEXPIRED:
1879 nfs4_warn_keyexpired(clp->cl_hostname);
1880 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
1881 * in nfs4_exchange_id */
1882 status = -EKEYEXPIRED;
1883 }
1884
1885out_unlock:
1886 mutex_unlock(&nfs_clid_init_mutex);
1887out_free:
1888 kfree(flavors);
1889out:
1890 dprintk("NFS: %s: status = %d\n", __func__, status);
1891 return status;
1892}
1893
1701#ifdef CONFIG_NFS_V4_1 1894#ifdef CONFIG_NFS_V4_1
1702void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) 1895void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
1703{ 1896{
@@ -2008,6 +2201,7 @@ out_error:
2008 pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s" 2201 pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s"
2009 " with error %d\n", section_sep, section, 2202 " with error %d\n", section_sep, section,
2010 clp->cl_hostname, -status); 2203 clp->cl_hostname, -status);
2204 ssleep(1);
2011 nfs4_end_drain_session(clp); 2205 nfs4_end_drain_session(clp);
2012 nfs4_clear_state_manager_bit(clp); 2206 nfs4_clear_state_manager_bit(clp);
2013} 2207}
diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c
index 5729bc8aa75d..2628d921b7e3 100644
--- a/fs/nfs/nfs4sysctl.c
+++ b/fs/nfs/nfs4sysctl.c
@@ -9,6 +9,7 @@
9#include <linux/nfs_idmap.h> 9#include <linux/nfs_idmap.h>
10#include <linux/nfs_fs.h> 10#include <linux/nfs_fs.h>
11 11
12#include "nfs4_fs.h"
12#include "callback.h" 13#include "callback.h"
13 14
14static const int nfs_set_port_min = 0; 15static const int nfs_set_port_min = 0;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1bfbd67c556d..40836ee5dc3a 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -447,12 +447,14 @@ static int nfs4_stat_to_errno(int);
447 encode_sequence_maxsz + \ 447 encode_sequence_maxsz + \
448 encode_putfh_maxsz + \ 448 encode_putfh_maxsz + \
449 encode_open_maxsz + \ 449 encode_open_maxsz + \
450 encode_access_maxsz + \
450 encode_getfh_maxsz + \ 451 encode_getfh_maxsz + \
451 encode_getattr_maxsz) 452 encode_getattr_maxsz)
452#define NFS4_dec_open_sz (compound_decode_hdr_maxsz + \ 453#define NFS4_dec_open_sz (compound_decode_hdr_maxsz + \
453 decode_sequence_maxsz + \ 454 decode_sequence_maxsz + \
454 decode_putfh_maxsz + \ 455 decode_putfh_maxsz + \
455 decode_open_maxsz + \ 456 decode_open_maxsz + \
457 decode_access_maxsz + \
456 decode_getfh_maxsz + \ 458 decode_getfh_maxsz + \
457 decode_getattr_maxsz) 459 decode_getattr_maxsz)
458#define NFS4_enc_open_confirm_sz \ 460#define NFS4_enc_open_confirm_sz \
@@ -467,11 +469,13 @@ static int nfs4_stat_to_errno(int);
467 encode_sequence_maxsz + \ 469 encode_sequence_maxsz + \
468 encode_putfh_maxsz + \ 470 encode_putfh_maxsz + \
469 encode_open_maxsz + \ 471 encode_open_maxsz + \
472 encode_access_maxsz + \
470 encode_getattr_maxsz) 473 encode_getattr_maxsz)
471#define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \ 474#define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \
472 decode_sequence_maxsz + \ 475 decode_sequence_maxsz + \
473 decode_putfh_maxsz + \ 476 decode_putfh_maxsz + \
474 decode_open_maxsz + \ 477 decode_open_maxsz + \
478 decode_access_maxsz + \
475 decode_getattr_maxsz) 479 decode_getattr_maxsz)
476#define NFS4_enc_open_downgrade_sz \ 480#define NFS4_enc_open_downgrade_sz \
477 (compound_encode_hdr_maxsz + \ 481 (compound_encode_hdr_maxsz + \
@@ -1509,8 +1513,12 @@ static void encode_open_stateid(struct xdr_stream *xdr,
1509 nfs4_stateid stateid; 1513 nfs4_stateid stateid;
1510 1514
1511 if (ctx->state != NULL) { 1515 if (ctx->state != NULL) {
1516 const struct nfs_lockowner *lockowner = NULL;
1517
1518 if (l_ctx != NULL)
1519 lockowner = &l_ctx->lockowner;
1512 nfs4_select_rw_stateid(&stateid, ctx->state, 1520 nfs4_select_rw_stateid(&stateid, ctx->state,
1513 fmode, l_ctx->lockowner, l_ctx->pid); 1521 fmode, lockowner);
1514 if (zero_seqid) 1522 if (zero_seqid)
1515 stateid.seqid = 0; 1523 stateid.seqid = 0;
1516 encode_nfs4_stateid(xdr, &stateid); 1524 encode_nfs4_stateid(xdr, &stateid);
@@ -2216,6 +2224,8 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
2216 encode_putfh(xdr, args->fh, &hdr); 2224 encode_putfh(xdr, args->fh, &hdr);
2217 encode_open(xdr, args, &hdr); 2225 encode_open(xdr, args, &hdr);
2218 encode_getfh(xdr, &hdr); 2226 encode_getfh(xdr, &hdr);
2227 if (args->access)
2228 encode_access(xdr, args->access, &hdr);
2219 encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); 2229 encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr);
2220 encode_nops(&hdr); 2230 encode_nops(&hdr);
2221} 2231}
@@ -2252,7 +2262,9 @@ static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req,
2252 encode_sequence(xdr, &args->seq_args, &hdr); 2262 encode_sequence(xdr, &args->seq_args, &hdr);
2253 encode_putfh(xdr, args->fh, &hdr); 2263 encode_putfh(xdr, args->fh, &hdr);
2254 encode_open(xdr, args, &hdr); 2264 encode_open(xdr, args, &hdr);
2255 encode_getfattr(xdr, args->bitmask, &hdr); 2265 if (args->access)
2266 encode_access(xdr, args->access, &hdr);
2267 encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr);
2256 encode_nops(&hdr); 2268 encode_nops(&hdr);
2257} 2269}
2258 2270
@@ -4095,7 +4107,7 @@ out_overflow:
4095 return -EIO; 4107 return -EIO;
4096} 4108}
4097 4109
4098static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) 4110static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access)
4099{ 4111{
4100 __be32 *p; 4112 __be32 *p;
4101 uint32_t supp, acc; 4113 uint32_t supp, acc;
@@ -4109,8 +4121,8 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
4109 goto out_overflow; 4121 goto out_overflow;
4110 supp = be32_to_cpup(p++); 4122 supp = be32_to_cpup(p++);
4111 acc = be32_to_cpup(p); 4123 acc = be32_to_cpup(p);
4112 access->supported = supp; 4124 *supported = supp;
4113 access->access = acc; 4125 *access = acc;
4114 return 0; 4126 return 0;
4115out_overflow: 4127out_overflow:
4116 print_overflow_msg(__func__, xdr); 4128 print_overflow_msg(__func__, xdr);
@@ -5072,18 +5084,14 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
5072 * are stored with the acl data to handle the problem of 5084 * are stored with the acl data to handle the problem of
5073 * variable length bitmaps.*/ 5085 * variable length bitmaps.*/
5074 res->acl_data_offset = xdr_stream_pos(xdr) - pg_offset; 5086 res->acl_data_offset = xdr_stream_pos(xdr) - pg_offset;
5075
5076 /* We ignore &savep and don't do consistency checks on
5077 * the attr length. Let userspace figure it out.... */
5078 res->acl_len = attrlen; 5087 res->acl_len = attrlen;
5079 if (attrlen > (xdr->nwords << 2)) { 5088
5080 if (res->acl_flags & NFS4_ACL_LEN_REQUEST) { 5089 /* Check for receive buffer overflow */
5081 /* getxattr interface called with a NULL buf */ 5090 if (res->acl_len > (xdr->nwords << 2) ||
5082 goto out; 5091 res->acl_len + res->acl_data_offset > xdr->buf->page_len) {
5083 } 5092 res->acl_flags |= NFS4_ACL_TRUNC;
5084 dprintk("NFS: acl reply: attrlen %u > page_len %u\n", 5093 dprintk("NFS: acl reply: attrlen %u > page_len %u\n",
5085 attrlen, xdr->nwords << 2); 5094 attrlen, xdr->nwords << 2);
5086 return -EINVAL;
5087 } 5095 }
5088 } else 5096 } else
5089 status = -EOPNOTSUPP; 5097 status = -EOPNOTSUPP;
@@ -5646,7 +5654,8 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
5646 * and places the remaining xdr data in xdr_buf->tail 5654 * and places the remaining xdr data in xdr_buf->tail
5647 */ 5655 */
5648 pdev->mincount = be32_to_cpup(p); 5656 pdev->mincount = be32_to_cpup(p);
5649 xdr_read_pages(xdr, pdev->mincount); /* include space for the length */ 5657 if (xdr_read_pages(xdr, pdev->mincount) != pdev->mincount)
5658 goto out_overflow;
5650 5659
5651 /* Parse notification bitmap, verifying that it is zero. */ 5660 /* Parse notification bitmap, verifying that it is zero. */
5652 p = xdr_inline_decode(xdr, 4); 5661 p = xdr_inline_decode(xdr, 4);
@@ -5891,7 +5900,7 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5891 status = decode_putfh(xdr); 5900 status = decode_putfh(xdr);
5892 if (status != 0) 5901 if (status != 0)
5893 goto out; 5902 goto out;
5894 status = decode_access(xdr, res); 5903 status = decode_access(xdr, &res->supported, &res->access);
5895 if (status != 0) 5904 if (status != 0)
5896 goto out; 5905 goto out;
5897 decode_getfattr(xdr, res->fattr, res->server); 5906 decode_getfattr(xdr, res->fattr, res->server);
@@ -6229,8 +6238,11 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6229 status = decode_open(xdr, res); 6238 status = decode_open(xdr, res);
6230 if (status) 6239 if (status)
6231 goto out; 6240 goto out;
6232 if (decode_getfh(xdr, &res->fh) != 0) 6241 status = decode_getfh(xdr, &res->fh);
6242 if (status)
6233 goto out; 6243 goto out;
6244 if (res->access_request)
6245 decode_access(xdr, &res->access_supported, &res->access_result);
6234 decode_getfattr(xdr, res->f_attr, res->server); 6246 decode_getfattr(xdr, res->f_attr, res->server);
6235out: 6247out:
6236 return status; 6248 return status;
@@ -6279,6 +6291,8 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp,
6279 status = decode_open(xdr, res); 6291 status = decode_open(xdr, res);
6280 if (status) 6292 if (status)
6281 goto out; 6293 goto out;
6294 if (res->access_request)
6295 decode_access(xdr, &res->access_supported, &res->access_result);
6282 decode_getfattr(xdr, res->f_attr, res->server); 6296 decode_getfattr(xdr, res->f_attr, res->server);
6283out: 6297out:
6284 return status; 6298 return status;
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index ea6d111b03e9..be731e6b7b9c 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -41,6 +41,7 @@
41#include <scsi/osd_ore.h> 41#include <scsi/osd_ore.h>
42 42
43#include "objlayout.h" 43#include "objlayout.h"
44#include "../internal.h"
44 45
45#define NFSDBG_FACILITY NFSDBG_PNFS_LD 46#define NFSDBG_FACILITY NFSDBG_PNFS_LD
46 47
@@ -606,8 +607,14 @@ static bool aligned_on_raid_stripe(u64 offset, struct ore_layout *layout,
606void objio_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 607void objio_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
607{ 608{
608 unsigned long stripe_end = 0; 609 unsigned long stripe_end = 0;
610 u64 wb_size;
609 611
610 pnfs_generic_pg_init_write(pgio, req); 612 if (pgio->pg_dreq == NULL)
613 wb_size = i_size_read(pgio->pg_inode) - req_offset(req);
614 else
615 wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
616
617 pnfs_generic_pg_init_write(pgio, req, wb_size);
611 if (unlikely(pgio->pg_lseg == NULL)) 618 if (unlikely(pgio->pg_lseg == NULL))
612 return; /* Not pNFS */ 619 return; /* Not pNFS */
613 620
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 311a79681e2b..e56e846e9d2d 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -102,6 +102,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
102 unsigned int offset, unsigned int count) 102 unsigned int offset, unsigned int count)
103{ 103{
104 struct nfs_page *req; 104 struct nfs_page *req;
105 struct nfs_lock_context *l_ctx;
105 106
106 /* try to allocate the request struct */ 107 /* try to allocate the request struct */
107 req = nfs_page_alloc(); 108 req = nfs_page_alloc();
@@ -109,11 +110,12 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
109 return ERR_PTR(-ENOMEM); 110 return ERR_PTR(-ENOMEM);
110 111
111 /* get lock context early so we can deal with alloc failures */ 112 /* get lock context early so we can deal with alloc failures */
112 req->wb_lock_context = nfs_get_lock_context(ctx); 113 l_ctx = nfs_get_lock_context(ctx);
113 if (req->wb_lock_context == NULL) { 114 if (IS_ERR(l_ctx)) {
114 nfs_page_free(req); 115 nfs_page_free(req);
115 return ERR_PTR(-ENOMEM); 116 return ERR_CAST(l_ctx);
116 } 117 }
118 req->wb_lock_context = l_ctx;
117 119
118 /* Initialize the request struct. Initially, we assume a 120 /* Initialize the request struct. Initially, we assume a
119 * long write-back delay. This will be adjusted in 121 * long write-back delay. This will be adjusted in
@@ -290,7 +292,9 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
290{ 292{
291 if (req->wb_context->cred != prev->wb_context->cred) 293 if (req->wb_context->cred != prev->wb_context->cred)
292 return false; 294 return false;
293 if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) 295 if (req->wb_lock_context->lockowner.l_owner != prev->wb_lock_context->lockowner.l_owner)
296 return false;
297 if (req->wb_lock_context->lockowner.l_pid != prev->wb_lock_context->lockowner.l_pid)
294 return false; 298 return false;
295 if (req->wb_context->state != prev->wb_context->state) 299 if (req->wb_context->state != prev->wb_context->state)
296 return false; 300 return false;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 2e00feacd4be..fe624c91bd00 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -35,6 +35,7 @@
35#include "iostat.h" 35#include "iostat.h"
36 36
37#define NFSDBG_FACILITY NFSDBG_PNFS 37#define NFSDBG_FACILITY NFSDBG_PNFS
38#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
38 39
39/* Locking: 40/* Locking:
40 * 41 *
@@ -190,7 +191,7 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
190 191
191/* Need to hold i_lock if caller does not already hold reference */ 192/* Need to hold i_lock if caller does not already hold reference */
192void 193void
193get_layout_hdr(struct pnfs_layout_hdr *lo) 194pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo)
194{ 195{
195 atomic_inc(&lo->plh_refcount); 196 atomic_inc(&lo->plh_refcount);
196} 197}
@@ -199,43 +200,107 @@ static struct pnfs_layout_hdr *
199pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags) 200pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
200{ 201{
201 struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; 202 struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
202 return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino, gfp_flags) : 203 return ld->alloc_layout_hdr(ino, gfp_flags);
203 kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags);
204} 204}
205 205
206static void 206static void
207pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) 207pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
208{ 208{
209 struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld; 209 struct nfs_server *server = NFS_SERVER(lo->plh_inode);
210 struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
211
212 if (!list_empty(&lo->plh_layouts)) {
213 struct nfs_client *clp = server->nfs_client;
214
215 spin_lock(&clp->cl_lock);
216 list_del_init(&lo->plh_layouts);
217 spin_unlock(&clp->cl_lock);
218 }
210 put_rpccred(lo->plh_lc_cred); 219 put_rpccred(lo->plh_lc_cred);
211 return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo); 220 return ld->free_layout_hdr(lo);
212} 221}
213 222
214static void 223static void
215destroy_layout_hdr(struct pnfs_layout_hdr *lo) 224pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo)
216{ 225{
226 struct nfs_inode *nfsi = NFS_I(lo->plh_inode);
217 dprintk("%s: freeing layout cache %p\n", __func__, lo); 227 dprintk("%s: freeing layout cache %p\n", __func__, lo);
218 BUG_ON(!list_empty(&lo->plh_layouts)); 228 nfsi->layout = NULL;
219 NFS_I(lo->plh_inode)->layout = NULL; 229 /* Reset MDS Threshold I/O counters */
220 pnfs_free_layout_hdr(lo); 230 nfsi->write_io = 0;
231 nfsi->read_io = 0;
232}
233
234void
235pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
236{
237 struct inode *inode = lo->plh_inode;
238
239 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
240 pnfs_detach_layout_hdr(lo);
241 spin_unlock(&inode->i_lock);
242 pnfs_free_layout_hdr(lo);
243 }
244}
245
246static int
247pnfs_iomode_to_fail_bit(u32 iomode)
248{
249 return iomode == IOMODE_RW ?
250 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
221} 251}
222 252
223static void 253static void
224put_layout_hdr_locked(struct pnfs_layout_hdr *lo) 254pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
225{ 255{
226 if (atomic_dec_and_test(&lo->plh_refcount)) 256 lo->plh_retry_timestamp = jiffies;
227 destroy_layout_hdr(lo); 257 if (test_and_set_bit(fail_bit, &lo->plh_flags))
258 atomic_inc(&lo->plh_refcount);
228} 259}
229 260
230void 261static void
231put_layout_hdr(struct pnfs_layout_hdr *lo) 262pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
263{
264 if (test_and_clear_bit(fail_bit, &lo->plh_flags))
265 atomic_dec(&lo->plh_refcount);
266}
267
268static void
269pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
232{ 270{
233 struct inode *inode = lo->plh_inode; 271 struct inode *inode = lo->plh_inode;
272 struct pnfs_layout_range range = {
273 .iomode = iomode,
274 .offset = 0,
275 .length = NFS4_MAX_UINT64,
276 };
277 LIST_HEAD(head);
234 278
235 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { 279 spin_lock(&inode->i_lock);
236 destroy_layout_hdr(lo); 280 pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
237 spin_unlock(&inode->i_lock); 281 pnfs_mark_matching_lsegs_invalid(lo, &head, &range);
282 spin_unlock(&inode->i_lock);
283 pnfs_free_lseg_list(&head);
284 dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
285 iomode == IOMODE_RW ? "RW" : "READ");
286}
287
288static bool
289pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
290{
291 unsigned long start, end;
292 int fail_bit = pnfs_iomode_to_fail_bit(iomode);
293
294 if (test_bit(fail_bit, &lo->plh_flags) == 0)
295 return false;
296 end = jiffies;
297 start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT;
298 if (!time_in_range(lo->plh_retry_timestamp, start, end)) {
299 /* It is time to retry the failed layoutgets */
300 pnfs_layout_clear_fail_bit(lo, fail_bit);
301 return false;
238 } 302 }
303 return true;
239} 304}
240 305
241static void 306static void
@@ -249,33 +314,32 @@ init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
249 lseg->pls_layout = lo; 314 lseg->pls_layout = lo;
250} 315}
251 316
252static void free_lseg(struct pnfs_layout_segment *lseg) 317static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
253{ 318{
254 struct inode *ino = lseg->pls_layout->plh_inode; 319 struct inode *ino = lseg->pls_layout->plh_inode;
255 320
256 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); 321 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
257 /* Matched by get_layout_hdr in pnfs_insert_layout */
258 put_layout_hdr(NFS_I(ino)->layout);
259} 322}
260 323
261static void 324static void
262put_lseg_common(struct pnfs_layout_segment *lseg) 325pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
326 struct pnfs_layout_segment *lseg)
263{ 327{
264 struct inode *inode = lseg->pls_layout->plh_inode; 328 struct inode *inode = lo->plh_inode;
265 329
266 WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 330 WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
267 list_del_init(&lseg->pls_list); 331 list_del_init(&lseg->pls_list);
268 if (list_empty(&lseg->pls_layout->plh_segs)) { 332 /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
269 set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags); 333 atomic_dec(&lo->plh_refcount);
270 /* Matched by initial refcount set in alloc_init_layout_hdr */ 334 if (list_empty(&lo->plh_segs))
271 put_layout_hdr_locked(lseg->pls_layout); 335 clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
272 }
273 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); 336 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
274} 337}
275 338
276void 339void
277put_lseg(struct pnfs_layout_segment *lseg) 340pnfs_put_lseg(struct pnfs_layout_segment *lseg)
278{ 341{
342 struct pnfs_layout_hdr *lo;
279 struct inode *inode; 343 struct inode *inode;
280 344
281 if (!lseg) 345 if (!lseg)
@@ -284,17 +348,17 @@ put_lseg(struct pnfs_layout_segment *lseg)
284 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, 348 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
285 atomic_read(&lseg->pls_refcount), 349 atomic_read(&lseg->pls_refcount),
286 test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 350 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
287 inode = lseg->pls_layout->plh_inode; 351 lo = lseg->pls_layout;
352 inode = lo->plh_inode;
288 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { 353 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
289 LIST_HEAD(free_me); 354 pnfs_get_layout_hdr(lo);
290 355 pnfs_layout_remove_lseg(lo, lseg);
291 put_lseg_common(lseg);
292 list_add(&lseg->pls_list, &free_me);
293 spin_unlock(&inode->i_lock); 356 spin_unlock(&inode->i_lock);
294 pnfs_free_lseg_list(&free_me); 357 pnfs_free_lseg(lseg);
358 pnfs_put_layout_hdr(lo);
295 } 359 }
296} 360}
297EXPORT_SYMBOL_GPL(put_lseg); 361EXPORT_SYMBOL_GPL(pnfs_put_lseg);
298 362
299static inline u64 363static inline u64
300end_offset(u64 start, u64 len) 364end_offset(u64 start, u64 len)
@@ -378,7 +442,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
378 dprintk("%s: lseg %p ref %d\n", __func__, lseg, 442 dprintk("%s: lseg %p ref %d\n", __func__, lseg,
379 atomic_read(&lseg->pls_refcount)); 443 atomic_read(&lseg->pls_refcount));
380 if (atomic_dec_and_test(&lseg->pls_refcount)) { 444 if (atomic_dec_and_test(&lseg->pls_refcount)) {
381 put_lseg_common(lseg); 445 pnfs_layout_remove_lseg(lseg->pls_layout, lseg);
382 list_add(&lseg->pls_list, tmp_list); 446 list_add(&lseg->pls_list, tmp_list);
383 rv = 1; 447 rv = 1;
384 } 448 }
@@ -390,7 +454,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
390 * after call. 454 * after call.
391 */ 455 */
392int 456int
393mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, 457pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
394 struct list_head *tmp_list, 458 struct list_head *tmp_list,
395 struct pnfs_layout_range *recall_range) 459 struct pnfs_layout_range *recall_range)
396{ 460{
@@ -399,14 +463,8 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
399 463
400 dprintk("%s:Begin lo %p\n", __func__, lo); 464 dprintk("%s:Begin lo %p\n", __func__, lo);
401 465
402 if (list_empty(&lo->plh_segs)) { 466 if (list_empty(&lo->plh_segs))
403 /* Reset MDS Threshold I/O counters */
404 NFS_I(lo->plh_inode)->write_io = 0;
405 NFS_I(lo->plh_inode)->read_io = 0;
406 if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
407 put_layout_hdr_locked(lo);
408 return 0; 467 return 0;
409 }
410 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) 468 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
411 if (!recall_range || 469 if (!recall_range ||
412 should_free_lseg(&lseg->pls_range, recall_range)) { 470 should_free_lseg(&lseg->pls_range, recall_range)) {
@@ -426,25 +484,13 @@ void
426pnfs_free_lseg_list(struct list_head *free_me) 484pnfs_free_lseg_list(struct list_head *free_me)
427{ 485{
428 struct pnfs_layout_segment *lseg, *tmp; 486 struct pnfs_layout_segment *lseg, *tmp;
429 struct pnfs_layout_hdr *lo;
430 487
431 if (list_empty(free_me)) 488 if (list_empty(free_me))
432 return; 489 return;
433 490
434 lo = list_first_entry(free_me, struct pnfs_layout_segment,
435 pls_list)->pls_layout;
436
437 if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) {
438 struct nfs_client *clp;
439
440 clp = NFS_SERVER(lo->plh_inode)->nfs_client;
441 spin_lock(&clp->cl_lock);
442 list_del_init(&lo->plh_layouts);
443 spin_unlock(&clp->cl_lock);
444 }
445 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { 491 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
446 list_del(&lseg->pls_list); 492 list_del(&lseg->pls_list);
447 free_lseg(lseg); 493 pnfs_free_lseg(lseg);
448 } 494 }
449} 495}
450 496
@@ -458,10 +504,15 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
458 lo = nfsi->layout; 504 lo = nfsi->layout;
459 if (lo) { 505 if (lo) {
460 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ 506 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
461 mark_matching_lsegs_invalid(lo, &tmp_list, NULL); 507 pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
462 } 508 pnfs_get_layout_hdr(lo);
463 spin_unlock(&nfsi->vfs_inode.i_lock); 509 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
464 pnfs_free_lseg_list(&tmp_list); 510 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
511 spin_unlock(&nfsi->vfs_inode.i_lock);
512 pnfs_free_lseg_list(&tmp_list);
513 pnfs_put_layout_hdr(lo);
514 } else
515 spin_unlock(&nfsi->vfs_inode.i_lock);
465} 516}
466EXPORT_SYMBOL_GPL(pnfs_destroy_layout); 517EXPORT_SYMBOL_GPL(pnfs_destroy_layout);
467 518
@@ -498,46 +549,54 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
498 } 549 }
499} 550}
500 551
552/*
553 * Compare 2 layout stateid sequence ids, to see which is newer,
554 * taking into account wraparound issues.
555 */
556static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
557{
558 return (s32)s1 - (s32)s2 > 0;
559}
560
501/* update lo->plh_stateid with new if is more recent */ 561/* update lo->plh_stateid with new if is more recent */
502void 562void
503pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, 563pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
504 bool update_barrier) 564 bool update_barrier)
505{ 565{
506 u32 oldseq, newseq; 566 u32 oldseq, newseq, new_barrier;
567 int empty = list_empty(&lo->plh_segs);
507 568
508 oldseq = be32_to_cpu(lo->plh_stateid.seqid); 569 oldseq = be32_to_cpu(lo->plh_stateid.seqid);
509 newseq = be32_to_cpu(new->seqid); 570 newseq = be32_to_cpu(new->seqid);
510 if ((int)(newseq - oldseq) > 0) { 571 if (empty || pnfs_seqid_is_newer(newseq, oldseq)) {
511 nfs4_stateid_copy(&lo->plh_stateid, new); 572 nfs4_stateid_copy(&lo->plh_stateid, new);
512 if (update_barrier) { 573 if (update_barrier) {
513 u32 new_barrier = be32_to_cpu(new->seqid); 574 new_barrier = be32_to_cpu(new->seqid);
514
515 if ((int)(new_barrier - lo->plh_barrier))
516 lo->plh_barrier = new_barrier;
517 } else { 575 } else {
518 /* Because of wraparound, we want to keep the barrier 576 /* Because of wraparound, we want to keep the barrier
519 * "close" to the current seqids. It needs to be 577 * "close" to the current seqids.
520 * within 2**31 to count as "behind", so if it
521 * gets too near that limit, give us a litle leeway
522 * and bring it to within 2**30.
523 * NOTE - and yes, this is all unsigned arithmetic.
524 */ 578 */
525 if (unlikely((newseq - lo->plh_barrier) > (3 << 29))) 579 new_barrier = newseq - atomic_read(&lo->plh_outstanding);
526 lo->plh_barrier = newseq - (1 << 30);
527 } 580 }
581 if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
582 lo->plh_barrier = new_barrier;
528 } 583 }
529} 584}
530 585
586static bool
587pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
588 const nfs4_stateid *stateid)
589{
590 u32 seqid = be32_to_cpu(stateid->seqid);
591
592 return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
593}
594
531/* lget is set to 1 if called from inside send_layoutget call chain */ 595/* lget is set to 1 if called from inside send_layoutget call chain */
532static bool 596static bool
533pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, 597pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, int lget)
534 int lget)
535{ 598{
536 if ((stateid) &&
537 (int)(lo->plh_barrier - be32_to_cpu(stateid->seqid)) >= 0)
538 return true;
539 return lo->plh_block_lgets || 599 return lo->plh_block_lgets ||
540 test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
541 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 600 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
542 (list_empty(&lo->plh_segs) && 601 (list_empty(&lo->plh_segs) &&
543 (atomic_read(&lo->plh_outstanding) > lget)); 602 (atomic_read(&lo->plh_outstanding) > lget));
@@ -551,7 +610,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
551 610
552 dprintk("--> %s\n", __func__); 611 dprintk("--> %s\n", __func__);
553 spin_lock(&lo->plh_inode->i_lock); 612 spin_lock(&lo->plh_inode->i_lock);
554 if (pnfs_layoutgets_blocked(lo, NULL, 1)) { 613 if (pnfs_layoutgets_blocked(lo, 1)) {
555 status = -EAGAIN; 614 status = -EAGAIN;
556 } else if (list_empty(&lo->plh_segs)) { 615 } else if (list_empty(&lo->plh_segs)) {
557 int seq; 616 int seq;
@@ -582,7 +641,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
582 struct inode *ino = lo->plh_inode; 641 struct inode *ino = lo->plh_inode;
583 struct nfs_server *server = NFS_SERVER(ino); 642 struct nfs_server *server = NFS_SERVER(ino);
584 struct nfs4_layoutget *lgp; 643 struct nfs4_layoutget *lgp;
585 struct pnfs_layout_segment *lseg = NULL; 644 struct pnfs_layout_segment *lseg;
586 645
587 dprintk("--> %s\n", __func__); 646 dprintk("--> %s\n", __func__);
588 647
@@ -599,16 +658,22 @@ send_layoutget(struct pnfs_layout_hdr *lo,
599 lgp->args.type = server->pnfs_curr_ld->id; 658 lgp->args.type = server->pnfs_curr_ld->id;
600 lgp->args.inode = ino; 659 lgp->args.inode = ino;
601 lgp->args.ctx = get_nfs_open_context(ctx); 660 lgp->args.ctx = get_nfs_open_context(ctx);
602 lgp->lsegpp = &lseg;
603 lgp->gfp_flags = gfp_flags; 661 lgp->gfp_flags = gfp_flags;
604 662
605 /* Synchronously retrieve layout information from server and 663 /* Synchronously retrieve layout information from server and
606 * store in lseg. 664 * store in lseg.
607 */ 665 */
608 nfs4_proc_layoutget(lgp, gfp_flags); 666 lseg = nfs4_proc_layoutget(lgp, gfp_flags);
609 if (!lseg) { 667 if (IS_ERR(lseg)) {
610 /* remember that LAYOUTGET failed and suspend trying */ 668 switch (PTR_ERR(lseg)) {
611 set_bit(lo_fail_bit(range->iomode), &lo->plh_flags); 669 case -ENOMEM:
670 case -ERESTARTSYS:
671 break;
672 default:
673 /* remember that LAYOUTGET failed and suspend trying */
674 pnfs_layout_io_set_failed(lo, range->iomode);
675 }
676 return NULL;
612 } 677 }
613 678
614 return lseg; 679 return lseg;
@@ -636,25 +701,24 @@ _pnfs_return_layout(struct inode *ino)
636 701
637 spin_lock(&ino->i_lock); 702 spin_lock(&ino->i_lock);
638 lo = nfsi->layout; 703 lo = nfsi->layout;
639 if (!lo || pnfs_test_layout_returned(lo)) { 704 if (!lo) {
640 spin_unlock(&ino->i_lock); 705 spin_unlock(&ino->i_lock);
641 dprintk("NFS: %s no layout to return\n", __func__); 706 dprintk("NFS: %s no layout to return\n", __func__);
642 goto out; 707 goto out;
643 } 708 }
644 stateid = nfsi->layout->plh_stateid; 709 stateid = nfsi->layout->plh_stateid;
645 /* Reference matched in nfs4_layoutreturn_release */ 710 /* Reference matched in nfs4_layoutreturn_release */
646 get_layout_hdr(lo); 711 pnfs_get_layout_hdr(lo);
647 empty = list_empty(&lo->plh_segs); 712 empty = list_empty(&lo->plh_segs);
648 mark_matching_lsegs_invalid(lo, &tmp_list, NULL); 713 pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
649 /* Don't send a LAYOUTRETURN if list was initially empty */ 714 /* Don't send a LAYOUTRETURN if list was initially empty */
650 if (empty) { 715 if (empty) {
651 spin_unlock(&ino->i_lock); 716 spin_unlock(&ino->i_lock);
652 put_layout_hdr(lo); 717 pnfs_put_layout_hdr(lo);
653 dprintk("NFS: %s no layout segments to return\n", __func__); 718 dprintk("NFS: %s no layout segments to return\n", __func__);
654 goto out; 719 goto out;
655 } 720 }
656 lo->plh_block_lgets++; 721 lo->plh_block_lgets++;
657 pnfs_mark_layout_returned(lo);
658 spin_unlock(&ino->i_lock); 722 spin_unlock(&ino->i_lock);
659 pnfs_free_lseg_list(&tmp_list); 723 pnfs_free_lseg_list(&tmp_list);
660 724
@@ -663,10 +727,10 @@ _pnfs_return_layout(struct inode *ino)
663 lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); 727 lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
664 if (unlikely(lrp == NULL)) { 728 if (unlikely(lrp == NULL)) {
665 status = -ENOMEM; 729 status = -ENOMEM;
666 set_bit(NFS_LAYOUT_RW_FAILED, &lo->plh_flags); 730 spin_lock(&ino->i_lock);
667 set_bit(NFS_LAYOUT_RO_FAILED, &lo->plh_flags); 731 lo->plh_block_lgets--;
668 pnfs_clear_layout_returned(lo); 732 spin_unlock(&ino->i_lock);
669 put_layout_hdr(lo); 733 pnfs_put_layout_hdr(lo);
670 goto out; 734 goto out;
671 } 735 }
672 736
@@ -703,7 +767,7 @@ bool pnfs_roc(struct inode *ino)
703 if (!found) 767 if (!found)
704 goto out_nolayout; 768 goto out_nolayout;
705 lo->plh_block_lgets++; 769 lo->plh_block_lgets++;
706 get_layout_hdr(lo); /* matched in pnfs_roc_release */ 770 pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
707 spin_unlock(&ino->i_lock); 771 spin_unlock(&ino->i_lock);
708 pnfs_free_lseg_list(&tmp_list); 772 pnfs_free_lseg_list(&tmp_list);
709 return true; 773 return true;
@@ -720,8 +784,12 @@ void pnfs_roc_release(struct inode *ino)
720 spin_lock(&ino->i_lock); 784 spin_lock(&ino->i_lock);
721 lo = NFS_I(ino)->layout; 785 lo = NFS_I(ino)->layout;
722 lo->plh_block_lgets--; 786 lo->plh_block_lgets--;
723 put_layout_hdr_locked(lo); 787 if (atomic_dec_and_test(&lo->plh_refcount)) {
724 spin_unlock(&ino->i_lock); 788 pnfs_detach_layout_hdr(lo);
789 spin_unlock(&ino->i_lock);
790 pnfs_free_layout_hdr(lo);
791 } else
792 spin_unlock(&ino->i_lock);
725} 793}
726 794
727void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) 795void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
@@ -730,32 +798,34 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
730 798
731 spin_lock(&ino->i_lock); 799 spin_lock(&ino->i_lock);
732 lo = NFS_I(ino)->layout; 800 lo = NFS_I(ino)->layout;
733 if ((int)(barrier - lo->plh_barrier) > 0) 801 if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
734 lo->plh_barrier = barrier; 802 lo->plh_barrier = barrier;
735 spin_unlock(&ino->i_lock); 803 spin_unlock(&ino->i_lock);
736} 804}
737 805
738bool pnfs_roc_drain(struct inode *ino, u32 *barrier) 806bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
739{ 807{
740 struct nfs_inode *nfsi = NFS_I(ino); 808 struct nfs_inode *nfsi = NFS_I(ino);
809 struct pnfs_layout_hdr *lo;
741 struct pnfs_layout_segment *lseg; 810 struct pnfs_layout_segment *lseg;
811 u32 current_seqid;
742 bool found = false; 812 bool found = false;
743 813
744 spin_lock(&ino->i_lock); 814 spin_lock(&ino->i_lock);
745 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) 815 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
746 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { 816 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
817 rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
747 found = true; 818 found = true;
748 break; 819 goto out;
749 } 820 }
750 if (!found) { 821 lo = nfsi->layout;
751 struct pnfs_layout_hdr *lo = nfsi->layout; 822 current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
752 u32 current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
753 823
754 /* Since close does not return a layout stateid for use as 824 /* Since close does not return a layout stateid for use as
755 * a barrier, we choose the worst-case barrier. 825 * a barrier, we choose the worst-case barrier.
756 */ 826 */
757 *barrier = current_seqid + atomic_read(&lo->plh_outstanding); 827 *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
758 } 828out:
759 spin_unlock(&ino->i_lock); 829 spin_unlock(&ino->i_lock);
760 return found; 830 return found;
761} 831}
@@ -786,14 +856,13 @@ cmp_layout(struct pnfs_layout_range *l1,
786} 856}
787 857
788static void 858static void
789pnfs_insert_layout(struct pnfs_layout_hdr *lo, 859pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
790 struct pnfs_layout_segment *lseg) 860 struct pnfs_layout_segment *lseg)
791{ 861{
792 struct pnfs_layout_segment *lp; 862 struct pnfs_layout_segment *lp;
793 863
794 dprintk("%s:Begin\n", __func__); 864 dprintk("%s:Begin\n", __func__);
795 865
796 assert_spin_locked(&lo->plh_inode->i_lock);
797 list_for_each_entry(lp, &lo->plh_segs, pls_list) { 866 list_for_each_entry(lp, &lo->plh_segs, pls_list) {
798 if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0) 867 if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
799 continue; 868 continue;
@@ -813,7 +882,7 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
813 __func__, lseg, lseg->pls_range.iomode, 882 __func__, lseg, lseg->pls_range.iomode,
814 lseg->pls_range.offset, lseg->pls_range.length); 883 lseg->pls_range.offset, lseg->pls_range.length);
815out: 884out:
816 get_layout_hdr(lo); 885 pnfs_get_layout_hdr(lo);
817 886
818 dprintk("%s:Return\n", __func__); 887 dprintk("%s:Return\n", __func__);
819} 888}
@@ -847,21 +916,19 @@ pnfs_find_alloc_layout(struct inode *ino,
847 916
848 dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); 917 dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
849 918
850 assert_spin_locked(&ino->i_lock); 919 if (nfsi->layout != NULL)
851 if (nfsi->layout) { 920 goto out_existing;
852 if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags))
853 return NULL;
854 else
855 return nfsi->layout;
856 }
857 spin_unlock(&ino->i_lock); 921 spin_unlock(&ino->i_lock);
858 new = alloc_init_layout_hdr(ino, ctx, gfp_flags); 922 new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
859 spin_lock(&ino->i_lock); 923 spin_lock(&ino->i_lock);
860 924
861 if (likely(nfsi->layout == NULL)) /* Won the race? */ 925 if (likely(nfsi->layout == NULL)) { /* Won the race? */
862 nfsi->layout = new; 926 nfsi->layout = new;
863 else 927 return new;
864 pnfs_free_layout_hdr(new); 928 }
929 pnfs_free_layout_hdr(new);
930out_existing:
931 pnfs_get_layout_hdr(nfsi->layout);
865 return nfsi->layout; 932 return nfsi->layout;
866} 933}
867 934
@@ -904,11 +971,10 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
904 971
905 dprintk("%s:Begin\n", __func__); 972 dprintk("%s:Begin\n", __func__);
906 973
907 assert_spin_locked(&lo->plh_inode->i_lock);
908 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 974 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
909 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 975 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
910 is_matching_lseg(&lseg->pls_range, range)) { 976 is_matching_lseg(&lseg->pls_range, range)) {
911 ret = get_lseg(lseg); 977 ret = pnfs_get_lseg(lseg);
912 break; 978 break;
913 } 979 }
914 if (lseg->pls_range.offset > range->offset) 980 if (lseg->pls_range.offset > range->offset)
@@ -1013,7 +1079,6 @@ pnfs_update_layout(struct inode *ino,
1013 .length = count, 1079 .length = count,
1014 }; 1080 };
1015 unsigned pg_offset; 1081 unsigned pg_offset;
1016 struct nfs_inode *nfsi = NFS_I(ino);
1017 struct nfs_server *server = NFS_SERVER(ino); 1082 struct nfs_server *server = NFS_SERVER(ino);
1018 struct nfs_client *clp = server->nfs_client; 1083 struct nfs_client *clp = server->nfs_client;
1019 struct pnfs_layout_hdr *lo; 1084 struct pnfs_layout_hdr *lo;
@@ -1021,16 +1086,16 @@ pnfs_update_layout(struct inode *ino,
1021 bool first = false; 1086 bool first = false;
1022 1087
1023 if (!pnfs_enabled_sb(NFS_SERVER(ino))) 1088 if (!pnfs_enabled_sb(NFS_SERVER(ino)))
1024 return NULL; 1089 goto out;
1025 1090
1026 if (pnfs_within_mdsthreshold(ctx, ino, iomode)) 1091 if (pnfs_within_mdsthreshold(ctx, ino, iomode))
1027 return NULL; 1092 goto out;
1028 1093
1029 spin_lock(&ino->i_lock); 1094 spin_lock(&ino->i_lock);
1030 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); 1095 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
1031 if (lo == NULL) { 1096 if (lo == NULL) {
1032 dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); 1097 spin_unlock(&ino->i_lock);
1033 goto out_unlock; 1098 goto out;
1034 } 1099 }
1035 1100
1036 /* Do we even need to bother with this? */ 1101 /* Do we even need to bother with this? */
@@ -1040,7 +1105,7 @@ pnfs_update_layout(struct inode *ino,
1040 } 1105 }
1041 1106
1042 /* if LAYOUTGET already failed once we don't try again */ 1107 /* if LAYOUTGET already failed once we don't try again */
1043 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags)) 1108 if (pnfs_layout_io_test_failed(lo, iomode))
1044 goto out_unlock; 1109 goto out_unlock;
1045 1110
1046 /* Check to see if the layout for the given range already exists */ 1111 /* Check to see if the layout for the given range already exists */
@@ -1048,17 +1113,13 @@ pnfs_update_layout(struct inode *ino,
1048 if (lseg) 1113 if (lseg)
1049 goto out_unlock; 1114 goto out_unlock;
1050 1115
1051 if (pnfs_layoutgets_blocked(lo, NULL, 0)) 1116 if (pnfs_layoutgets_blocked(lo, 0))
1052 goto out_unlock; 1117 goto out_unlock;
1053 atomic_inc(&lo->plh_outstanding); 1118 atomic_inc(&lo->plh_outstanding);
1054 1119
1055 get_layout_hdr(lo);
1056 if (list_empty(&lo->plh_segs)) 1120 if (list_empty(&lo->plh_segs))
1057 first = true; 1121 first = true;
1058 1122
1059 /* Enable LAYOUTRETURNs */
1060 pnfs_clear_layout_returned(lo);
1061
1062 spin_unlock(&ino->i_lock); 1123 spin_unlock(&ino->i_lock);
1063 if (first) { 1124 if (first) {
1064 /* The lo must be on the clp list if there is any 1125 /* The lo must be on the clp list if there is any
@@ -1079,24 +1140,26 @@ pnfs_update_layout(struct inode *ino,
1079 arg.length = PAGE_CACHE_ALIGN(arg.length); 1140 arg.length = PAGE_CACHE_ALIGN(arg.length);
1080 1141
1081 lseg = send_layoutget(lo, ctx, &arg, gfp_flags); 1142 lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
1082 if (!lseg && first) {
1083 spin_lock(&clp->cl_lock);
1084 list_del_init(&lo->plh_layouts);
1085 spin_unlock(&clp->cl_lock);
1086 }
1087 atomic_dec(&lo->plh_outstanding); 1143 atomic_dec(&lo->plh_outstanding);
1088 put_layout_hdr(lo); 1144out_put_layout_hdr:
1145 pnfs_put_layout_hdr(lo);
1089out: 1146out:
1090 dprintk("%s end, state 0x%lx lseg %p\n", __func__, 1147 dprintk("%s: inode %s/%llu pNFS layout segment %s for "
1091 nfsi->layout ? nfsi->layout->plh_flags : -1, lseg); 1148 "(%s, offset: %llu, length: %llu)\n",
1149 __func__, ino->i_sb->s_id,
1150 (unsigned long long)NFS_FILEID(ino),
1151 lseg == NULL ? "not found" : "found",
1152 iomode==IOMODE_RW ? "read/write" : "read-only",
1153 (unsigned long long)pos,
1154 (unsigned long long)count);
1092 return lseg; 1155 return lseg;
1093out_unlock: 1156out_unlock:
1094 spin_unlock(&ino->i_lock); 1157 spin_unlock(&ino->i_lock);
1095 goto out; 1158 goto out_put_layout_hdr;
1096} 1159}
1097EXPORT_SYMBOL_GPL(pnfs_update_layout); 1160EXPORT_SYMBOL_GPL(pnfs_update_layout);
1098 1161
1099int 1162struct pnfs_layout_segment *
1100pnfs_layout_process(struct nfs4_layoutget *lgp) 1163pnfs_layout_process(struct nfs4_layoutget *lgp)
1101{ 1164{
1102 struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; 1165 struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
@@ -1123,25 +1186,29 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1123 goto out_forget_reply; 1186 goto out_forget_reply;
1124 } 1187 }
1125 1188
1126 if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) { 1189 if (pnfs_layoutgets_blocked(lo, 1) ||
1190 pnfs_layout_stateid_blocked(lo, &res->stateid)) {
1127 dprintk("%s forget reply due to state\n", __func__); 1191 dprintk("%s forget reply due to state\n", __func__);
1128 goto out_forget_reply; 1192 goto out_forget_reply;
1129 } 1193 }
1194
1195 /* Done processing layoutget. Set the layout stateid */
1196 pnfs_set_layout_stateid(lo, &res->stateid, false);
1197
1130 init_lseg(lo, lseg); 1198 init_lseg(lo, lseg);
1131 lseg->pls_range = res->range; 1199 lseg->pls_range = res->range;
1132 *lgp->lsegpp = get_lseg(lseg); 1200 pnfs_get_lseg(lseg);
1133 pnfs_insert_layout(lo, lseg); 1201 pnfs_layout_insert_lseg(lo, lseg);
1134 1202
1135 if (res->return_on_close) { 1203 if (res->return_on_close) {
1136 set_bit(NFS_LSEG_ROC, &lseg->pls_flags); 1204 set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
1137 set_bit(NFS_LAYOUT_ROC, &lo->plh_flags); 1205 set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
1138 } 1206 }
1139 1207
1140 /* Done processing layoutget. Set the layout stateid */
1141 pnfs_set_layout_stateid(lo, &res->stateid, false);
1142 spin_unlock(&ino->i_lock); 1208 spin_unlock(&ino->i_lock);
1209 return lseg;
1143out: 1210out:
1144 return status; 1211 return ERR_PTR(status);
1145 1212
1146out_forget_reply: 1213out_forget_reply:
1147 spin_unlock(&ino->i_lock); 1214 spin_unlock(&ino->i_lock);
@@ -1153,16 +1220,24 @@ out_forget_reply:
1153void 1220void
1154pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 1221pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1155{ 1222{
1223 u64 rd_size = req->wb_bytes;
1224
1156 BUG_ON(pgio->pg_lseg != NULL); 1225 BUG_ON(pgio->pg_lseg != NULL);
1157 1226
1158 if (req->wb_offset != req->wb_pgbase) { 1227 if (req->wb_offset != req->wb_pgbase) {
1159 nfs_pageio_reset_read_mds(pgio); 1228 nfs_pageio_reset_read_mds(pgio);
1160 return; 1229 return;
1161 } 1230 }
1231
1232 if (pgio->pg_dreq == NULL)
1233 rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
1234 else
1235 rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
1236
1162 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1237 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1163 req->wb_context, 1238 req->wb_context,
1164 req_offset(req), 1239 req_offset(req),
1165 req->wb_bytes, 1240 rd_size,
1166 IOMODE_READ, 1241 IOMODE_READ,
1167 GFP_KERNEL); 1242 GFP_KERNEL);
1168 /* If no lseg, fall back to read through mds */ 1243 /* If no lseg, fall back to read through mds */
@@ -1173,7 +1248,8 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
1173EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read); 1248EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
1174 1249
1175void 1250void
1176pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 1251pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
1252 struct nfs_page *req, u64 wb_size)
1177{ 1253{
1178 BUG_ON(pgio->pg_lseg != NULL); 1254 BUG_ON(pgio->pg_lseg != NULL);
1179 1255
@@ -1181,10 +1257,11 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *
1181 nfs_pageio_reset_write_mds(pgio); 1257 nfs_pageio_reset_write_mds(pgio);
1182 return; 1258 return;
1183 } 1259 }
1260
1184 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1261 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1185 req->wb_context, 1262 req->wb_context,
1186 req_offset(req), 1263 req_offset(req),
1187 req->wb_bytes, 1264 wb_size,
1188 IOMODE_RW, 1265 IOMODE_RW,
1189 GFP_NOFS); 1266 GFP_NOFS);
1190 /* If no lseg, fall back to write through mds */ 1267 /* If no lseg, fall back to write through mds */
@@ -1362,12 +1439,12 @@ pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *he
1362 if (trypnfs == PNFS_NOT_ATTEMPTED) 1439 if (trypnfs == PNFS_NOT_ATTEMPTED)
1363 pnfs_write_through_mds(desc, data); 1440 pnfs_write_through_mds(desc, data);
1364 } 1441 }
1365 put_lseg(lseg); 1442 pnfs_put_lseg(lseg);
1366} 1443}
1367 1444
1368static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) 1445static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
1369{ 1446{
1370 put_lseg(hdr->lseg); 1447 pnfs_put_lseg(hdr->lseg);
1371 nfs_writehdr_free(hdr); 1448 nfs_writehdr_free(hdr);
1372} 1449}
1373EXPORT_SYMBOL_GPL(pnfs_writehdr_free); 1450EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
@@ -1382,17 +1459,17 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1382 whdr = nfs_writehdr_alloc(); 1459 whdr = nfs_writehdr_alloc();
1383 if (!whdr) { 1460 if (!whdr) {
1384 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 1461 desc->pg_completion_ops->error_cleanup(&desc->pg_list);
1385 put_lseg(desc->pg_lseg); 1462 pnfs_put_lseg(desc->pg_lseg);
1386 desc->pg_lseg = NULL; 1463 desc->pg_lseg = NULL;
1387 return -ENOMEM; 1464 return -ENOMEM;
1388 } 1465 }
1389 hdr = &whdr->header; 1466 hdr = &whdr->header;
1390 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); 1467 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
1391 hdr->lseg = get_lseg(desc->pg_lseg); 1468 hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1392 atomic_inc(&hdr->refcnt); 1469 atomic_inc(&hdr->refcnt);
1393 ret = nfs_generic_flush(desc, hdr); 1470 ret = nfs_generic_flush(desc, hdr);
1394 if (ret != 0) { 1471 if (ret != 0) {
1395 put_lseg(desc->pg_lseg); 1472 pnfs_put_lseg(desc->pg_lseg);
1396 desc->pg_lseg = NULL; 1473 desc->pg_lseg = NULL;
1397 } else 1474 } else
1398 pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags); 1475 pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags);
@@ -1517,12 +1594,12 @@ pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *hea
1517 if (trypnfs == PNFS_NOT_ATTEMPTED) 1594 if (trypnfs == PNFS_NOT_ATTEMPTED)
1518 pnfs_read_through_mds(desc, data); 1595 pnfs_read_through_mds(desc, data);
1519 } 1596 }
1520 put_lseg(lseg); 1597 pnfs_put_lseg(lseg);
1521} 1598}
1522 1599
1523static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) 1600static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
1524{ 1601{
1525 put_lseg(hdr->lseg); 1602 pnfs_put_lseg(hdr->lseg);
1526 nfs_readhdr_free(hdr); 1603 nfs_readhdr_free(hdr);
1527} 1604}
1528EXPORT_SYMBOL_GPL(pnfs_readhdr_free); 1605EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
@@ -1538,17 +1615,17 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
1538 if (!rhdr) { 1615 if (!rhdr) {
1539 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 1616 desc->pg_completion_ops->error_cleanup(&desc->pg_list);
1540 ret = -ENOMEM; 1617 ret = -ENOMEM;
1541 put_lseg(desc->pg_lseg); 1618 pnfs_put_lseg(desc->pg_lseg);
1542 desc->pg_lseg = NULL; 1619 desc->pg_lseg = NULL;
1543 return ret; 1620 return ret;
1544 } 1621 }
1545 hdr = &rhdr->header; 1622 hdr = &rhdr->header;
1546 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); 1623 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
1547 hdr->lseg = get_lseg(desc->pg_lseg); 1624 hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1548 atomic_inc(&hdr->refcnt); 1625 atomic_inc(&hdr->refcnt);
1549 ret = nfs_generic_pagein(desc, hdr); 1626 ret = nfs_generic_pagein(desc, hdr);
1550 if (ret != 0) { 1627 if (ret != 0) {
1551 put_lseg(desc->pg_lseg); 1628 pnfs_put_lseg(desc->pg_lseg);
1552 desc->pg_lseg = NULL; 1629 desc->pg_lseg = NULL;
1553 } else 1630 } else
1554 pnfs_do_multiple_reads(desc, &hdr->rpc_list); 1631 pnfs_do_multiple_reads(desc, &hdr->rpc_list);
@@ -1574,13 +1651,7 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
1574 1651
1575void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) 1652void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
1576{ 1653{
1577 if (lseg->pls_range.iomode == IOMODE_RW) { 1654 pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode);
1578 dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
1579 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
1580 } else {
1581 dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
1582 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
1583 }
1584} 1655}
1585EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); 1656EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
1586 1657
@@ -1601,7 +1672,7 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
1601 } 1672 }
1602 if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) { 1673 if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) {
1603 /* references matched in nfs4_layoutcommit_release */ 1674 /* references matched in nfs4_layoutcommit_release */
1604 get_lseg(hdr->lseg); 1675 pnfs_get_lseg(hdr->lseg);
1605 } 1676 }
1606 if (end_pos > nfsi->layout->plh_lwb) 1677 if (end_pos > nfsi->layout->plh_lwb)
1607 nfsi->layout->plh_lwb = end_pos; 1678 nfsi->layout->plh_lwb = end_pos;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 745aa1b39e7c..2d722dba1111 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -62,9 +62,6 @@ enum {
62 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ 62 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
63 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ 63 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
64 NFS_LAYOUT_ROC, /* some lseg had roc bit set */ 64 NFS_LAYOUT_ROC, /* some lseg had roc bit set */
65 NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
66 NFS_LAYOUT_INVALID, /* layout is being destroyed */
67 NFS_LAYOUT_RETURNED, /* layout has already been returned */
68}; 65};
69 66
70enum layoutdriver_policy_flags { 67enum layoutdriver_policy_flags {
@@ -140,6 +137,7 @@ struct pnfs_layout_hdr {
140 atomic_t plh_outstanding; /* number of RPCs out */ 137 atomic_t plh_outstanding; /* number of RPCs out */
141 unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ 138 unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
142 u32 plh_barrier; /* ignore lower seqids */ 139 u32 plh_barrier; /* ignore lower seqids */
140 unsigned long plh_retry_timestamp;
143 unsigned long plh_flags; 141 unsigned long plh_flags;
144 loff_t plh_lwb; /* last write byte for layoutcommit */ 142 loff_t plh_lwb; /* last write byte for layoutcommit */
145 struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ 143 struct rpc_cred *plh_lc_cred; /* layoutcommit cred */
@@ -172,12 +170,12 @@ extern int nfs4_proc_getdevicelist(struct nfs_server *server,
172 struct pnfs_devicelist *devlist); 170 struct pnfs_devicelist *devlist);
173extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, 171extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
174 struct pnfs_device *dev); 172 struct pnfs_device *dev);
175extern void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); 173extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
176extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); 174extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
177 175
178/* pnfs.c */ 176/* pnfs.c */
179void get_layout_hdr(struct pnfs_layout_hdr *lo); 177void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
180void put_lseg(struct pnfs_layout_segment *lseg); 178void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
181 179
182void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, 180void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
183 const struct nfs_pgio_completion_ops *); 181 const struct nfs_pgio_completion_ops *);
@@ -188,28 +186,29 @@ void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
188void unset_pnfs_layoutdriver(struct nfs_server *); 186void unset_pnfs_layoutdriver(struct nfs_server *);
189void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); 187void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
190int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); 188int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
191void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *); 189void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
190 struct nfs_page *req, u64 wb_size);
192int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); 191int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
193bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); 192bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
194void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg); 193void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg);
195int pnfs_layout_process(struct nfs4_layoutget *lgp); 194struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp);
196void pnfs_free_lseg_list(struct list_head *tmp_list); 195void pnfs_free_lseg_list(struct list_head *tmp_list);
197void pnfs_destroy_layout(struct nfs_inode *); 196void pnfs_destroy_layout(struct nfs_inode *);
198void pnfs_destroy_all_layouts(struct nfs_client *); 197void pnfs_destroy_all_layouts(struct nfs_client *);
199void put_layout_hdr(struct pnfs_layout_hdr *lo); 198void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
200void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, 199void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
201 const nfs4_stateid *new, 200 const nfs4_stateid *new,
202 bool update_barrier); 201 bool update_barrier);
203int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, 202int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
204 struct pnfs_layout_hdr *lo, 203 struct pnfs_layout_hdr *lo,
205 struct nfs4_state *open_state); 204 struct nfs4_state *open_state);
206int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, 205int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
207 struct list_head *tmp_list, 206 struct list_head *tmp_list,
208 struct pnfs_layout_range *recall_range); 207 struct pnfs_layout_range *recall_range);
209bool pnfs_roc(struct inode *ino); 208bool pnfs_roc(struct inode *ino);
210void pnfs_roc_release(struct inode *ino); 209void pnfs_roc_release(struct inode *ino);
211void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); 210void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
212bool pnfs_roc_drain(struct inode *ino, u32 *barrier); 211bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
213void pnfs_set_layoutcommit(struct nfs_write_data *wdata); 212void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
214void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); 213void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
215int pnfs_layoutcommit_inode(struct inode *inode, bool sync); 214int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
@@ -233,6 +232,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
233/* nfs4_deviceid_flags */ 232/* nfs4_deviceid_flags */
234enum { 233enum {
235 NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */ 234 NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */
235 NFS_DEVICEID_UNAVAILABLE, /* device temporarily unavailable */
236}; 236};
237 237
238/* pnfs_dev.c */ 238/* pnfs_dev.c */
@@ -242,6 +242,7 @@ struct nfs4_deviceid_node {
242 const struct pnfs_layoutdriver_type *ld; 242 const struct pnfs_layoutdriver_type *ld;
243 const struct nfs_client *nfs_client; 243 const struct nfs_client *nfs_client;
244 unsigned long flags; 244 unsigned long flags;
245 unsigned long timestamp_unavailable;
245 struct nfs4_deviceid deviceid; 246 struct nfs4_deviceid deviceid;
246 atomic_t ref; 247 atomic_t ref;
247}; 248};
@@ -254,34 +255,12 @@ void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
254 const struct nfs4_deviceid *); 255 const struct nfs4_deviceid *);
255struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *); 256struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
256bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *); 257bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
258void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
259bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
257void nfs4_deviceid_purge_client(const struct nfs_client *); 260void nfs4_deviceid_purge_client(const struct nfs_client *);
258 261
259static inline void
260pnfs_mark_layout_returned(struct pnfs_layout_hdr *lo)
261{
262 set_bit(NFS_LAYOUT_RETURNED, &lo->plh_flags);
263}
264
265static inline void
266pnfs_clear_layout_returned(struct pnfs_layout_hdr *lo)
267{
268 clear_bit(NFS_LAYOUT_RETURNED, &lo->plh_flags);
269}
270
271static inline bool
272pnfs_test_layout_returned(struct pnfs_layout_hdr *lo)
273{
274 return test_bit(NFS_LAYOUT_RETURNED, &lo->plh_flags);
275}
276
277static inline int lo_fail_bit(u32 iomode)
278{
279 return iomode == IOMODE_RW ?
280 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
281}
282
283static inline struct pnfs_layout_segment * 262static inline struct pnfs_layout_segment *
284get_lseg(struct pnfs_layout_segment *lseg) 263pnfs_get_lseg(struct pnfs_layout_segment *lseg)
285{ 264{
286 if (lseg) { 265 if (lseg) {
287 atomic_inc(&lseg->pls_refcount); 266 atomic_inc(&lseg->pls_refcount);
@@ -406,12 +385,12 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
406} 385}
407 386
408static inline struct pnfs_layout_segment * 387static inline struct pnfs_layout_segment *
409get_lseg(struct pnfs_layout_segment *lseg) 388pnfs_get_lseg(struct pnfs_layout_segment *lseg)
410{ 389{
411 return NULL; 390 return NULL;
412} 391}
413 392
414static inline void put_lseg(struct pnfs_layout_segment *lseg) 393static inline void pnfs_put_lseg(struct pnfs_layout_segment *lseg)
415{ 394{
416} 395}
417 396
@@ -443,7 +422,7 @@ pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
443} 422}
444 423
445static inline bool 424static inline bool
446pnfs_roc_drain(struct inode *ino, u32 *barrier) 425pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
447{ 426{
448 return false; 427 return false;
449} 428}
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 73f701f1f4d3..d35b62e83ea6 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -40,6 +40,8 @@
40#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) 40#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
41#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) 41#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
42 42
43#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ)
44
43static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE]; 45static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE];
44static DEFINE_SPINLOCK(nfs4_deviceid_lock); 46static DEFINE_SPINLOCK(nfs4_deviceid_lock);
45 47
@@ -218,6 +220,30 @@ nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
218} 220}
219EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node); 221EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node);
220 222
223void
224nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node)
225{
226 node->timestamp_unavailable = jiffies;
227 set_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
228}
229EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_unavailable);
230
231bool
232nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node)
233{
234 if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) {
235 unsigned long start, end;
236
237 end = jiffies;
238 start = end - PNFS_DEVICE_RETRY_TIMEOUT;
239 if (time_in_range(node->timestamp_unavailable, start, end))
240 return true;
241 clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
242 }
243 return false;
244}
245EXPORT_SYMBOL_GPL(nfs4_test_deviceid_unavailable);
246
221static void 247static void
222_deviceid_purge_client(const struct nfs_client *clp, long hash) 248_deviceid_purge_client(const struct nfs_client *clp, long hash)
223{ 249{
@@ -276,3 +302,4 @@ nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
276 } 302 }
277 rcu_read_unlock(); 303 rcu_read_unlock();
278} 304}
305
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 239aff7338eb..e831bce49766 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -88,6 +88,7 @@ enum {
88 Opt_sharecache, Opt_nosharecache, 88 Opt_sharecache, Opt_nosharecache,
89 Opt_resvport, Opt_noresvport, 89 Opt_resvport, Opt_noresvport,
90 Opt_fscache, Opt_nofscache, 90 Opt_fscache, Opt_nofscache,
91 Opt_migration, Opt_nomigration,
91 92
92 /* Mount options that take integer arguments */ 93 /* Mount options that take integer arguments */
93 Opt_port, 94 Opt_port,
@@ -147,6 +148,8 @@ static const match_table_t nfs_mount_option_tokens = {
147 { Opt_noresvport, "noresvport" }, 148 { Opt_noresvport, "noresvport" },
148 { Opt_fscache, "fsc" }, 149 { Opt_fscache, "fsc" },
149 { Opt_nofscache, "nofsc" }, 150 { Opt_nofscache, "nofsc" },
151 { Opt_migration, "migration" },
152 { Opt_nomigration, "nomigration" },
150 153
151 { Opt_port, "port=%s" }, 154 { Opt_port, "port=%s" },
152 { Opt_rsize, "rsize=%s" }, 155 { Opt_rsize, "rsize=%s" },
@@ -676,6 +679,9 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
676 if (nfss->options & NFS_OPTION_FSCACHE) 679 if (nfss->options & NFS_OPTION_FSCACHE)
677 seq_printf(m, ",fsc"); 680 seq_printf(m, ",fsc");
678 681
682 if (nfss->options & NFS_OPTION_MIGRATION)
683 seq_printf(m, ",migration");
684
679 if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) { 685 if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) {
680 if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) 686 if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
681 seq_printf(m, ",lookupcache=none"); 687 seq_printf(m, ",lookupcache=none");
@@ -1106,7 +1112,7 @@ static int nfs_get_option_ul(substring_t args[], unsigned long *option)
1106 string = match_strdup(args); 1112 string = match_strdup(args);
1107 if (string == NULL) 1113 if (string == NULL)
1108 return -ENOMEM; 1114 return -ENOMEM;
1109 rc = strict_strtoul(string, 10, option); 1115 rc = kstrtoul(string, 10, option);
1110 kfree(string); 1116 kfree(string);
1111 1117
1112 return rc; 1118 return rc;
@@ -1243,6 +1249,12 @@ static int nfs_parse_mount_options(char *raw,
1243 kfree(mnt->fscache_uniq); 1249 kfree(mnt->fscache_uniq);
1244 mnt->fscache_uniq = NULL; 1250 mnt->fscache_uniq = NULL;
1245 break; 1251 break;
1252 case Opt_migration:
1253 mnt->options |= NFS_OPTION_MIGRATION;
1254 break;
1255 case Opt_nomigration:
1256 mnt->options &= NFS_OPTION_MIGRATION;
1257 break;
1246 1258
1247 /* 1259 /*
1248 * options that take numeric values 1260 * options that take numeric values
@@ -1535,9 +1547,13 @@ static int nfs_parse_mount_options(char *raw,
1535 if (mnt->minorversion && mnt->version != 4) 1547 if (mnt->minorversion && mnt->version != 4)
1536 goto out_minorversion_mismatch; 1548 goto out_minorversion_mismatch;
1537 1549
1550 if (mnt->options & NFS_OPTION_MIGRATION &&
1551 mnt->version != 4 && mnt->minorversion != 0)
1552 goto out_migration_misuse;
1553
1538 /* 1554 /*
1539 * verify that any proto=/mountproto= options match the address 1555 * verify that any proto=/mountproto= options match the address
1540 * familiies in the addr=/mountaddr= options. 1556 * families in the addr=/mountaddr= options.
1541 */ 1557 */
1542 if (protofamily != AF_UNSPEC && 1558 if (protofamily != AF_UNSPEC &&
1543 protofamily != mnt->nfs_server.address.ss_family) 1559 protofamily != mnt->nfs_server.address.ss_family)
@@ -1572,6 +1588,10 @@ out_minorversion_mismatch:
1572 printk(KERN_INFO "NFS: mount option vers=%u does not support " 1588 printk(KERN_INFO "NFS: mount option vers=%u does not support "
1573 "minorversion=%u\n", mnt->version, mnt->minorversion); 1589 "minorversion=%u\n", mnt->version, mnt->minorversion);
1574 return 0; 1590 return 0;
1591out_migration_misuse:
1592 printk(KERN_INFO
1593 "NFS: 'migration' not supported for this NFS version\n");
1594 return 0;
1575out_nomem: 1595out_nomem:
1576 printk(KERN_INFO "NFS: not enough memory to parse option\n"); 1596 printk(KERN_INFO "NFS: not enough memory to parse option\n");
1577 return 0; 1597 return 0;
@@ -1867,6 +1887,7 @@ static int nfs23_validate_mount_data(void *options,
1867 1887
1868 memcpy(sap, &data->addr, sizeof(data->addr)); 1888 memcpy(sap, &data->addr, sizeof(data->addr));
1869 args->nfs_server.addrlen = sizeof(data->addr); 1889 args->nfs_server.addrlen = sizeof(data->addr);
1890 args->nfs_server.port = ntohs(data->addr.sin_port);
1870 if (!nfs_verify_server_address(sap)) 1891 if (!nfs_verify_server_address(sap))
1871 goto out_no_address; 1892 goto out_no_address;
1872 1893
@@ -2493,7 +2514,7 @@ EXPORT_SYMBOL_GPL(nfs_kill_super);
2493/* 2514/*
2494 * Clone an NFS2/3/4 server record on xdev traversal (FSID-change) 2515 * Clone an NFS2/3/4 server record on xdev traversal (FSID-change)
2495 */ 2516 */
2496struct dentry * 2517static struct dentry *
2497nfs_xdev_mount(struct file_system_type *fs_type, int flags, 2518nfs_xdev_mount(struct file_system_type *fs_type, int flags,
2498 const char *dev_name, void *raw_data) 2519 const char *dev_name, void *raw_data)
2499{ 2520{
@@ -2564,6 +2585,7 @@ static int nfs4_validate_mount_data(void *options,
2564 return -EFAULT; 2585 return -EFAULT;
2565 if (!nfs_verify_server_address(sap)) 2586 if (!nfs_verify_server_address(sap))
2566 goto out_no_address; 2587 goto out_no_address;
2588 args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port);
2567 2589
2568 if (data->auth_flavourlen) { 2590 if (data->auth_flavourlen) {
2569 if (data->auth_flavourlen > 1) 2591 if (data->auth_flavourlen > 1)
@@ -2640,6 +2662,7 @@ unsigned int nfs_idmap_cache_timeout = 600;
2640bool nfs4_disable_idmapping = true; 2662bool nfs4_disable_idmapping = true;
2641unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE; 2663unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE;
2642unsigned short send_implementation_id = 1; 2664unsigned short send_implementation_id = 1;
2665char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = "";
2643 2666
2644EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport); 2667EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport);
2645EXPORT_SYMBOL_GPL(nfs_callback_tcpport); 2668EXPORT_SYMBOL_GPL(nfs_callback_tcpport);
@@ -2647,6 +2670,7 @@ EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout);
2647EXPORT_SYMBOL_GPL(nfs4_disable_idmapping); 2670EXPORT_SYMBOL_GPL(nfs4_disable_idmapping);
2648EXPORT_SYMBOL_GPL(max_session_slots); 2671EXPORT_SYMBOL_GPL(max_session_slots);
2649EXPORT_SYMBOL_GPL(send_implementation_id); 2672EXPORT_SYMBOL_GPL(send_implementation_id);
2673EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier);
2650 2674
2651#define NFS_CALLBACK_MAXPORTNR (65535U) 2675#define NFS_CALLBACK_MAXPORTNR (65535U)
2652 2676
@@ -2657,7 +2681,7 @@ static int param_set_portnr(const char *val, const struct kernel_param *kp)
2657 2681
2658 if (!val) 2682 if (!val)
2659 return -EINVAL; 2683 return -EINVAL;
2660 ret = strict_strtoul(val, 0, &num); 2684 ret = kstrtoul(val, 0, &num);
2661 if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR) 2685 if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR)
2662 return -EINVAL; 2686 return -EINVAL;
2663 *((unsigned int *)kp->arg) = num; 2687 *((unsigned int *)kp->arg) = num;
@@ -2672,6 +2696,8 @@ static struct kernel_param_ops param_ops_portnr = {
2672module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); 2696module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
2673module_param(nfs_idmap_cache_timeout, int, 0644); 2697module_param(nfs_idmap_cache_timeout, int, 0644);
2674module_param(nfs4_disable_idmapping, bool, 0644); 2698module_param(nfs4_disable_idmapping, bool, 0644);
2699module_param_string(nfs4_unique_id, nfs4_client_id_uniquifier,
2700 NFS4_CLIENT_ID_UNIQ_LEN, 0600);
2675MODULE_PARM_DESC(nfs4_disable_idmapping, 2701MODULE_PARM_DESC(nfs4_disable_idmapping,
2676 "Turn off NFSv4 idmapping when using 'sec=sys'"); 2702 "Turn off NFSv4 idmapping when using 'sec=sys'");
2677module_param(max_session_slots, ushort, 0644); 2703module_param(max_session_slots, ushort, 0644);
@@ -2680,6 +2706,7 @@ MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 "
2680module_param(send_implementation_id, ushort, 0644); 2706module_param(send_implementation_id, ushort, 0644);
2681MODULE_PARM_DESC(send_implementation_id, 2707MODULE_PARM_DESC(send_implementation_id,
2682 "Send implementation ID with NFSv4.1 exchange_id"); 2708 "Send implementation ID with NFSv4.1 exchange_id");
2709MODULE_PARM_DESC(nfs4_unique_id, "nfs_client_id4 uniquifier string");
2683MODULE_ALIAS("nfs4"); 2710MODULE_ALIAS("nfs4");
2684 2711
2685#endif /* CONFIG_NFS_V4 */ 2712#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e3b55372726c..9347ab7c9574 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -846,6 +846,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
846int nfs_flush_incompatible(struct file *file, struct page *page) 846int nfs_flush_incompatible(struct file *file, struct page *page)
847{ 847{
848 struct nfs_open_context *ctx = nfs_file_open_context(file); 848 struct nfs_open_context *ctx = nfs_file_open_context(file);
849 struct nfs_lock_context *l_ctx;
849 struct nfs_page *req; 850 struct nfs_page *req;
850 int do_flush, status; 851 int do_flush, status;
851 /* 852 /*
@@ -860,9 +861,12 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
860 req = nfs_page_find_request(page); 861 req = nfs_page_find_request(page);
861 if (req == NULL) 862 if (req == NULL)
862 return 0; 863 return 0;
863 do_flush = req->wb_page != page || req->wb_context != ctx || 864 l_ctx = req->wb_lock_context;
864 req->wb_lock_context->lockowner != current->files || 865 do_flush = req->wb_page != page || req->wb_context != ctx;
865 req->wb_lock_context->pid != current->tgid; 866 if (l_ctx) {
867 do_flush |= l_ctx->lockowner.l_owner != current->files
868 || l_ctx->lockowner.l_pid != current->tgid;
869 }
866 nfs_release_request(req); 870 nfs_release_request(req);
867 if (!do_flush) 871 if (!do_flush)
868 return 0; 872 return 0;
@@ -1576,6 +1580,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
1576 /* We have a mismatch. Write the page again */ 1580 /* We have a mismatch. Write the page again */
1577 dprintk(" mismatch\n"); 1581 dprintk(" mismatch\n");
1578 nfs_mark_request_dirty(req); 1582 nfs_mark_request_dirty(req);
1583 set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags);
1579 next: 1584 next:
1580 nfs_unlock_and_release_request(req); 1585 nfs_unlock_and_release_request(req);
1581 } 1586 }
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 6aa5590c3679..b314888825d5 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -218,8 +218,7 @@ static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p,
218 * There must be an encoding function for void results so svc_process 218 * There must be an encoding function for void results so svc_process
219 * will work properly. 219 * will work properly.
220 */ 220 */
221int 221static int nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
222nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
223{ 222{
224 return xdr_ressize_check(rqstp, p); 223 return xdr_ressize_check(rqstp, p);
225} 224}
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9095f3c21df9..97d90d1c8608 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -247,7 +247,7 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
247 /* Now create the file and set attributes */ 247 /* Now create the file and set attributes */
248 nfserr = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len, 248 nfserr = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len,
249 attr, newfhp, 249 attr, newfhp,
250 argp->createmode, argp->verf, NULL, NULL); 250 argp->createmode, (u32 *)argp->verf, NULL, NULL);
251 251
252 RETURN_STATUS(nfserr); 252 RETURN_STATUS(nfserr);
253} 253}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4c7bd35b1876..bdf29c96e4cd 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1028,7 +1028,6 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
1028 cb->cb_msg.rpc_cred = callback_cred; 1028 cb->cb_msg.rpc_cred = callback_cred;
1029 1029
1030 cb->cb_ops = &nfsd4_cb_recall_ops; 1030 cb->cb_ops = &nfsd4_cb_recall_ops;
1031 dp->dl_retries = 1;
1032 1031
1033 INIT_LIST_HEAD(&cb->cb_per_client); 1032 INIT_LIST_HEAD(&cb->cb_per_client);
1034 cb->cb_done = true; 1033 cb->cb_done = true;
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index fdc91a6fc9c4..a1f10c0a6255 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -478,7 +478,7 @@ nfsd_idmap_init(struct net *net)
478 goto destroy_idtoname_cache; 478 goto destroy_idtoname_cache;
479 nn->nametoid_cache = cache_create_net(&nametoid_cache_template, net); 479 nn->nametoid_cache = cache_create_net(&nametoid_cache_template, net);
480 if (IS_ERR(nn->nametoid_cache)) { 480 if (IS_ERR(nn->nametoid_cache)) {
481 rv = PTR_ERR(nn->idtoname_cache); 481 rv = PTR_ERR(nn->nametoid_cache);
482 goto unregister_idtoname_cache; 482 goto unregister_idtoname_cache;
483 } 483 }
484 rv = cache_register_net(nn->nametoid_cache, net); 484 rv = cache_register_net(nn->nametoid_cache, net);
@@ -598,7 +598,7 @@ numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namel
598 /* Just to make sure it's null-terminated: */ 598 /* Just to make sure it's null-terminated: */
599 memcpy(buf, name, namelen); 599 memcpy(buf, name, namelen);
600 buf[namelen] = '\0'; 600 buf[namelen] = '\0';
601 ret = kstrtouint(name, 10, id); 601 ret = kstrtouint(buf, 10, id);
602 return ret == 0; 602 return ret == 0;
603} 603}
604 604
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index c9c1c0a25417..6c9a4b291dba 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -370,7 +370,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
370 break; 370 break;
371 case NFS4_OPEN_CLAIM_PREVIOUS: 371 case NFS4_OPEN_CLAIM_PREVIOUS:
372 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; 372 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
373 status = nfs4_check_open_reclaim(&open->op_clientid); 373 status = nfs4_check_open_reclaim(&open->op_clientid, cstate->minorversion);
374 if (status) 374 if (status)
375 goto out; 375 goto out;
376 case NFS4_OPEN_CLAIM_FH: 376 case NFS4_OPEN_CLAIM_FH:
@@ -1054,8 +1054,8 @@ struct nfsd4_operation {
1054 char *op_name; 1054 char *op_name;
1055 /* Try to get response size before operation */ 1055 /* Try to get response size before operation */
1056 nfsd4op_rsize op_rsize_bop; 1056 nfsd4op_rsize op_rsize_bop;
1057 stateid_setter op_get_currentstateid; 1057 stateid_getter op_get_currentstateid;
1058 stateid_getter op_set_currentstateid; 1058 stateid_setter op_set_currentstateid;
1059}; 1059};
1060 1060
1061static struct nfsd4_operation nfsd4_ops[]; 1061static struct nfsd4_operation nfsd4_ops[];
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index cc894eda385a..d0237f872cc4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -758,7 +758,7 @@ static void nfsd4_put_drc_mem(int slotsize, int num)
758 spin_unlock(&nfsd_drc_lock); 758 spin_unlock(&nfsd_drc_lock);
759} 759}
760 760
761static struct nfsd4_session *alloc_session(int slotsize, int numslots) 761static struct nfsd4_session *__alloc_session(int slotsize, int numslots)
762{ 762{
763 struct nfsd4_session *new; 763 struct nfsd4_session *new;
764 int mem, i; 764 int mem, i;
@@ -852,35 +852,28 @@ static int nfsd4_register_conn(struct nfsd4_conn *conn)
852 return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); 852 return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
853} 853}
854 854
855static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses, u32 dir) 855static void nfsd4_init_conn(struct svc_rqst *rqstp, struct nfsd4_conn *conn, struct nfsd4_session *ses)
856{ 856{
857 struct nfsd4_conn *conn;
858 int ret; 857 int ret;
859 858
860 conn = alloc_conn(rqstp, dir);
861 if (!conn)
862 return nfserr_jukebox;
863 nfsd4_hash_conn(conn, ses); 859 nfsd4_hash_conn(conn, ses);
864 ret = nfsd4_register_conn(conn); 860 ret = nfsd4_register_conn(conn);
865 if (ret) 861 if (ret)
866 /* oops; xprt is already down: */ 862 /* oops; xprt is already down: */
867 nfsd4_conn_lost(&conn->cn_xpt_user); 863 nfsd4_conn_lost(&conn->cn_xpt_user);
868 if (ses->se_client->cl_cb_state == NFSD4_CB_DOWN && 864 if (conn->cn_flags & NFS4_CDFC4_BACK) {
869 dir & NFS4_CDFC4_BACK) {
870 /* callback channel may be back up */ 865 /* callback channel may be back up */
871 nfsd4_probe_callback(ses->se_client); 866 nfsd4_probe_callback(ses->se_client);
872 } 867 }
873 return nfs_ok;
874} 868}
875 869
876static __be32 nfsd4_new_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_session *ses) 870static struct nfsd4_conn *alloc_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_create_session *cses)
877{ 871{
878 u32 dir = NFS4_CDFC4_FORE; 872 u32 dir = NFS4_CDFC4_FORE;
879 873
880 if (ses->se_flags & SESSION4_BACK_CHAN) 874 if (cses->flags & SESSION4_BACK_CHAN)
881 dir |= NFS4_CDFC4_BACK; 875 dir |= NFS4_CDFC4_BACK;
882 876 return alloc_conn(rqstp, dir);
883 return nfsd4_new_conn(rqstp, ses, dir);
884} 877}
885 878
886/* must be called under client_lock */ 879/* must be called under client_lock */
@@ -903,20 +896,21 @@ static void nfsd4_del_conns(struct nfsd4_session *s)
903 spin_unlock(&clp->cl_lock); 896 spin_unlock(&clp->cl_lock);
904} 897}
905 898
899static void __free_session(struct nfsd4_session *ses)
900{
901 nfsd4_put_drc_mem(slot_bytes(&ses->se_fchannel), ses->se_fchannel.maxreqs);
902 free_session_slots(ses);
903 kfree(ses);
904}
905
906static void free_session(struct kref *kref) 906static void free_session(struct kref *kref)
907{ 907{
908 struct nfsd4_session *ses; 908 struct nfsd4_session *ses;
909 int mem;
910 909
911 lockdep_assert_held(&client_lock); 910 lockdep_assert_held(&client_lock);
912 ses = container_of(kref, struct nfsd4_session, se_ref); 911 ses = container_of(kref, struct nfsd4_session, se_ref);
913 nfsd4_del_conns(ses); 912 nfsd4_del_conns(ses);
914 spin_lock(&nfsd_drc_lock); 913 __free_session(ses);
915 mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
916 nfsd_drc_mem_used -= mem;
917 spin_unlock(&nfsd_drc_lock);
918 free_session_slots(ses);
919 kfree(ses);
920} 914}
921 915
922void nfsd4_put_session(struct nfsd4_session *ses) 916void nfsd4_put_session(struct nfsd4_session *ses)
@@ -926,14 +920,10 @@ void nfsd4_put_session(struct nfsd4_session *ses)
926 spin_unlock(&client_lock); 920 spin_unlock(&client_lock);
927} 921}
928 922
929static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses) 923static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan)
930{ 924{
931 struct nfsd4_session *new; 925 struct nfsd4_session *new;
932 struct nfsd4_channel_attrs *fchan = &cses->fore_channel;
933 int numslots, slotsize; 926 int numslots, slotsize;
934 __be32 status;
935 int idx;
936
937 /* 927 /*
938 * Note decreasing slot size below client's request may 928 * Note decreasing slot size below client's request may
939 * make it difficult for client to function correctly, whereas 929 * make it difficult for client to function correctly, whereas
@@ -946,12 +936,18 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
946 if (numslots < 1) 936 if (numslots < 1)
947 return NULL; 937 return NULL;
948 938
949 new = alloc_session(slotsize, numslots); 939 new = __alloc_session(slotsize, numslots);
950 if (!new) { 940 if (!new) {
951 nfsd4_put_drc_mem(slotsize, fchan->maxreqs); 941 nfsd4_put_drc_mem(slotsize, fchan->maxreqs);
952 return NULL; 942 return NULL;
953 } 943 }
954 init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize); 944 init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize);
945 return new;
946}
947
948static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses)
949{
950 int idx;
955 951
956 new->se_client = clp; 952 new->se_client = clp;
957 gen_sessionid(new); 953 gen_sessionid(new);
@@ -970,14 +966,6 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
970 spin_unlock(&clp->cl_lock); 966 spin_unlock(&clp->cl_lock);
971 spin_unlock(&client_lock); 967 spin_unlock(&client_lock);
972 968
973 status = nfsd4_new_conn_from_crses(rqstp, new);
974 /* whoops: benny points out, status is ignored! (err, or bogus) */
975 if (status) {
976 spin_lock(&client_lock);
977 free_session(&new->se_ref);
978 spin_unlock(&client_lock);
979 return NULL;
980 }
981 if (cses->flags & SESSION4_BACK_CHAN) { 969 if (cses->flags & SESSION4_BACK_CHAN) {
982 struct sockaddr *sa = svc_addr(rqstp); 970 struct sockaddr *sa = svc_addr(rqstp);
983 /* 971 /*
@@ -990,7 +978,6 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
990 rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa); 978 rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
991 clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa); 979 clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
992 } 980 }
993 nfsd4_probe_callback(clp);
994 return new; 981 return new;
995} 982}
996 983
@@ -1131,7 +1118,7 @@ unhash_client_locked(struct nfs4_client *clp)
1131} 1118}
1132 1119
1133static void 1120static void
1134expire_client(struct nfs4_client *clp) 1121destroy_client(struct nfs4_client *clp)
1135{ 1122{
1136 struct nfs4_openowner *oo; 1123 struct nfs4_openowner *oo;
1137 struct nfs4_delegation *dp; 1124 struct nfs4_delegation *dp;
@@ -1165,6 +1152,12 @@ expire_client(struct nfs4_client *clp)
1165 spin_unlock(&client_lock); 1152 spin_unlock(&client_lock);
1166} 1153}
1167 1154
1155static void expire_client(struct nfs4_client *clp)
1156{
1157 nfsd4_client_record_remove(clp);
1158 destroy_client(clp);
1159}
1160
1168static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) 1161static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
1169{ 1162{
1170 memcpy(target->cl_verifier.data, source->data, 1163 memcpy(target->cl_verifier.data, source->data,
@@ -1223,10 +1216,26 @@ static bool groups_equal(struct group_info *g1, struct group_info *g2)
1223 return true; 1216 return true;
1224} 1217}
1225 1218
1219/*
1220 * RFC 3530 language requires clid_inuse be returned when the
1221 * "principal" associated with a requests differs from that previously
1222 * used. We use uid, gid's, and gss principal string as our best
1223 * approximation. We also don't want to allow non-gss use of a client
1224 * established using gss: in theory cr_principal should catch that
1225 * change, but in practice cr_principal can be null even in the gss case
1226 * since gssd doesn't always pass down a principal string.
1227 */
1228static bool is_gss_cred(struct svc_cred *cr)
1229{
1230 /* Is cr_flavor one of the gss "pseudoflavors"?: */
1231 return (cr->cr_flavor > RPC_AUTH_MAXFLAVOR);
1232}
1233
1234
1226static bool 1235static bool
1227same_creds(struct svc_cred *cr1, struct svc_cred *cr2) 1236same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
1228{ 1237{
1229 if ((cr1->cr_flavor != cr2->cr_flavor) 1238 if ((is_gss_cred(cr1) != is_gss_cred(cr2))
1230 || (cr1->cr_uid != cr2->cr_uid) 1239 || (cr1->cr_uid != cr2->cr_uid)
1231 || (cr1->cr_gid != cr2->cr_gid) 1240 || (cr1->cr_gid != cr2->cr_gid)
1232 || !groups_equal(cr1->cr_group_info, cr2->cr_group_info)) 1241 || !groups_equal(cr1->cr_group_info, cr2->cr_group_info))
@@ -1340,13 +1349,15 @@ move_to_confirmed(struct nfs4_client *clp)
1340} 1349}
1341 1350
1342static struct nfs4_client * 1351static struct nfs4_client *
1343find_confirmed_client(clientid_t *clid) 1352find_confirmed_client(clientid_t *clid, bool sessions)
1344{ 1353{
1345 struct nfs4_client *clp; 1354 struct nfs4_client *clp;
1346 unsigned int idhashval = clientid_hashval(clid->cl_id); 1355 unsigned int idhashval = clientid_hashval(clid->cl_id);
1347 1356
1348 list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) { 1357 list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) {
1349 if (same_clid(&clp->cl_clientid, clid)) { 1358 if (same_clid(&clp->cl_clientid, clid)) {
1359 if ((bool)clp->cl_minorversion != sessions)
1360 return NULL;
1350 renew_client(clp); 1361 renew_client(clp);
1351 return clp; 1362 return clp;
1352 } 1363 }
@@ -1355,14 +1366,17 @@ find_confirmed_client(clientid_t *clid)
1355} 1366}
1356 1367
1357static struct nfs4_client * 1368static struct nfs4_client *
1358find_unconfirmed_client(clientid_t *clid) 1369find_unconfirmed_client(clientid_t *clid, bool sessions)
1359{ 1370{
1360 struct nfs4_client *clp; 1371 struct nfs4_client *clp;
1361 unsigned int idhashval = clientid_hashval(clid->cl_id); 1372 unsigned int idhashval = clientid_hashval(clid->cl_id);
1362 1373
1363 list_for_each_entry(clp, &unconf_id_hashtbl[idhashval], cl_idhash) { 1374 list_for_each_entry(clp, &unconf_id_hashtbl[idhashval], cl_idhash) {
1364 if (same_clid(&clp->cl_clientid, clid)) 1375 if (same_clid(&clp->cl_clientid, clid)) {
1376 if ((bool)clp->cl_minorversion != sessions)
1377 return NULL;
1365 return clp; 1378 return clp;
1379 }
1366 } 1380 }
1367 return NULL; 1381 return NULL;
1368} 1382}
@@ -1651,6 +1665,7 @@ out_new:
1651 status = nfserr_jukebox; 1665 status = nfserr_jukebox;
1652 goto out; 1666 goto out;
1653 } 1667 }
1668 new->cl_minorversion = 1;
1654 1669
1655 gen_clid(new); 1670 gen_clid(new);
1656 add_to_unconfirmed(new, strhashval); 1671 add_to_unconfirmed(new, strhashval);
@@ -1743,67 +1758,71 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1743 struct sockaddr *sa = svc_addr(rqstp); 1758 struct sockaddr *sa = svc_addr(rqstp);
1744 struct nfs4_client *conf, *unconf; 1759 struct nfs4_client *conf, *unconf;
1745 struct nfsd4_session *new; 1760 struct nfsd4_session *new;
1761 struct nfsd4_conn *conn;
1746 struct nfsd4_clid_slot *cs_slot = NULL; 1762 struct nfsd4_clid_slot *cs_slot = NULL;
1747 bool confirm_me = false;
1748 __be32 status = 0; 1763 __be32 status = 0;
1749 1764
1750 if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) 1765 if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
1751 return nfserr_inval; 1766 return nfserr_inval;
1767 if (check_forechannel_attrs(cr_ses->fore_channel))
1768 return nfserr_toosmall;
1769 new = alloc_session(&cr_ses->fore_channel);
1770 if (!new)
1771 return nfserr_jukebox;
1772 status = nfserr_jukebox;
1773 conn = alloc_conn_from_crses(rqstp, cr_ses);
1774 if (!conn)
1775 goto out_free_session;
1752 1776
1753 nfs4_lock_state(); 1777 nfs4_lock_state();
1754 unconf = find_unconfirmed_client(&cr_ses->clientid); 1778 unconf = find_unconfirmed_client(&cr_ses->clientid, true);
1755 conf = find_confirmed_client(&cr_ses->clientid); 1779 conf = find_confirmed_client(&cr_ses->clientid, true);
1756 1780
1757 if (conf) { 1781 if (conf) {
1758 cs_slot = &conf->cl_cs_slot; 1782 cs_slot = &conf->cl_cs_slot;
1759 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); 1783 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
1760 if (status == nfserr_replay_cache) { 1784 if (status == nfserr_replay_cache) {
1761 status = nfsd4_replay_create_session(cr_ses, cs_slot); 1785 status = nfsd4_replay_create_session(cr_ses, cs_slot);
1762 goto out; 1786 goto out_free_conn;
1763 } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) { 1787 } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) {
1764 status = nfserr_seq_misordered; 1788 status = nfserr_seq_misordered;
1765 goto out; 1789 goto out_free_conn;
1766 } 1790 }
1767 } else if (unconf) { 1791 } else if (unconf) {
1792 unsigned int hash;
1793 struct nfs4_client *old;
1768 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || 1794 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
1769 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { 1795 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
1770 status = nfserr_clid_inuse; 1796 status = nfserr_clid_inuse;
1771 goto out; 1797 goto out_free_conn;
1772 } 1798 }
1773 cs_slot = &unconf->cl_cs_slot; 1799 cs_slot = &unconf->cl_cs_slot;
1774 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); 1800 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
1775 if (status) { 1801 if (status) {
1776 /* an unconfirmed replay returns misordered */ 1802 /* an unconfirmed replay returns misordered */
1777 status = nfserr_seq_misordered; 1803 status = nfserr_seq_misordered;
1778 goto out; 1804 goto out_free_conn;
1779 } 1805 }
1780 confirm_me = true; 1806 hash = clientstr_hashval(unconf->cl_recdir);
1807 old = find_confirmed_client_by_str(unconf->cl_recdir, hash);
1808 if (old)
1809 expire_client(old);
1810 move_to_confirmed(unconf);
1781 conf = unconf; 1811 conf = unconf;
1782 } else { 1812 } else {
1783 status = nfserr_stale_clientid; 1813 status = nfserr_stale_clientid;
1784 goto out; 1814 goto out_free_conn;
1785 } 1815 }
1786 1816 status = nfs_ok;
1787 /*
1788 * XXX: we should probably set this at creation time, and check
1789 * for consistent minorversion use throughout:
1790 */
1791 conf->cl_minorversion = 1;
1792 /* 1817 /*
1793 * We do not support RDMA or persistent sessions 1818 * We do not support RDMA or persistent sessions
1794 */ 1819 */
1795 cr_ses->flags &= ~SESSION4_PERSIST; 1820 cr_ses->flags &= ~SESSION4_PERSIST;
1796 cr_ses->flags &= ~SESSION4_RDMA; 1821 cr_ses->flags &= ~SESSION4_RDMA;
1797 1822
1798 status = nfserr_toosmall; 1823 init_session(rqstp, new, conf, cr_ses);
1799 if (check_forechannel_attrs(cr_ses->fore_channel)) 1824 nfsd4_init_conn(rqstp, conn, new);
1800 goto out;
1801 1825
1802 status = nfserr_jukebox;
1803 new = alloc_init_session(rqstp, conf, cr_ses);
1804 if (!new)
1805 goto out;
1806 status = nfs_ok;
1807 memcpy(cr_ses->sessionid.data, new->se_sessionid.data, 1826 memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
1808 NFS4_MAX_SESSIONID_LEN); 1827 NFS4_MAX_SESSIONID_LEN);
1809 memcpy(&cr_ses->fore_channel, &new->se_fchannel, 1828 memcpy(&cr_ses->fore_channel, &new->se_fchannel,
@@ -1813,18 +1832,15 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1813 1832
1814 /* cache solo and embedded create sessions under the state lock */ 1833 /* cache solo and embedded create sessions under the state lock */
1815 nfsd4_cache_create_session(cr_ses, cs_slot, status); 1834 nfsd4_cache_create_session(cr_ses, cs_slot, status);
1816 if (confirm_me) {
1817 unsigned int hash = clientstr_hashval(unconf->cl_recdir);
1818 struct nfs4_client *old =
1819 find_confirmed_client_by_str(conf->cl_recdir, hash);
1820 if (old)
1821 expire_client(old);
1822 move_to_confirmed(conf);
1823 }
1824out: 1835out:
1825 nfs4_unlock_state(); 1836 nfs4_unlock_state();
1826 dprintk("%s returns %d\n", __func__, ntohl(status)); 1837 dprintk("%s returns %d\n", __func__, ntohl(status));
1827 return status; 1838 return status;
1839out_free_conn:
1840 free_conn(conn);
1841out_free_session:
1842 __free_session(new);
1843 goto out;
1828} 1844}
1829 1845
1830static bool nfsd4_last_compound_op(struct svc_rqst *rqstp) 1846static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
@@ -1854,6 +1870,7 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
1854 struct nfsd4_bind_conn_to_session *bcts) 1870 struct nfsd4_bind_conn_to_session *bcts)
1855{ 1871{
1856 __be32 status; 1872 __be32 status;
1873 struct nfsd4_conn *conn;
1857 1874
1858 if (!nfsd4_last_compound_op(rqstp)) 1875 if (!nfsd4_last_compound_op(rqstp))
1859 return nfserr_not_only_op; 1876 return nfserr_not_only_op;
@@ -1870,9 +1887,13 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
1870 return nfserr_badsession; 1887 return nfserr_badsession;
1871 1888
1872 status = nfsd4_map_bcts_dir(&bcts->dir); 1889 status = nfsd4_map_bcts_dir(&bcts->dir);
1873 if (!status) 1890 if (status)
1874 nfsd4_new_conn(rqstp, cstate->session, bcts->dir); 1891 return status;
1875 return status; 1892 conn = alloc_conn(rqstp, bcts->dir);
1893 if (!conn)
1894 return nfserr_jukebox;
1895 nfsd4_init_conn(rqstp, conn, cstate->session);
1896 return nfs_ok;
1876} 1897}
1877 1898
1878static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) 1899static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
@@ -2085,8 +2106,8 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
2085 __be32 status = 0; 2106 __be32 status = 0;
2086 2107
2087 nfs4_lock_state(); 2108 nfs4_lock_state();
2088 unconf = find_unconfirmed_client(&dc->clientid); 2109 unconf = find_unconfirmed_client(&dc->clientid, true);
2089 conf = find_confirmed_client(&dc->clientid); 2110 conf = find_confirmed_client(&dc->clientid, true);
2090 2111
2091 if (conf) { 2112 if (conf) {
2092 clp = conf; 2113 clp = conf;
@@ -2200,10 +2221,6 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2200 copy_clid(new, conf); 2221 copy_clid(new, conf);
2201 else /* case 4 (new client) or cases 2, 3 (client reboot): */ 2222 else /* case 4 (new client) or cases 2, 3 (client reboot): */
2202 gen_clid(new); 2223 gen_clid(new);
2203 /*
2204 * XXX: we should probably set this at creation time, and check
2205 * for consistent minorversion use throughout:
2206 */
2207 new->cl_minorversion = 0; 2224 new->cl_minorversion = 0;
2208 gen_callback(new, setclid, rqstp); 2225 gen_callback(new, setclid, rqstp);
2209 add_to_unconfirmed(new, strhashval); 2226 add_to_unconfirmed(new, strhashval);
@@ -2232,8 +2249,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
2232 return nfserr_stale_clientid; 2249 return nfserr_stale_clientid;
2233 nfs4_lock_state(); 2250 nfs4_lock_state();
2234 2251
2235 conf = find_confirmed_client(clid); 2252 conf = find_confirmed_client(clid, false);
2236 unconf = find_unconfirmed_client(clid); 2253 unconf = find_unconfirmed_client(clid, false);
2237 /* 2254 /*
2238 * We try hard to give out unique clientid's, so if we get an 2255 * We try hard to give out unique clientid's, so if we get an
2239 * attempt to confirm the same clientid with a different cred, 2256 * attempt to confirm the same clientid with a different cred,
@@ -2262,10 +2279,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
2262 unsigned int hash = clientstr_hashval(unconf->cl_recdir); 2279 unsigned int hash = clientstr_hashval(unconf->cl_recdir);
2263 2280
2264 conf = find_confirmed_client_by_str(unconf->cl_recdir, hash); 2281 conf = find_confirmed_client_by_str(unconf->cl_recdir, hash);
2265 if (conf) { 2282 if (conf)
2266 nfsd4_client_record_remove(conf);
2267 expire_client(conf); 2283 expire_client(conf);
2268 }
2269 move_to_confirmed(unconf); 2284 move_to_confirmed(unconf);
2270 nfsd4_probe_callback(unconf); 2285 nfsd4_probe_callback(unconf);
2271 } 2286 }
@@ -2447,16 +2462,20 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner,
2447} 2462}
2448 2463
2449static struct nfs4_openowner * 2464static struct nfs4_openowner *
2450find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) 2465find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, bool sessions)
2451{ 2466{
2452 struct nfs4_stateowner *so; 2467 struct nfs4_stateowner *so;
2453 struct nfs4_openowner *oo; 2468 struct nfs4_openowner *oo;
2469 struct nfs4_client *clp;
2454 2470
2455 list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { 2471 list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) {
2456 if (!so->so_is_open_owner) 2472 if (!so->so_is_open_owner)
2457 continue; 2473 continue;
2458 if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { 2474 if (same_owner_str(so, &open->op_owner, &open->op_clientid)) {
2459 oo = openowner(so); 2475 oo = openowner(so);
2476 clp = oo->oo_owner.so_client;
2477 if ((bool)clp->cl_minorversion != sessions)
2478 return NULL;
2460 renew_client(oo->oo_owner.so_client); 2479 renew_client(oo->oo_owner.so_client);
2461 return oo; 2480 return oo;
2462 } 2481 }
@@ -2600,10 +2619,10 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2600 return nfserr_jukebox; 2619 return nfserr_jukebox;
2601 2620
2602 strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner); 2621 strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner);
2603 oo = find_openstateowner_str(strhashval, open); 2622 oo = find_openstateowner_str(strhashval, open, cstate->minorversion);
2604 open->op_openowner = oo; 2623 open->op_openowner = oo;
2605 if (!oo) { 2624 if (!oo) {
2606 clp = find_confirmed_client(clientid); 2625 clp = find_confirmed_client(clientid, cstate->minorversion);
2607 if (clp == NULL) 2626 if (clp == NULL)
2608 return nfserr_expired; 2627 return nfserr_expired;
2609 goto new_owner; 2628 goto new_owner;
@@ -2705,11 +2724,6 @@ nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_st
2705 return nfs_ok; 2724 return nfs_ok;
2706} 2725}
2707 2726
2708static void nfs4_free_stateid(struct nfs4_ol_stateid *s)
2709{
2710 kmem_cache_free(stateid_slab, s);
2711}
2712
2713static inline int nfs4_access_to_access(u32 nfs4_access) 2727static inline int nfs4_access_to_access(u32 nfs4_access)
2714{ 2728{
2715 int flags = 0; 2729 int flags = 0;
@@ -2837,8 +2851,7 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
2837 return -ENOMEM; 2851 return -ENOMEM;
2838 } 2852 }
2839 fp->fi_lease = fl; 2853 fp->fi_lease = fl;
2840 fp->fi_deleg_file = fl->fl_file; 2854 fp->fi_deleg_file = get_file(fl->fl_file);
2841 get_file(fp->fi_deleg_file);
2842 atomic_set(&fp->fi_delegees, 1); 2855 atomic_set(&fp->fi_delegees, 1);
2843 list_add(&dp->dl_perfile, &fp->fi_delegations); 2856 list_add(&dp->dl_perfile, &fp->fi_delegations);
2844 return 0; 2857 return 0;
@@ -3088,7 +3101,7 @@ void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status)
3088 if (open->op_file) 3101 if (open->op_file)
3089 nfsd4_free_file(open->op_file); 3102 nfsd4_free_file(open->op_file);
3090 if (open->op_stp) 3103 if (open->op_stp)
3091 nfs4_free_stateid(open->op_stp); 3104 free_generic_stateid(open->op_stp);
3092} 3105}
3093 3106
3094__be32 3107__be32
@@ -3105,7 +3118,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3105 status = nfserr_stale_clientid; 3118 status = nfserr_stale_clientid;
3106 if (STALE_CLIENTID(clid, nn)) 3119 if (STALE_CLIENTID(clid, nn))
3107 goto out; 3120 goto out;
3108 clp = find_confirmed_client(clid); 3121 clp = find_confirmed_client(clid, cstate->minorversion);
3109 status = nfserr_expired; 3122 status = nfserr_expired;
3110 if (clp == NULL) { 3123 if (clp == NULL) {
3111 /* We assume the client took too long to RENEW. */ 3124 /* We assume the client took too long to RENEW. */
@@ -3181,7 +3194,6 @@ nfs4_laundromat(void)
3181 clp = list_entry(pos, struct nfs4_client, cl_lru); 3194 clp = list_entry(pos, struct nfs4_client, cl_lru);
3182 dprintk("NFSD: purging unused client (clientid %08x)\n", 3195 dprintk("NFSD: purging unused client (clientid %08x)\n",
3183 clp->cl_clientid.cl_id); 3196 clp->cl_clientid.cl_id);
3184 nfsd4_client_record_remove(clp);
3185 expire_client(clp); 3197 expire_client(clp);
3186 } 3198 }
3187 spin_lock(&recall_lock); 3199 spin_lock(&recall_lock);
@@ -3373,7 +3385,7 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
3373 return nfs_ok; 3385 return nfs_ok;
3374} 3386}
3375 3387
3376static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s) 3388static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, bool sessions)
3377{ 3389{
3378 struct nfs4_client *cl; 3390 struct nfs4_client *cl;
3379 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); 3391 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
@@ -3382,7 +3394,7 @@ static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, s
3382 return nfserr_bad_stateid; 3394 return nfserr_bad_stateid;
3383 if (STALE_STATEID(stateid, nn)) 3395 if (STALE_STATEID(stateid, nn))
3384 return nfserr_stale_stateid; 3396 return nfserr_stale_stateid;
3385 cl = find_confirmed_client(&stateid->si_opaque.so_clid); 3397 cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions);
3386 if (!cl) 3398 if (!cl)
3387 return nfserr_expired; 3399 return nfserr_expired;
3388 *s = find_stateid_by_type(cl, stateid, typemask); 3400 *s = find_stateid_by_type(cl, stateid, typemask);
@@ -3415,7 +3427,7 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
3415 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) 3427 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
3416 return check_special_stateids(net, current_fh, stateid, flags); 3428 return check_special_stateids(net, current_fh, stateid, flags);
3417 3429
3418 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s); 3430 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s, cstate->minorversion);
3419 if (status) 3431 if (status)
3420 return status; 3432 return status;
3421 status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); 3433 status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate));
@@ -3565,7 +3577,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
3565 seqid, STATEID_VAL(stateid)); 3577 seqid, STATEID_VAL(stateid));
3566 3578
3567 *stpp = NULL; 3579 *stpp = NULL;
3568 status = nfsd4_lookup_stateid(stateid, typemask, &s); 3580 status = nfsd4_lookup_stateid(stateid, typemask, &s, cstate->minorversion);
3569 if (status) 3581 if (status)
3570 return status; 3582 return status;
3571 *stpp = openlockstateid(s); 3583 *stpp = openlockstateid(s);
@@ -3766,6 +3778,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3766 memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); 3778 memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
3767 3779
3768 nfsd4_close_open_stateid(stp); 3780 nfsd4_close_open_stateid(stp);
3781 release_last_closed_stateid(oo);
3769 oo->oo_last_closed_stid = stp; 3782 oo->oo_last_closed_stid = stp;
3770 3783
3771 if (list_empty(&oo->oo_owner.so_stateids)) { 3784 if (list_empty(&oo->oo_owner.so_stateids)) {
@@ -3802,7 +3815,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3802 inode = cstate->current_fh.fh_dentry->d_inode; 3815 inode = cstate->current_fh.fh_dentry->d_inode;
3803 3816
3804 nfs4_lock_state(); 3817 nfs4_lock_state();
3805 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s); 3818 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s, cstate->minorversion);
3806 if (status) 3819 if (status)
3807 goto out; 3820 goto out;
3808 dp = delegstateid(s); 3821 dp = delegstateid(s);
@@ -4046,8 +4059,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4046 struct nfs4_lockowner *lock_sop = NULL; 4059 struct nfs4_lockowner *lock_sop = NULL;
4047 struct nfs4_ol_stateid *lock_stp; 4060 struct nfs4_ol_stateid *lock_stp;
4048 struct file *filp = NULL; 4061 struct file *filp = NULL;
4049 struct file_lock file_lock; 4062 struct file_lock *file_lock = NULL;
4050 struct file_lock conflock; 4063 struct file_lock *conflock = NULL;
4051 __be32 status = 0; 4064 __be32 status = 0;
4052 bool new_state = false; 4065 bool new_state = false;
4053 int lkflg; 4066 int lkflg;
@@ -4117,21 +4130,28 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4117 if (!locks_in_grace(SVC_NET(rqstp)) && lock->lk_reclaim) 4130 if (!locks_in_grace(SVC_NET(rqstp)) && lock->lk_reclaim)
4118 goto out; 4131 goto out;
4119 4132
4120 locks_init_lock(&file_lock); 4133 file_lock = locks_alloc_lock();
4134 if (!file_lock) {
4135 dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
4136 status = nfserr_jukebox;
4137 goto out;
4138 }
4139
4140 locks_init_lock(file_lock);
4121 switch (lock->lk_type) { 4141 switch (lock->lk_type) {
4122 case NFS4_READ_LT: 4142 case NFS4_READ_LT:
4123 case NFS4_READW_LT: 4143 case NFS4_READW_LT:
4124 filp = find_readable_file(lock_stp->st_file); 4144 filp = find_readable_file(lock_stp->st_file);
4125 if (filp) 4145 if (filp)
4126 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); 4146 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
4127 file_lock.fl_type = F_RDLCK; 4147 file_lock->fl_type = F_RDLCK;
4128 break; 4148 break;
4129 case NFS4_WRITE_LT: 4149 case NFS4_WRITE_LT:
4130 case NFS4_WRITEW_LT: 4150 case NFS4_WRITEW_LT:
4131 filp = find_writeable_file(lock_stp->st_file); 4151 filp = find_writeable_file(lock_stp->st_file);
4132 if (filp) 4152 if (filp)
4133 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); 4153 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
4134 file_lock.fl_type = F_WRLCK; 4154 file_lock->fl_type = F_WRLCK;
4135 break; 4155 break;
4136 default: 4156 default:
4137 status = nfserr_inval; 4157 status = nfserr_inval;
@@ -4141,22 +4161,23 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4141 status = nfserr_openmode; 4161 status = nfserr_openmode;
4142 goto out; 4162 goto out;
4143 } 4163 }
4144 file_lock.fl_owner = (fl_owner_t)lock_sop; 4164 file_lock->fl_owner = (fl_owner_t)lock_sop;
4145 file_lock.fl_pid = current->tgid; 4165 file_lock->fl_pid = current->tgid;
4146 file_lock.fl_file = filp; 4166 file_lock->fl_file = filp;
4147 file_lock.fl_flags = FL_POSIX; 4167 file_lock->fl_flags = FL_POSIX;
4148 file_lock.fl_lmops = &nfsd_posix_mng_ops; 4168 file_lock->fl_lmops = &nfsd_posix_mng_ops;
4149 4169 file_lock->fl_start = lock->lk_offset;
4150 file_lock.fl_start = lock->lk_offset; 4170 file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length);
4151 file_lock.fl_end = last_byte_offset(lock->lk_offset, lock->lk_length); 4171 nfs4_transform_lock_offset(file_lock);
4152 nfs4_transform_lock_offset(&file_lock); 4172
4153 4173 conflock = locks_alloc_lock();
4154 /* 4174 if (!conflock) {
4155 * Try to lock the file in the VFS. 4175 dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
4156 * Note: locks.c uses the BKL to protect the inode's lock list. 4176 status = nfserr_jukebox;
4157 */ 4177 goto out;
4178 }
4158 4179
4159 err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock); 4180 err = vfs_lock_file(filp, F_SETLK, file_lock, conflock);
4160 switch (-err) { 4181 switch (-err) {
4161 case 0: /* success! */ 4182 case 0: /* success! */
4162 update_stateid(&lock_stp->st_stid.sc_stateid); 4183 update_stateid(&lock_stp->st_stid.sc_stateid);
@@ -4167,7 +4188,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4167 case (EAGAIN): /* conflock holds conflicting lock */ 4188 case (EAGAIN): /* conflock holds conflicting lock */
4168 status = nfserr_denied; 4189 status = nfserr_denied;
4169 dprintk("NFSD: nfsd4_lock: conflicting lock found!\n"); 4190 dprintk("NFSD: nfsd4_lock: conflicting lock found!\n");
4170 nfs4_set_lock_denied(&conflock, &lock->lk_denied); 4191 nfs4_set_lock_denied(conflock, &lock->lk_denied);
4171 break; 4192 break;
4172 case (EDEADLK): 4193 case (EDEADLK):
4173 status = nfserr_deadlock; 4194 status = nfserr_deadlock;
@@ -4182,6 +4203,10 @@ out:
4182 release_lockowner(lock_sop); 4203 release_lockowner(lock_sop);
4183 if (!cstate->replay_owner) 4204 if (!cstate->replay_owner)
4184 nfs4_unlock_state(); 4205 nfs4_unlock_state();
4206 if (file_lock)
4207 locks_free_lock(file_lock);
4208 if (conflock)
4209 locks_free_lock(conflock);
4185 return status; 4210 return status;
4186} 4211}
4187 4212
@@ -4210,7 +4235,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4210 struct nfsd4_lockt *lockt) 4235 struct nfsd4_lockt *lockt)
4211{ 4236{
4212 struct inode *inode; 4237 struct inode *inode;
4213 struct file_lock file_lock; 4238 struct file_lock *file_lock = NULL;
4214 struct nfs4_lockowner *lo; 4239 struct nfs4_lockowner *lo;
4215 __be32 status; 4240 __be32 status;
4216 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); 4241 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
@@ -4231,15 +4256,21 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4231 goto out; 4256 goto out;
4232 4257
4233 inode = cstate->current_fh.fh_dentry->d_inode; 4258 inode = cstate->current_fh.fh_dentry->d_inode;
4234 locks_init_lock(&file_lock); 4259 file_lock = locks_alloc_lock();
4260 if (!file_lock) {
4261 dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
4262 status = nfserr_jukebox;
4263 goto out;
4264 }
4265 locks_init_lock(file_lock);
4235 switch (lockt->lt_type) { 4266 switch (lockt->lt_type) {
4236 case NFS4_READ_LT: 4267 case NFS4_READ_LT:
4237 case NFS4_READW_LT: 4268 case NFS4_READW_LT:
4238 file_lock.fl_type = F_RDLCK; 4269 file_lock->fl_type = F_RDLCK;
4239 break; 4270 break;
4240 case NFS4_WRITE_LT: 4271 case NFS4_WRITE_LT:
4241 case NFS4_WRITEW_LT: 4272 case NFS4_WRITEW_LT:
4242 file_lock.fl_type = F_WRLCK; 4273 file_lock->fl_type = F_WRLCK;
4243 break; 4274 break;
4244 default: 4275 default:
4245 dprintk("NFSD: nfs4_lockt: bad lock type!\n"); 4276 dprintk("NFSD: nfs4_lockt: bad lock type!\n");
@@ -4249,25 +4280,27 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4249 4280
4250 lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner); 4281 lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner);
4251 if (lo) 4282 if (lo)
4252 file_lock.fl_owner = (fl_owner_t)lo; 4283 file_lock->fl_owner = (fl_owner_t)lo;
4253 file_lock.fl_pid = current->tgid; 4284 file_lock->fl_pid = current->tgid;
4254 file_lock.fl_flags = FL_POSIX; 4285 file_lock->fl_flags = FL_POSIX;
4255 4286
4256 file_lock.fl_start = lockt->lt_offset; 4287 file_lock->fl_start = lockt->lt_offset;
4257 file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length); 4288 file_lock->fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length);
4258 4289
4259 nfs4_transform_lock_offset(&file_lock); 4290 nfs4_transform_lock_offset(file_lock);
4260 4291
4261 status = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock); 4292 status = nfsd_test_lock(rqstp, &cstate->current_fh, file_lock);
4262 if (status) 4293 if (status)
4263 goto out; 4294 goto out;
4264 4295
4265 if (file_lock.fl_type != F_UNLCK) { 4296 if (file_lock->fl_type != F_UNLCK) {
4266 status = nfserr_denied; 4297 status = nfserr_denied;
4267 nfs4_set_lock_denied(&file_lock, &lockt->lt_denied); 4298 nfs4_set_lock_denied(file_lock, &lockt->lt_denied);
4268 } 4299 }
4269out: 4300out:
4270 nfs4_unlock_state(); 4301 nfs4_unlock_state();
4302 if (file_lock)
4303 locks_free_lock(file_lock);
4271 return status; 4304 return status;
4272} 4305}
4273 4306
@@ -4277,7 +4310,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4277{ 4310{
4278 struct nfs4_ol_stateid *stp; 4311 struct nfs4_ol_stateid *stp;
4279 struct file *filp = NULL; 4312 struct file *filp = NULL;
4280 struct file_lock file_lock; 4313 struct file_lock *file_lock = NULL;
4281 __be32 status; 4314 __be32 status;
4282 int err; 4315 int err;
4283 4316
@@ -4299,23 +4332,29 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4299 status = nfserr_lock_range; 4332 status = nfserr_lock_range;
4300 goto out; 4333 goto out;
4301 } 4334 }
4302 BUG_ON(!filp); 4335 file_lock = locks_alloc_lock();
4303 locks_init_lock(&file_lock); 4336 if (!file_lock) {
4304 file_lock.fl_type = F_UNLCK; 4337 dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
4305 file_lock.fl_owner = (fl_owner_t)lockowner(stp->st_stateowner); 4338 status = nfserr_jukebox;
4306 file_lock.fl_pid = current->tgid; 4339 goto out;
4307 file_lock.fl_file = filp; 4340 }
4308 file_lock.fl_flags = FL_POSIX; 4341 locks_init_lock(file_lock);
4309 file_lock.fl_lmops = &nfsd_posix_mng_ops; 4342 file_lock->fl_type = F_UNLCK;
4310 file_lock.fl_start = locku->lu_offset; 4343 file_lock->fl_owner = (fl_owner_t)lockowner(stp->st_stateowner);
4311 4344 file_lock->fl_pid = current->tgid;
4312 file_lock.fl_end = last_byte_offset(locku->lu_offset, locku->lu_length); 4345 file_lock->fl_file = filp;
4313 nfs4_transform_lock_offset(&file_lock); 4346 file_lock->fl_flags = FL_POSIX;
4347 file_lock->fl_lmops = &nfsd_posix_mng_ops;
4348 file_lock->fl_start = locku->lu_offset;
4349
4350 file_lock->fl_end = last_byte_offset(locku->lu_offset,
4351 locku->lu_length);
4352 nfs4_transform_lock_offset(file_lock);
4314 4353
4315 /* 4354 /*
4316 * Try to unlock the file in the VFS. 4355 * Try to unlock the file in the VFS.
4317 */ 4356 */
4318 err = vfs_lock_file(filp, F_SETLK, &file_lock, NULL); 4357 err = vfs_lock_file(filp, F_SETLK, file_lock, NULL);
4319 if (err) { 4358 if (err) {
4320 dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n"); 4359 dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n");
4321 goto out_nfserr; 4360 goto out_nfserr;
@@ -4329,6 +4368,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4329out: 4368out:
4330 if (!cstate->replay_owner) 4369 if (!cstate->replay_owner)
4331 nfs4_unlock_state(); 4370 nfs4_unlock_state();
4371 if (file_lock)
4372 locks_free_lock(file_lock);
4332 return status; 4373 return status;
4333 4374
4334out_nfserr: 4375out_nfserr:
@@ -4502,12 +4543,12 @@ nfsd4_find_reclaim_client(struct nfs4_client *clp)
4502* Called from OPEN. Look for clientid in reclaim list. 4543* Called from OPEN. Look for clientid in reclaim list.
4503*/ 4544*/
4504__be32 4545__be32
4505nfs4_check_open_reclaim(clientid_t *clid) 4546nfs4_check_open_reclaim(clientid_t *clid, bool sessions)
4506{ 4547{
4507 struct nfs4_client *clp; 4548 struct nfs4_client *clp;
4508 4549
4509 /* find clientid in conf_id_hashtbl */ 4550 /* find clientid in conf_id_hashtbl */
4510 clp = find_confirmed_client(clid); 4551 clp = find_confirmed_client(clid, sessions);
4511 if (clp == NULL) 4552 if (clp == NULL)
4512 return nfserr_reclaim_bad; 4553 return nfserr_reclaim_bad;
4513 4554
@@ -4523,7 +4564,6 @@ void nfsd_forget_clients(u64 num)
4523 4564
4524 nfs4_lock_state(); 4565 nfs4_lock_state();
4525 list_for_each_entry_safe(clp, next, &client_lru, cl_lru) { 4566 list_for_each_entry_safe(clp, next, &client_lru, cl_lru) {
4526 nfsd4_client_record_remove(clp);
4527 expire_client(clp); 4567 expire_client(clp);
4528 if (++count == num) 4568 if (++count == num)
4529 break; 4569 break;
@@ -4583,7 +4623,7 @@ void nfsd_forget_openowners(u64 num)
4583 printk(KERN_INFO "NFSD: Forgot %d open owners", count); 4623 printk(KERN_INFO "NFSD: Forgot %d open owners", count);
4584} 4624}
4585 4625
4586int nfsd_process_n_delegations(u64 num, struct list_head *list) 4626static int nfsd_process_n_delegations(u64 num, struct list_head *list)
4587{ 4627{
4588 int i, count = 0; 4628 int i, count = 0;
4589 struct nfs4_file *fp, *fnext; 4629 struct nfs4_file *fp, *fnext;
@@ -4748,11 +4788,11 @@ __nfs4_state_shutdown(void)
4748 for (i = 0; i < CLIENT_HASH_SIZE; i++) { 4788 for (i = 0; i < CLIENT_HASH_SIZE; i++) {
4749 while (!list_empty(&conf_id_hashtbl[i])) { 4789 while (!list_empty(&conf_id_hashtbl[i])) {
4750 clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); 4790 clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
4751 expire_client(clp); 4791 destroy_client(clp);
4752 } 4792 }
4753 while (!list_empty(&unconf_str_hashtbl[i])) { 4793 while (!list_empty(&unconf_str_hashtbl[i])) {
4754 clp = list_entry(unconf_str_hashtbl[i].next, struct nfs4_client, cl_strhash); 4794 clp = list_entry(unconf_str_hashtbl[i].next, struct nfs4_client, cl_strhash);
4755 expire_client(clp); 4795 destroy_client(clp);
4756 } 4796 }
4757 } 4797 }
4758 INIT_LIST_HEAD(&reaplist); 4798 INIT_LIST_HEAD(&reaplist);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 6322df36031f..fd548d155088 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2659,7 +2659,7 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
2659 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8); 2659 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8);
2660 WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); 2660 WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
2661 WRITE32(bcts->dir); 2661 WRITE32(bcts->dir);
2662 /* XXX: ? */ 2662 /* Sorry, we do not yet support RDMA over 4.1: */
2663 WRITE32(0); 2663 WRITE32(0);
2664 ADJUST_ARGS(); 2664 ADJUST_ARGS();
2665 } 2665 }
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index fa49cff5ee65..dab350dfc376 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -406,7 +406,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
406 return rv; 406 return rv;
407 if (newthreads < 0) 407 if (newthreads < 0)
408 return -EINVAL; 408 return -EINVAL;
409 rv = nfsd_svc(NFS_PORT, newthreads); 409 rv = nfsd_svc(newthreads);
410 if (rv < 0) 410 if (rv < 0)
411 return rv; 411 return rv;
412 } else 412 } else
@@ -683,25 +683,6 @@ static ssize_t __write_ports_addfd(char *buf)
683} 683}
684 684
685/* 685/*
686 * A '-' followed by the 'name' of a socket means we close the socket.
687 */
688static ssize_t __write_ports_delfd(char *buf)
689{
690 char *toclose;
691 int len = 0;
692
693 toclose = kstrdup(buf + 1, GFP_KERNEL);
694 if (toclose == NULL)
695 return -ENOMEM;
696
697 if (nfsd_serv != NULL)
698 len = svc_sock_names(nfsd_serv, buf,
699 SIMPLE_TRANSACTION_LIMIT, toclose);
700 kfree(toclose);
701 return len;
702}
703
704/*
705 * A transport listener is added by writing it's transport name and 686 * A transport listener is added by writing it's transport name and
706 * a port number. 687 * a port number.
707 */ 688 */
@@ -712,7 +693,7 @@ static ssize_t __write_ports_addxprt(char *buf)
712 int port, err; 693 int port, err;
713 struct net *net = &init_net; 694 struct net *net = &init_net;
714 695
715 if (sscanf(buf, "%15s %4u", transport, &port) != 2) 696 if (sscanf(buf, "%15s %5u", transport, &port) != 2)
716 return -EINVAL; 697 return -EINVAL;
717 698
718 if (port < 1 || port > USHRT_MAX) 699 if (port < 1 || port > USHRT_MAX)
@@ -746,31 +727,6 @@ out_err:
746 return err; 727 return err;
747} 728}
748 729
749/*
750 * A transport listener is removed by writing a "-", it's transport
751 * name, and it's port number.
752 */
753static ssize_t __write_ports_delxprt(char *buf)
754{
755 struct svc_xprt *xprt;
756 char transport[16];
757 int port;
758
759 if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2)
760 return -EINVAL;
761
762 if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL)
763 return -EINVAL;
764
765 xprt = svc_find_xprt(nfsd_serv, transport, &init_net, AF_UNSPEC, port);
766 if (xprt == NULL)
767 return -ENOTCONN;
768
769 svc_close_xprt(xprt);
770 svc_xprt_put(xprt);
771 return 0;
772}
773
774static ssize_t __write_ports(struct file *file, char *buf, size_t size) 730static ssize_t __write_ports(struct file *file, char *buf, size_t size)
775{ 731{
776 if (size == 0) 732 if (size == 0)
@@ -779,15 +735,9 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
779 if (isdigit(buf[0])) 735 if (isdigit(buf[0]))
780 return __write_ports_addfd(buf); 736 return __write_ports_addfd(buf);
781 737
782 if (buf[0] == '-' && isdigit(buf[1]))
783 return __write_ports_delfd(buf);
784
785 if (isalpha(buf[0])) 738 if (isalpha(buf[0]))
786 return __write_ports_addxprt(buf); 739 return __write_ports_addxprt(buf);
787 740
788 if (buf[0] == '-' && isalpha(buf[1]))
789 return __write_ports_delxprt(buf);
790
791 return -EINVAL; 741 return -EINVAL;
792} 742}
793 743
@@ -825,21 +775,6 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
825 * OR 775 * OR
826 * 776 *
827 * Input: 777 * Input:
828 * buf: C string containing a "-" followed
829 * by an integer value representing a
830 * previously passed in socket file
831 * descriptor
832 * size: non-zero length of C string in @buf
833 * Output:
834 * On success: NFS service no longer listens on that socket;
835 * passed-in buffer filled with a '\n'-terminated C
836 * string containing a unique name of the listener;
837 * return code is the size in bytes of the string
838 * On error: return code is a negative errno value
839 *
840 * OR
841 *
842 * Input:
843 * buf: C string containing a transport 778 * buf: C string containing a transport
844 * name and an unsigned integer value 779 * name and an unsigned integer value
845 * representing the port to listen on, 780 * representing the port to listen on,
@@ -848,19 +783,6 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
848 * Output: 783 * Output:
849 * On success: returns zero; NFS service is started 784 * On success: returns zero; NFS service is started
850 * On error: return code is a negative errno value 785 * On error: return code is a negative errno value
851 *
852 * OR
853 *
854 * Input:
855 * buf: C string containing a "-" followed
856 * by a transport name and an unsigned
857 * integer value representing the port
858 * to listen on, separated by whitespace
859 * size: non-zero length of C string in @buf
860 * Output:
861 * On success: returns zero; NFS service no longer listens
862 * on that transport
863 * On error: return code is a negative errno value
864 */ 786 */
865static ssize_t write_ports(struct file *file, char *buf, size_t size) 787static ssize_t write_ports(struct file *file, char *buf, size_t size)
866{ 788{
@@ -1008,8 +930,6 @@ static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
1008 return nfsd4_write_time(file, buf, size, &nfsd4_grace); 930 return nfsd4_write_time(file, buf, size, &nfsd4_grace);
1009} 931}
1010 932
1011extern char *nfs4_recoverydir(void);
1012
1013static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size) 933static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
1014{ 934{
1015 char *mesg = buf; 935 char *mesg = buf;
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 2244222368ab..80d5ce40aadb 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -65,7 +65,7 @@ extern const struct seq_operations nfs_exports_op;
65/* 65/*
66 * Function prototypes. 66 * Function prototypes.
67 */ 67 */
68int nfsd_svc(unsigned short port, int nrservs); 68int nfsd_svc(int nrservs);
69int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp); 69int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp);
70 70
71int nfsd_nrthreads(void); 71int nfsd_nrthreads(void);
@@ -124,6 +124,7 @@ int nfs4_state_start(void);
124void nfs4_state_shutdown(void); 124void nfs4_state_shutdown(void);
125void nfs4_reset_lease(time_t leasetime); 125void nfs4_reset_lease(time_t leasetime);
126int nfs4_reset_recoverydir(char *recdir); 126int nfs4_reset_recoverydir(char *recdir);
127char * nfs4_recoverydir(void);
127#else 128#else
128static inline void nfs4_state_init(void) { } 129static inline void nfs4_state_init(void) { }
129static inline int nfsd4_init_slabs(void) { return 0; } 130static inline int nfsd4_init_slabs(void) { return 0; }
@@ -132,6 +133,7 @@ static inline int nfs4_state_start(void) { return 0; }
132static inline void nfs4_state_shutdown(void) { } 133static inline void nfs4_state_shutdown(void) { }
133static inline void nfs4_reset_lease(time_t leasetime) { } 134static inline void nfs4_reset_lease(time_t leasetime) { }
134static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } 135static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
136static inline char * nfs4_recoverydir(void) {return NULL; }
135#endif 137#endif
136 138
137/* 139/*
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 240473cb708f..2013aa001dab 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -183,18 +183,18 @@ int nfsd_nrthreads(void)
183 return rv; 183 return rv;
184} 184}
185 185
186static int nfsd_init_socks(int port) 186static int nfsd_init_socks(void)
187{ 187{
188 int error; 188 int error;
189 if (!list_empty(&nfsd_serv->sv_permsocks)) 189 if (!list_empty(&nfsd_serv->sv_permsocks))
190 return 0; 190 return 0;
191 191
192 error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, port, 192 error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, NFS_PORT,
193 SVC_SOCK_DEFAULTS); 193 SVC_SOCK_DEFAULTS);
194 if (error < 0) 194 if (error < 0)
195 return error; 195 return error;
196 196
197 error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, port, 197 error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, NFS_PORT,
198 SVC_SOCK_DEFAULTS); 198 SVC_SOCK_DEFAULTS);
199 if (error < 0) 199 if (error < 0)
200 return error; 200 return error;
@@ -204,7 +204,7 @@ static int nfsd_init_socks(int port)
204 204
205static bool nfsd_up = false; 205static bool nfsd_up = false;
206 206
207static int nfsd_startup(unsigned short port, int nrservs) 207static int nfsd_startup(int nrservs)
208{ 208{
209 int ret; 209 int ret;
210 210
@@ -218,7 +218,7 @@ static int nfsd_startup(unsigned short port, int nrservs)
218 ret = nfsd_racache_init(2*nrservs); 218 ret = nfsd_racache_init(2*nrservs);
219 if (ret) 219 if (ret)
220 return ret; 220 return ret;
221 ret = nfsd_init_socks(port); 221 ret = nfsd_init_socks();
222 if (ret) 222 if (ret)
223 goto out_racache; 223 goto out_racache;
224 ret = lockd_up(&init_net); 224 ret = lockd_up(&init_net);
@@ -436,7 +436,7 @@ int nfsd_set_nrthreads(int n, int *nthreads)
436 * this is the first time nrservs is nonzero. 436 * this is the first time nrservs is nonzero.
437 */ 437 */
438int 438int
439nfsd_svc(unsigned short port, int nrservs) 439nfsd_svc(int nrservs)
440{ 440{
441 int error; 441 int error;
442 bool nfsd_up_before; 442 bool nfsd_up_before;
@@ -458,7 +458,7 @@ nfsd_svc(unsigned short port, int nrservs)
458 458
459 nfsd_up_before = nfsd_up; 459 nfsd_up_before = nfsd_up;
460 460
461 error = nfsd_startup(port, nrservs); 461 error = nfsd_startup(nrservs);
462 if (error) 462 if (error)
463 goto out_destroy; 463 goto out_destroy;
464 error = svc_set_num_threads(nfsd_serv, NULL, nrservs); 464 error = svc_set_num_threads(nfsd_serv, NULL, nrservs);
@@ -487,7 +487,7 @@ static int
487nfsd(void *vrqstp) 487nfsd(void *vrqstp)
488{ 488{
489 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; 489 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
490 int err, preverr = 0; 490 int err;
491 491
492 /* Lock module and set up kernel thread */ 492 /* Lock module and set up kernel thread */
493 mutex_lock(&nfsd_mutex); 493 mutex_lock(&nfsd_mutex);
@@ -534,16 +534,6 @@ nfsd(void *vrqstp)
534 ; 534 ;
535 if (err == -EINTR) 535 if (err == -EINTR)
536 break; 536 break;
537 else if (err < 0) {
538 if (err != preverr) {
539 printk(KERN_WARNING "%s: unexpected error "
540 "from svc_recv (%d)\n", __func__, -err);
541 preverr = err;
542 }
543 schedule_timeout_uninterruptible(HZ);
544 continue;
545 }
546
547 validate_process_creds(); 537 validate_process_creds();
548 svc_process(rqstp); 538 svc_process(rqstp);
549 validate_process_creds(); 539 validate_process_creds();
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 22bd0a66c356..e036894bce57 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -373,11 +373,7 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so)
373 return container_of(so, struct nfs4_lockowner, lo_owner); 373 return container_of(so, struct nfs4_lockowner, lo_owner);
374} 374}
375 375
376/* 376/* nfs4_file: a file opened by some number of (open) nfs4_stateowners. */
377* nfs4_file: a file opened by some number of (open) nfs4_stateowners.
378* o fi_perfile list is used to search for conflicting
379* share_acces, share_deny on the file.
380*/
381struct nfs4_file { 377struct nfs4_file {
382 atomic_t fi_ref; 378 atomic_t fi_ref;
383 struct list_head fi_hash; /* hash by "struct inode *" */ 379 struct list_head fi_hash; /* hash by "struct inode *" */
@@ -459,7 +455,7 @@ extern void nfs4_unlock_state(void);
459extern int nfs4_in_grace(void); 455extern int nfs4_in_grace(void);
460extern void nfs4_release_reclaim(void); 456extern void nfs4_release_reclaim(void);
461extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct nfs4_client *crp); 457extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct nfs4_client *crp);
462extern __be32 nfs4_check_open_reclaim(clientid_t *clid); 458extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions);
463extern void nfs4_free_openowner(struct nfs4_openowner *); 459extern void nfs4_free_openowner(struct nfs4_openowner *);
464extern void nfs4_free_lockowner(struct nfs4_lockowner *); 460extern void nfs4_free_lockowner(struct nfs4_lockowner *);
465extern int set_callback_cred(void); 461extern int set_callback_cred(void);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a9269f142cc4..c120b48ec305 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -480,7 +480,7 @@ set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key)
480 if (buf == NULL) 480 if (buf == NULL)
481 goto out; 481 goto out;
482 482
483 len = posix_acl_to_xattr(pacl, buf, buflen); 483 len = posix_acl_to_xattr(&init_user_ns, pacl, buf, buflen);
484 if (len < 0) { 484 if (len < 0) {
485 error = len; 485 error = len;
486 goto out; 486 goto out;
@@ -549,7 +549,7 @@ _get_posix_acl(struct dentry *dentry, char *key)
549 if (buflen <= 0) 549 if (buflen <= 0)
550 return ERR_PTR(buflen); 550 return ERR_PTR(buflen);
551 551
552 pacl = posix_acl_from_xattr(buf, buflen); 552 pacl = posix_acl_from_xattr(&init_user_ns, buf, buflen);
553 kfree(buf); 553 kfree(buf);
554 return pacl; 554 return pacl;
555} 555}
@@ -1581,7 +1581,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
1581 */ 1581 */
1582 1582
1583 oldfs = get_fs(); set_fs(KERNEL_DS); 1583 oldfs = get_fs(); set_fs(KERNEL_DS);
1584 host_err = inode->i_op->readlink(path.dentry, buf, *lenp); 1584 host_err = inode->i_op->readlink(path.dentry, (char __user *)buf, *lenp);
1585 set_fs(oldfs); 1585 set_fs(oldfs);
1586 1586
1587 if (host_err < 0) 1587 if (host_err < 0)
@@ -2264,7 +2264,7 @@ nfsd_get_posix_acl(struct svc_fh *fhp, int type)
2264 if (size < 0) 2264 if (size < 0)
2265 return ERR_PTR(size); 2265 return ERR_PTR(size);
2266 2266
2267 acl = posix_acl_from_xattr(value, size); 2267 acl = posix_acl_from_xattr(&init_user_ns, value, size);
2268 kfree(value); 2268 kfree(value);
2269 return acl; 2269 return acl;
2270} 2270}
@@ -2297,7 +2297,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
2297 value = kmalloc(size, GFP_KERNEL); 2297 value = kmalloc(size, GFP_KERNEL);
2298 if (!value) 2298 if (!value)
2299 return -ENOMEM; 2299 return -ENOMEM;
2300 error = posix_acl_to_xattr(acl, value, size); 2300 error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
2301 if (error < 0) 2301 if (error < 0)
2302 goto getout; 2302 goto getout;
2303 size = error; 2303 size = error;
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index a4d56ac02e6c..16f35f7423c5 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -116,6 +116,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
116 if (unlikely(ret)) 116 if (unlikely(ret))
117 goto out; 117 goto out;
118 118
119 file_update_time(vma->vm_file);
119 ret = __block_page_mkwrite(vma, vmf, nilfs_get_block); 120 ret = __block_page_mkwrite(vma, vmf, nilfs_get_block);
120 if (ret) { 121 if (ret) {
121 nilfs_transaction_abort(inode->i_sb); 122 nilfs_transaction_abort(inode->i_sb);
@@ -134,13 +135,13 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
134static const struct vm_operations_struct nilfs_file_vm_ops = { 135static const struct vm_operations_struct nilfs_file_vm_ops = {
135 .fault = filemap_fault, 136 .fault = filemap_fault,
136 .page_mkwrite = nilfs_page_mkwrite, 137 .page_mkwrite = nilfs_page_mkwrite,
138 .remap_pages = generic_file_remap_pages,
137}; 139};
138 140
139static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) 141static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
140{ 142{
141 file_accessed(file); 143 file_accessed(file);
142 vma->vm_ops = &nilfs_file_vm_ops; 144 vma->vm_ops = &nilfs_file_vm_ops;
143 vma->vm_flags |= VM_CAN_NONLINEAR;
144 return 0; 145 return 0;
145} 146}
146 147
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 6e2c3db976b2..4d31d2cca7fd 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -401,8 +401,8 @@ int nilfs_read_inode_common(struct inode *inode,
401 int err; 401 int err;
402 402
403 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 403 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
404 inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid); 404 i_uid_write(inode, le32_to_cpu(raw_inode->i_uid));
405 inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid); 405 i_gid_write(inode, le32_to_cpu(raw_inode->i_gid));
406 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); 406 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
407 inode->i_size = le64_to_cpu(raw_inode->i_size); 407 inode->i_size = le64_to_cpu(raw_inode->i_size);
408 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); 408 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
@@ -590,8 +590,8 @@ void nilfs_write_inode_common(struct inode *inode,
590 struct nilfs_inode_info *ii = NILFS_I(inode); 590 struct nilfs_inode_info *ii = NILFS_I(inode);
591 591
592 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 592 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
593 raw_inode->i_uid = cpu_to_le32(inode->i_uid); 593 raw_inode->i_uid = cpu_to_le32(i_uid_read(inode));
594 raw_inode->i_gid = cpu_to_le32(inode->i_gid); 594 raw_inode->i_gid = cpu_to_le32(i_gid_read(inode));
595 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 595 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
596 raw_inode->i_size = cpu_to_le64(inode->i_size); 596 raw_inode->i_size = cpu_to_le64(inode->i_size);
597 raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 597 raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 6a10812711c1..3c991dc84f2f 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1382,6 +1382,12 @@ static void nilfs_segbuf_init_once(void *obj)
1382 1382
1383static void nilfs_destroy_cachep(void) 1383static void nilfs_destroy_cachep(void)
1384{ 1384{
1385 /*
1386 * Make sure all delayed rcu free inodes are flushed before we
1387 * destroy cache.
1388 */
1389 rcu_barrier();
1390
1385 if (nilfs_inode_cachep) 1391 if (nilfs_inode_cachep)
1386 kmem_cache_destroy(nilfs_inode_cachep); 1392 kmem_cache_destroy(nilfs_inode_cachep);
1387 if (nilfs_transaction_cachep) 1393 if (nilfs_transaction_cachep)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index d43803669739..721d692fa8d4 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -58,7 +58,9 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
58 return fsnotify_remove_notify_event(group); 58 return fsnotify_remove_notify_event(group);
59} 59}
60 60
61static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event) 61static int create_fd(struct fsnotify_group *group,
62 struct fsnotify_event *event,
63 struct file **file)
62{ 64{
63 int client_fd; 65 int client_fd;
64 struct file *new_file; 66 struct file *new_file;
@@ -98,7 +100,7 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
98 put_unused_fd(client_fd); 100 put_unused_fd(client_fd);
99 client_fd = PTR_ERR(new_file); 101 client_fd = PTR_ERR(new_file);
100 } else { 102 } else {
101 fd_install(client_fd, new_file); 103 *file = new_file;
102 } 104 }
103 105
104 return client_fd; 106 return client_fd;
@@ -106,13 +108,15 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
106 108
107static int fill_event_metadata(struct fsnotify_group *group, 109static int fill_event_metadata(struct fsnotify_group *group,
108 struct fanotify_event_metadata *metadata, 110 struct fanotify_event_metadata *metadata,
109 struct fsnotify_event *event) 111 struct fsnotify_event *event,
112 struct file **file)
110{ 113{
111 int ret = 0; 114 int ret = 0;
112 115
113 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, 116 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
114 group, metadata, event); 117 group, metadata, event);
115 118
119 *file = NULL;
116 metadata->event_len = FAN_EVENT_METADATA_LEN; 120 metadata->event_len = FAN_EVENT_METADATA_LEN;
117 metadata->metadata_len = FAN_EVENT_METADATA_LEN; 121 metadata->metadata_len = FAN_EVENT_METADATA_LEN;
118 metadata->vers = FANOTIFY_METADATA_VERSION; 122 metadata->vers = FANOTIFY_METADATA_VERSION;
@@ -121,7 +125,7 @@ static int fill_event_metadata(struct fsnotify_group *group,
121 if (unlikely(event->mask & FAN_Q_OVERFLOW)) 125 if (unlikely(event->mask & FAN_Q_OVERFLOW))
122 metadata->fd = FAN_NOFD; 126 metadata->fd = FAN_NOFD;
123 else { 127 else {
124 metadata->fd = create_fd(group, event); 128 metadata->fd = create_fd(group, event, file);
125 if (metadata->fd < 0) 129 if (metadata->fd < 0)
126 ret = metadata->fd; 130 ret = metadata->fd;
127 } 131 }
@@ -220,25 +224,6 @@ static int prepare_for_access_response(struct fsnotify_group *group,
220 return 0; 224 return 0;
221} 225}
222 226
223static void remove_access_response(struct fsnotify_group *group,
224 struct fsnotify_event *event,
225 __s32 fd)
226{
227 struct fanotify_response_event *re;
228
229 if (!(event->mask & FAN_ALL_PERM_EVENTS))
230 return;
231
232 re = dequeue_re(group, fd);
233 if (!re)
234 return;
235
236 BUG_ON(re->event != event);
237
238 kmem_cache_free(fanotify_response_event_cache, re);
239
240 return;
241}
242#else 227#else
243static int prepare_for_access_response(struct fsnotify_group *group, 228static int prepare_for_access_response(struct fsnotify_group *group,
244 struct fsnotify_event *event, 229 struct fsnotify_event *event,
@@ -247,12 +232,6 @@ static int prepare_for_access_response(struct fsnotify_group *group,
247 return 0; 232 return 0;
248} 233}
249 234
250static void remove_access_response(struct fsnotify_group *group,
251 struct fsnotify_event *event,
252 __s32 fd)
253{
254 return;
255}
256#endif 235#endif
257 236
258static ssize_t copy_event_to_user(struct fsnotify_group *group, 237static ssize_t copy_event_to_user(struct fsnotify_group *group,
@@ -260,31 +239,33 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
260 char __user *buf) 239 char __user *buf)
261{ 240{
262 struct fanotify_event_metadata fanotify_event_metadata; 241 struct fanotify_event_metadata fanotify_event_metadata;
242 struct file *f;
263 int fd, ret; 243 int fd, ret;
264 244
265 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 245 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
266 246
267 ret = fill_event_metadata(group, &fanotify_event_metadata, event); 247 ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f);
268 if (ret < 0) 248 if (ret < 0)
269 goto out; 249 goto out;
270 250
271 fd = fanotify_event_metadata.fd; 251 fd = fanotify_event_metadata.fd;
272 ret = prepare_for_access_response(group, event, fd);
273 if (ret)
274 goto out_close_fd;
275
276 ret = -EFAULT; 252 ret = -EFAULT;
277 if (copy_to_user(buf, &fanotify_event_metadata, 253 if (copy_to_user(buf, &fanotify_event_metadata,
278 fanotify_event_metadata.event_len)) 254 fanotify_event_metadata.event_len))
279 goto out_kill_access_response; 255 goto out_close_fd;
280 256
257 ret = prepare_for_access_response(group, event, fd);
258 if (ret)
259 goto out_close_fd;
260
261 fd_install(fd, f);
281 return fanotify_event_metadata.event_len; 262 return fanotify_event_metadata.event_len;
282 263
283out_kill_access_response:
284 remove_access_response(group, event, fd);
285out_close_fd: 264out_close_fd:
286 if (fd != FAN_NOFD) 265 if (fd != FAN_NOFD) {
287 sys_close(fd); 266 put_unused_fd(fd);
267 fput(f);
268 }
288out: 269out:
289#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 270#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
290 if (event->mask & FAN_ALL_PERM_EVENTS) { 271 if (event->mask & FAN_ALL_PERM_EVENTS) {
@@ -470,24 +451,22 @@ static int fanotify_find_path(int dfd, const char __user *filename,
470 dfd, filename, flags); 451 dfd, filename, flags);
471 452
472 if (filename == NULL) { 453 if (filename == NULL) {
473 struct file *file; 454 struct fd f = fdget(dfd);
474 int fput_needed;
475 455
476 ret = -EBADF; 456 ret = -EBADF;
477 file = fget_light(dfd, &fput_needed); 457 if (!f.file)
478 if (!file)
479 goto out; 458 goto out;
480 459
481 ret = -ENOTDIR; 460 ret = -ENOTDIR;
482 if ((flags & FAN_MARK_ONLYDIR) && 461 if ((flags & FAN_MARK_ONLYDIR) &&
483 !(S_ISDIR(file->f_path.dentry->d_inode->i_mode))) { 462 !(S_ISDIR(f.file->f_path.dentry->d_inode->i_mode))) {
484 fput_light(file, fput_needed); 463 fdput(f);
485 goto out; 464 goto out;
486 } 465 }
487 466
488 *path = file->f_path; 467 *path = f.file->f_path;
489 path_get(path); 468 path_get(path);
490 fput_light(file, fput_needed); 469 fdput(f);
491 } else { 470 } else {
492 unsigned int lookup_flags = 0; 471 unsigned int lookup_flags = 0;
493 472
@@ -767,9 +746,9 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
767 struct inode *inode = NULL; 746 struct inode *inode = NULL;
768 struct vfsmount *mnt = NULL; 747 struct vfsmount *mnt = NULL;
769 struct fsnotify_group *group; 748 struct fsnotify_group *group;
770 struct file *filp; 749 struct fd f;
771 struct path path; 750 struct path path;
772 int ret, fput_needed; 751 int ret;
773 752
774 pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n", 753 pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
775 __func__, fanotify_fd, flags, dfd, pathname, mask); 754 __func__, fanotify_fd, flags, dfd, pathname, mask);
@@ -803,15 +782,15 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
803#endif 782#endif
804 return -EINVAL; 783 return -EINVAL;
805 784
806 filp = fget_light(fanotify_fd, &fput_needed); 785 f = fdget(fanotify_fd);
807 if (unlikely(!filp)) 786 if (unlikely(!f.file))
808 return -EBADF; 787 return -EBADF;
809 788
810 /* verify that this is indeed an fanotify instance */ 789 /* verify that this is indeed an fanotify instance */
811 ret = -EINVAL; 790 ret = -EINVAL;
812 if (unlikely(filp->f_op != &fanotify_fops)) 791 if (unlikely(f.file->f_op != &fanotify_fops))
813 goto fput_and_out; 792 goto fput_and_out;
814 group = filp->private_data; 793 group = f.file->private_data;
815 794
816 /* 795 /*
817 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not 796 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not
@@ -858,7 +837,7 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
858 837
859 path_put(&path); 838 path_put(&path);
860fput_and_out: 839fput_and_out:
861 fput_light(filp, fput_needed); 840 fdput(f);
862 return ret; 841 return ret;
863} 842}
864 843
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 8445fbc8985c..c311dda054a3 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -757,16 +757,16 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
757 struct fsnotify_group *group; 757 struct fsnotify_group *group;
758 struct inode *inode; 758 struct inode *inode;
759 struct path path; 759 struct path path;
760 struct file *filp; 760 struct fd f;
761 int ret, fput_needed; 761 int ret;
762 unsigned flags = 0; 762 unsigned flags = 0;
763 763
764 filp = fget_light(fd, &fput_needed); 764 f = fdget(fd);
765 if (unlikely(!filp)) 765 if (unlikely(!f.file))
766 return -EBADF; 766 return -EBADF;
767 767
768 /* verify that this is indeed an inotify instance */ 768 /* verify that this is indeed an inotify instance */
769 if (unlikely(filp->f_op != &inotify_fops)) { 769 if (unlikely(f.file->f_op != &inotify_fops)) {
770 ret = -EINVAL; 770 ret = -EINVAL;
771 goto fput_and_out; 771 goto fput_and_out;
772 } 772 }
@@ -782,13 +782,13 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
782 782
783 /* inode held in place by reference to path; group by fget on fd */ 783 /* inode held in place by reference to path; group by fget on fd */
784 inode = path.dentry->d_inode; 784 inode = path.dentry->d_inode;
785 group = filp->private_data; 785 group = f.file->private_data;
786 786
787 /* create/update an inode mark */ 787 /* create/update an inode mark */
788 ret = inotify_update_watch(group, inode, mask); 788 ret = inotify_update_watch(group, inode, mask);
789 path_put(&path); 789 path_put(&path);
790fput_and_out: 790fput_and_out:
791 fput_light(filp, fput_needed); 791 fdput(f);
792 return ret; 792 return ret;
793} 793}
794 794
@@ -796,19 +796,19 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
796{ 796{
797 struct fsnotify_group *group; 797 struct fsnotify_group *group;
798 struct inotify_inode_mark *i_mark; 798 struct inotify_inode_mark *i_mark;
799 struct file *filp; 799 struct fd f;
800 int ret = 0, fput_needed; 800 int ret = 0;
801 801
802 filp = fget_light(fd, &fput_needed); 802 f = fdget(fd);
803 if (unlikely(!filp)) 803 if (unlikely(!f.file))
804 return -EBADF; 804 return -EBADF;
805 805
806 /* verify that this is indeed an inotify instance */ 806 /* verify that this is indeed an inotify instance */
807 ret = -EINVAL; 807 ret = -EINVAL;
808 if (unlikely(filp->f_op != &inotify_fops)) 808 if (unlikely(f.file->f_op != &inotify_fops))
809 goto out; 809 goto out;
810 810
811 group = filp->private_data; 811 group = f.file->private_data;
812 812
813 ret = -EINVAL; 813 ret = -EINVAL;
814 i_mark = inotify_idr_find(group, wd); 814 i_mark = inotify_idr_find(group, wd);
@@ -823,7 +823,7 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
823 fsnotify_put_mark(&i_mark->fsn_mark); 823 fsnotify_put_mark(&i_mark->fsn_mark);
824 824
825out: 825out:
826 fput_light(filp, fput_needed); 826 fdput(f);
827 return ret; 827 return ret;
828} 828}
829 829
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index c6dbd3db6ca8..1d27331e6fc9 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2124,7 +2124,8 @@ int ntfs_read_inode_mount(struct inode *vi)
2124 * ntfs_read_inode() will have set up the default ones. 2124 * ntfs_read_inode() will have set up the default ones.
2125 */ 2125 */
2126 /* Set uid and gid to root. */ 2126 /* Set uid and gid to root. */
2127 vi->i_uid = vi->i_gid = 0; 2127 vi->i_uid = GLOBAL_ROOT_UID;
2128 vi->i_gid = GLOBAL_ROOT_GID;
2128 /* Regular file. No access for anyone. */ 2129 /* Regular file. No access for anyone. */
2129 vi->i_mode = S_IFREG; 2130 vi->i_mode = S_IFREG;
2130 /* No VFS initiated operations allowed for $MFT. */ 2131 /* No VFS initiated operations allowed for $MFT. */
@@ -2312,8 +2313,8 @@ int ntfs_show_options(struct seq_file *sf, struct dentry *root)
2312 ntfs_volume *vol = NTFS_SB(root->d_sb); 2313 ntfs_volume *vol = NTFS_SB(root->d_sb);
2313 int i; 2314 int i;
2314 2315
2315 seq_printf(sf, ",uid=%i", vol->uid); 2316 seq_printf(sf, ",uid=%i", from_kuid_munged(&init_user_ns, vol->uid));
2316 seq_printf(sf, ",gid=%i", vol->gid); 2317 seq_printf(sf, ",gid=%i", from_kgid_munged(&init_user_ns, vol->gid));
2317 if (vol->fmask == vol->dmask) 2318 if (vol->fmask == vol->dmask)
2318 seq_printf(sf, ",umask=0%o", vol->fmask); 2319 seq_printf(sf, ",umask=0%o", vol->fmask);
2319 else { 2320 else {
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 2bc149d6a784..4a8289f8b16c 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -102,8 +102,8 @@ static bool parse_options(ntfs_volume *vol, char *opt)
102 char *p, *v, *ov; 102 char *p, *v, *ov;
103 static char *utf8 = "utf8"; 103 static char *utf8 = "utf8";
104 int errors = 0, sloppy = 0; 104 int errors = 0, sloppy = 0;
105 uid_t uid = (uid_t)-1; 105 kuid_t uid = INVALID_UID;
106 gid_t gid = (gid_t)-1; 106 kgid_t gid = INVALID_GID;
107 umode_t fmask = (umode_t)-1, dmask = (umode_t)-1; 107 umode_t fmask = (umode_t)-1, dmask = (umode_t)-1;
108 int mft_zone_multiplier = -1, on_errors = -1; 108 int mft_zone_multiplier = -1, on_errors = -1;
109 int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1; 109 int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1;
@@ -128,6 +128,30 @@ static bool parse_options(ntfs_volume *vol, char *opt)
128 if (*v) \ 128 if (*v) \
129 goto needs_val; \ 129 goto needs_val; \
130 } 130 }
131#define NTFS_GETOPT_UID(option, variable) \
132 if (!strcmp(p, option)) { \
133 uid_t uid_value; \
134 if (!v || !*v) \
135 goto needs_arg; \
136 uid_value = simple_strtoul(ov = v, &v, 0); \
137 if (*v) \
138 goto needs_val; \
139 variable = make_kuid(current_user_ns(), uid_value); \
140 if (!uid_valid(variable)) \
141 goto needs_val; \
142 }
143#define NTFS_GETOPT_GID(option, variable) \
144 if (!strcmp(p, option)) { \
145 gid_t gid_value; \
146 if (!v || !*v) \
147 goto needs_arg; \
148 gid_value = simple_strtoul(ov = v, &v, 0); \
149 if (*v) \
150 goto needs_val; \
151 variable = make_kgid(current_user_ns(), gid_value); \
152 if (!gid_valid(variable)) \
153 goto needs_val; \
154 }
131#define NTFS_GETOPT_OCTAL(option, variable) \ 155#define NTFS_GETOPT_OCTAL(option, variable) \
132 if (!strcmp(p, option)) { \ 156 if (!strcmp(p, option)) { \
133 if (!v || !*v) \ 157 if (!v || !*v) \
@@ -165,8 +189,8 @@ static bool parse_options(ntfs_volume *vol, char *opt)
165 while ((p = strsep(&opt, ","))) { 189 while ((p = strsep(&opt, ","))) {
166 if ((v = strchr(p, '='))) 190 if ((v = strchr(p, '=')))
167 *v++ = 0; 191 *v++ = 0;
168 NTFS_GETOPT("uid", uid) 192 NTFS_GETOPT_UID("uid", uid)
169 else NTFS_GETOPT("gid", gid) 193 else NTFS_GETOPT_GID("gid", gid)
170 else NTFS_GETOPT_OCTAL("umask", fmask = dmask) 194 else NTFS_GETOPT_OCTAL("umask", fmask = dmask)
171 else NTFS_GETOPT_OCTAL("fmask", fmask) 195 else NTFS_GETOPT_OCTAL("fmask", fmask)
172 else NTFS_GETOPT_OCTAL("dmask", dmask) 196 else NTFS_GETOPT_OCTAL("dmask", dmask)
@@ -283,9 +307,9 @@ no_mount_options:
283 vol->on_errors = on_errors; 307 vol->on_errors = on_errors;
284 if (!vol->on_errors || vol->on_errors == ON_ERRORS_RECOVER) 308 if (!vol->on_errors || vol->on_errors == ON_ERRORS_RECOVER)
285 vol->on_errors |= ON_ERRORS_CONTINUE; 309 vol->on_errors |= ON_ERRORS_CONTINUE;
286 if (uid != (uid_t)-1) 310 if (uid_valid(uid))
287 vol->uid = uid; 311 vol->uid = uid;
288 if (gid != (gid_t)-1) 312 if (gid_valid(gid))
289 vol->gid = gid; 313 vol->gid = gid;
290 if (fmask != (umode_t)-1) 314 if (fmask != (umode_t)-1)
291 vol->fmask = fmask; 315 vol->fmask = fmask;
@@ -1023,7 +1047,8 @@ static bool load_and_init_mft_mirror(ntfs_volume *vol)
1023 * ntfs_read_inode() will have set up the default ones. 1047 * ntfs_read_inode() will have set up the default ones.
1024 */ 1048 */
1025 /* Set uid and gid to root. */ 1049 /* Set uid and gid to root. */
1026 tmp_ino->i_uid = tmp_ino->i_gid = 0; 1050 tmp_ino->i_uid = GLOBAL_ROOT_UID;
1051 tmp_ino->i_gid = GLOBAL_ROOT_GID;
1027 /* Regular file. No access for anyone. */ 1052 /* Regular file. No access for anyone. */
1028 tmp_ino->i_mode = S_IFREG; 1053 tmp_ino->i_mode = S_IFREG;
1029 /* No VFS initiated operations allowed for $MFTMirr. */ 1054 /* No VFS initiated operations allowed for $MFTMirr. */
@@ -3168,6 +3193,12 @@ static void __exit exit_ntfs_fs(void)
3168 ntfs_debug("Unregistering NTFS driver."); 3193 ntfs_debug("Unregistering NTFS driver.");
3169 3194
3170 unregister_filesystem(&ntfs_fs_type); 3195 unregister_filesystem(&ntfs_fs_type);
3196
3197 /*
3198 * Make sure all delayed rcu free inodes are flushed before we
3199 * destroy cache.
3200 */
3201 rcu_barrier();
3171 kmem_cache_destroy(ntfs_big_inode_cache); 3202 kmem_cache_destroy(ntfs_big_inode_cache);
3172 kmem_cache_destroy(ntfs_inode_cache); 3203 kmem_cache_destroy(ntfs_inode_cache);
3173 kmem_cache_destroy(ntfs_name_cache); 3204 kmem_cache_destroy(ntfs_name_cache);
diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h
index 15e3ba8d521a..4f579b02bc76 100644
--- a/fs/ntfs/volume.h
+++ b/fs/ntfs/volume.h
@@ -25,6 +25,7 @@
25#define _LINUX_NTFS_VOLUME_H 25#define _LINUX_NTFS_VOLUME_H
26 26
27#include <linux/rwsem.h> 27#include <linux/rwsem.h>
28#include <linux/uidgid.h>
28 29
29#include "types.h" 30#include "types.h"
30#include "layout.h" 31#include "layout.h"
@@ -46,8 +47,8 @@ typedef struct {
46 sized blocks on the device. */ 47 sized blocks on the device. */
47 /* Configuration provided by user at mount time. */ 48 /* Configuration provided by user at mount time. */
48 unsigned long flags; /* Miscellaneous flags, see below. */ 49 unsigned long flags; /* Miscellaneous flags, see below. */
49 uid_t uid; /* uid that files will be mounted as. */ 50 kuid_t uid; /* uid that files will be mounted as. */
50 gid_t gid; /* gid that files will be mounted as. */ 51 kgid_t gid; /* gid that files will be mounted as. */
51 umode_t fmask; /* The mask for file permissions. */ 52 umode_t fmask; /* The mask for file permissions. */
52 umode_t dmask; /* The mask for directory 53 umode_t dmask; /* The mask for directory
53 permissions. */ 54 permissions. */
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index a7219075b4de..260b16281fc3 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -452,7 +452,7 @@ static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name,
452 return PTR_ERR(acl); 452 return PTR_ERR(acl);
453 if (acl == NULL) 453 if (acl == NULL)
454 return -ENODATA; 454 return -ENODATA;
455 ret = posix_acl_to_xattr(acl, buffer, size); 455 ret = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
456 posix_acl_release(acl); 456 posix_acl_release(acl);
457 457
458 return ret; 458 return ret;
@@ -475,7 +475,7 @@ static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name,
475 return -EPERM; 475 return -EPERM;
476 476
477 if (value) { 477 if (value) {
478 acl = posix_acl_from_xattr(value, size); 478 acl = posix_acl_from_xattr(&init_user_ns, value, size);
479 if (IS_ERR(acl)) 479 if (IS_ERR(acl))
480 return PTR_ERR(acl); 480 return PTR_ERR(acl);
481 else if (acl) { 481 else if (acl) {
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index a4e855e3690e..f7c648d7d6bf 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1746,8 +1746,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1746 long fd; 1746 long fd;
1747 int sectsize; 1747 int sectsize;
1748 char *p = (char *)page; 1748 char *p = (char *)page;
1749 struct file *filp = NULL; 1749 struct fd f;
1750 struct inode *inode = NULL; 1750 struct inode *inode;
1751 ssize_t ret = -EINVAL; 1751 ssize_t ret = -EINVAL;
1752 int live_threshold; 1752 int live_threshold;
1753 1753
@@ -1766,26 +1766,26 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1766 if (fd < 0 || fd >= INT_MAX) 1766 if (fd < 0 || fd >= INT_MAX)
1767 goto out; 1767 goto out;
1768 1768
1769 filp = fget(fd); 1769 f = fdget(fd);
1770 if (filp == NULL) 1770 if (f.file == NULL)
1771 goto out; 1771 goto out;
1772 1772
1773 if (reg->hr_blocks == 0 || reg->hr_start_block == 0 || 1773 if (reg->hr_blocks == 0 || reg->hr_start_block == 0 ||
1774 reg->hr_block_bytes == 0) 1774 reg->hr_block_bytes == 0)
1775 goto out; 1775 goto out2;
1776 1776
1777 inode = igrab(filp->f_mapping->host); 1777 inode = igrab(f.file->f_mapping->host);
1778 if (inode == NULL) 1778 if (inode == NULL)
1779 goto out; 1779 goto out2;
1780 1780
1781 if (!S_ISBLK(inode->i_mode)) 1781 if (!S_ISBLK(inode->i_mode))
1782 goto out; 1782 goto out3;
1783 1783
1784 reg->hr_bdev = I_BDEV(filp->f_mapping->host); 1784 reg->hr_bdev = I_BDEV(f.file->f_mapping->host);
1785 ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL); 1785 ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);
1786 if (ret) { 1786 if (ret) {
1787 reg->hr_bdev = NULL; 1787 reg->hr_bdev = NULL;
1788 goto out; 1788 goto out3;
1789 } 1789 }
1790 inode = NULL; 1790 inode = NULL;
1791 1791
@@ -1797,7 +1797,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1797 "blocksize %u incorrect for device, expected %d", 1797 "blocksize %u incorrect for device, expected %d",
1798 reg->hr_block_bytes, sectsize); 1798 reg->hr_block_bytes, sectsize);
1799 ret = -EINVAL; 1799 ret = -EINVAL;
1800 goto out; 1800 goto out3;
1801 } 1801 }
1802 1802
1803 o2hb_init_region_params(reg); 1803 o2hb_init_region_params(reg);
@@ -1811,13 +1811,13 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1811 ret = o2hb_map_slot_data(reg); 1811 ret = o2hb_map_slot_data(reg);
1812 if (ret) { 1812 if (ret) {
1813 mlog_errno(ret); 1813 mlog_errno(ret);
1814 goto out; 1814 goto out3;
1815 } 1815 }
1816 1816
1817 ret = o2hb_populate_slot_data(reg); 1817 ret = o2hb_populate_slot_data(reg);
1818 if (ret) { 1818 if (ret) {
1819 mlog_errno(ret); 1819 mlog_errno(ret);
1820 goto out; 1820 goto out3;
1821 } 1821 }
1822 1822
1823 INIT_DELAYED_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout); 1823 INIT_DELAYED_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout);
@@ -1847,7 +1847,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1847 if (IS_ERR(hb_task)) { 1847 if (IS_ERR(hb_task)) {
1848 ret = PTR_ERR(hb_task); 1848 ret = PTR_ERR(hb_task);
1849 mlog_errno(ret); 1849 mlog_errno(ret);
1850 goto out; 1850 goto out3;
1851 } 1851 }
1852 1852
1853 spin_lock(&o2hb_live_lock); 1853 spin_lock(&o2hb_live_lock);
@@ -1863,7 +1863,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1863 1863
1864 if (reg->hr_aborted_start) { 1864 if (reg->hr_aborted_start) {
1865 ret = -EIO; 1865 ret = -EIO;
1866 goto out; 1866 goto out3;
1867 } 1867 }
1868 1868
1869 /* Ok, we were woken. Make sure it wasn't by drop_item() */ 1869 /* Ok, we were woken. Make sure it wasn't by drop_item() */
@@ -1882,11 +1882,11 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1882 printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n", 1882 printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n",
1883 config_item_name(&reg->hr_item), reg->hr_dev_name); 1883 config_item_name(&reg->hr_item), reg->hr_dev_name);
1884 1884
1885out3:
1886 iput(inode);
1887out2:
1888 fdput(f);
1885out: 1889out:
1886 if (filp)
1887 fput(filp);
1888 if (inode)
1889 iput(inode);
1890 if (ret < 0) { 1890 if (ret < 0) {
1891 if (reg->hr_bdev) { 1891 if (reg->hr_bdev) {
1892 blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); 1892 blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 8f9cea1597af..c19897d0fe14 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -327,5 +327,5 @@ void o2quo_exit(void)
327{ 327{
328 struct o2quo_state *qs = &o2quo_state; 328 struct o2quo_state *qs = &o2quo_state;
329 329
330 flush_work_sync(&qs->qs_work); 330 flush_work(&qs->qs_work);
331} 331}
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 83b6f98e0665..16b712d260d4 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -691,6 +691,11 @@ static void __exit exit_dlmfs_fs(void)
691 flush_workqueue(user_dlm_worker); 691 flush_workqueue(user_dlm_worker);
692 destroy_workqueue(user_dlm_worker); 692 destroy_workqueue(user_dlm_worker);
693 693
694 /*
695 * Make sure all delayed rcu free inodes are flushed before we
696 * destroy cache.
697 */
698 rcu_barrier();
694 kmem_cache_destroy(dlmfs_inode_cache); 699 kmem_cache_destroy(dlmfs_inode_cache);
695 700
696 bdi_destroy(&dlmfs_backing_dev_info); 701 bdi_destroy(&dlmfs_backing_dev_info);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 46a1f6d75104..5a4ee77cec51 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1184,8 +1184,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1184 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid 1184 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
1185 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1185 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1186 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { 1186 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1187 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, 1187 transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
1188 USRQUOTA);
1189 if (!transfer_to[USRQUOTA]) { 1188 if (!transfer_to[USRQUOTA]) {
1190 status = -ESRCH; 1189 status = -ESRCH;
1191 goto bail_unlock; 1190 goto bail_unlock;
@@ -1194,8 +1193,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1194 if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid 1193 if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
1195 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1194 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1196 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { 1195 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1197 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid, 1196 transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
1198 GRPQUOTA);
1199 if (!transfer_to[GRPQUOTA]) { 1197 if (!transfer_to[GRPQUOTA]) {
1200 status = -ESRCH; 1198 status = -ESRCH;
1201 goto bail_unlock; 1199 goto bail_unlock;
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index d150372fd81d..47a87dda54ce 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -173,6 +173,7 @@ out:
173static const struct vm_operations_struct ocfs2_file_vm_ops = { 173static const struct vm_operations_struct ocfs2_file_vm_ops = {
174 .fault = ocfs2_fault, 174 .fault = ocfs2_fault,
175 .page_mkwrite = ocfs2_page_mkwrite, 175 .page_mkwrite = ocfs2_page_mkwrite,
176 .remap_pages = generic_file_remap_pages,
176}; 177};
177 178
178int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) 179int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
@@ -188,7 +189,6 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
188 ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level); 189 ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level);
189out: 190out:
190 vma->vm_ops = &ocfs2_file_vm_ops; 191 vma->vm_ops = &ocfs2_file_vm_ops;
191 vma->vm_flags |= VM_CAN_NONLINEAR;
192 return 0; 192 return 0;
193} 193}
194 194
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 0a86e302655f..332a281f217e 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -95,7 +95,7 @@ static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
95 struct ocfs2_global_disk_dqblk *d = dp; 95 struct ocfs2_global_disk_dqblk *d = dp;
96 struct mem_dqblk *m = &dquot->dq_dqb; 96 struct mem_dqblk *m = &dquot->dq_dqb;
97 97
98 d->dqb_id = cpu_to_le32(dquot->dq_id); 98 d->dqb_id = cpu_to_le32(from_kqid(&init_user_ns, dquot->dq_id));
99 d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count); 99 d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count);
100 d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit); 100 d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
101 d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit); 101 d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
@@ -112,11 +112,14 @@ static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
112{ 112{
113 struct ocfs2_global_disk_dqblk *d = dp; 113 struct ocfs2_global_disk_dqblk *d = dp;
114 struct ocfs2_mem_dqinfo *oinfo = 114 struct ocfs2_mem_dqinfo *oinfo =
115 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; 115 sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv;
116 116
117 if (qtree_entry_unused(&oinfo->dqi_gi, dp)) 117 if (qtree_entry_unused(&oinfo->dqi_gi, dp))
118 return 0; 118 return 0;
119 return le32_to_cpu(d->dqb_id) == dquot->dq_id; 119
120 return qid_eq(make_kqid(&init_user_ns, dquot->dq_id.type,
121 le32_to_cpu(d->dqb_id)),
122 dquot->dq_id);
120} 123}
121 124
122struct qtree_fmt_operations ocfs2_global_ops = { 125struct qtree_fmt_operations ocfs2_global_ops = {
@@ -475,7 +478,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
475{ 478{
476 int err, err2; 479 int err, err2;
477 struct super_block *sb = dquot->dq_sb; 480 struct super_block *sb = dquot->dq_sb;
478 int type = dquot->dq_type; 481 int type = dquot->dq_id.type;
479 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; 482 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
480 struct ocfs2_global_disk_dqblk dqblk; 483 struct ocfs2_global_disk_dqblk dqblk;
481 s64 spacechange, inodechange; 484 s64 spacechange, inodechange;
@@ -504,7 +507,8 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
504 olditime = dquot->dq_dqb.dqb_itime; 507 olditime = dquot->dq_dqb.dqb_itime;
505 oldbtime = dquot->dq_dqb.dqb_btime; 508 oldbtime = dquot->dq_dqb.dqb_btime;
506 ocfs2_global_disk2memdqb(dquot, &dqblk); 509 ocfs2_global_disk2memdqb(dquot, &dqblk);
507 trace_ocfs2_sync_dquot(dquot->dq_id, dquot->dq_dqb.dqb_curspace, 510 trace_ocfs2_sync_dquot(from_kqid(&init_user_ns, dquot->dq_id),
511 dquot->dq_dqb.dqb_curspace,
508 (long long)spacechange, 512 (long long)spacechange,
509 dquot->dq_dqb.dqb_curinodes, 513 dquot->dq_dqb.dqb_curinodes,
510 (long long)inodechange); 514 (long long)inodechange);
@@ -555,8 +559,8 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
555 err = ocfs2_qinfo_lock(info, freeing); 559 err = ocfs2_qinfo_lock(info, freeing);
556 if (err < 0) { 560 if (err < 0) {
557 mlog(ML_ERROR, "Failed to lock quota info, losing quota write" 561 mlog(ML_ERROR, "Failed to lock quota info, losing quota write"
558 " (type=%d, id=%u)\n", dquot->dq_type, 562 " (type=%d, id=%u)\n", dquot->dq_id.type,
559 (unsigned)dquot->dq_id); 563 (unsigned)from_kqid(&init_user_ns, dquot->dq_id));
560 goto out; 564 goto out;
561 } 565 }
562 if (freeing) 566 if (freeing)
@@ -591,9 +595,10 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
591 struct ocfs2_super *osb = OCFS2_SB(sb); 595 struct ocfs2_super *osb = OCFS2_SB(sb);
592 int status = 0; 596 int status = 0;
593 597
594 trace_ocfs2_sync_dquot_helper(dquot->dq_id, dquot->dq_type, 598 trace_ocfs2_sync_dquot_helper(from_kqid(&init_user_ns, dquot->dq_id),
599 dquot->dq_id.type,
595 type, sb->s_id); 600 type, sb->s_id);
596 if (type != dquot->dq_type) 601 if (type != dquot->dq_id.type)
597 goto out; 602 goto out;
598 status = ocfs2_lock_global_qf(oinfo, 1); 603 status = ocfs2_lock_global_qf(oinfo, 1);
599 if (status < 0) 604 if (status < 0)
@@ -643,7 +648,8 @@ static int ocfs2_write_dquot(struct dquot *dquot)
643 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); 648 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
644 int status = 0; 649 int status = 0;
645 650
646 trace_ocfs2_write_dquot(dquot->dq_id, dquot->dq_type); 651 trace_ocfs2_write_dquot(from_kqid(&init_user_ns, dquot->dq_id),
652 dquot->dq_id.type);
647 653
648 handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS); 654 handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
649 if (IS_ERR(handle)) { 655 if (IS_ERR(handle)) {
@@ -677,11 +683,12 @@ static int ocfs2_release_dquot(struct dquot *dquot)
677{ 683{
678 handle_t *handle; 684 handle_t *handle;
679 struct ocfs2_mem_dqinfo *oinfo = 685 struct ocfs2_mem_dqinfo *oinfo =
680 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; 686 sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv;
681 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); 687 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
682 int status = 0; 688 int status = 0;
683 689
684 trace_ocfs2_release_dquot(dquot->dq_id, dquot->dq_type); 690 trace_ocfs2_release_dquot(from_kqid(&init_user_ns, dquot->dq_id),
691 dquot->dq_id.type);
685 692
686 mutex_lock(&dquot->dq_lock); 693 mutex_lock(&dquot->dq_lock);
687 /* Check whether we are not racing with some other dqget() */ 694 /* Check whether we are not racing with some other dqget() */
@@ -691,7 +698,7 @@ static int ocfs2_release_dquot(struct dquot *dquot)
691 if (status < 0) 698 if (status < 0)
692 goto out; 699 goto out;
693 handle = ocfs2_start_trans(osb, 700 handle = ocfs2_start_trans(osb,
694 ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_type)); 701 ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_id.type));
695 if (IS_ERR(handle)) { 702 if (IS_ERR(handle)) {
696 status = PTR_ERR(handle); 703 status = PTR_ERR(handle);
697 mlog_errno(status); 704 mlog_errno(status);
@@ -733,13 +740,14 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
733 int ex = 0; 740 int ex = 0;
734 struct super_block *sb = dquot->dq_sb; 741 struct super_block *sb = dquot->dq_sb;
735 struct ocfs2_super *osb = OCFS2_SB(sb); 742 struct ocfs2_super *osb = OCFS2_SB(sb);
736 int type = dquot->dq_type; 743 int type = dquot->dq_id.type;
737 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; 744 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
738 struct inode *gqinode = info->dqi_gqinode; 745 struct inode *gqinode = info->dqi_gqinode;
739 int need_alloc = ocfs2_global_qinit_alloc(sb, type); 746 int need_alloc = ocfs2_global_qinit_alloc(sb, type);
740 handle_t *handle; 747 handle_t *handle;
741 748
742 trace_ocfs2_acquire_dquot(dquot->dq_id, type); 749 trace_ocfs2_acquire_dquot(from_kqid(&init_user_ns, dquot->dq_id),
750 type);
743 mutex_lock(&dquot->dq_lock); 751 mutex_lock(&dquot->dq_lock);
744 /* 752 /*
745 * We need an exclusive lock, because we're going to update use count 753 * We need an exclusive lock, because we're going to update use count
@@ -821,12 +829,13 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
821 int sync = 0; 829 int sync = 0;
822 int status; 830 int status;
823 struct super_block *sb = dquot->dq_sb; 831 struct super_block *sb = dquot->dq_sb;
824 int type = dquot->dq_type; 832 int type = dquot->dq_id.type;
825 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; 833 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
826 handle_t *handle; 834 handle_t *handle;
827 struct ocfs2_super *osb = OCFS2_SB(sb); 835 struct ocfs2_super *osb = OCFS2_SB(sb);
828 836
829 trace_ocfs2_mark_dquot_dirty(dquot->dq_id, type); 837 trace_ocfs2_mark_dquot_dirty(from_kqid(&init_user_ns, dquot->dq_id),
838 type);
830 839
831 /* In case user set some limits, sync dquot immediately to global 840 /* In case user set some limits, sync dquot immediately to global
832 * quota file so that information propagates quicker */ 841 * quota file so that information propagates quicker */
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index f100bf70a906..27fe7ee4874c 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -501,7 +501,9 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
501 } 501 }
502 dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data + 502 dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data +
503 ol_dqblk_block_off(sb, chunk, bit)); 503 ol_dqblk_block_off(sb, chunk, bit));
504 dquot = dqget(sb, le64_to_cpu(dqblk->dqb_id), type); 504 dquot = dqget(sb,
505 make_kqid(&init_user_ns, type,
506 le64_to_cpu(dqblk->dqb_id)));
505 if (!dquot) { 507 if (!dquot) {
506 status = -EIO; 508 status = -EIO;
507 mlog(ML_ERROR, "Failed to get quota structure " 509 mlog(ML_ERROR, "Failed to get quota structure "
@@ -881,7 +883,8 @@ static void olq_set_dquot(struct buffer_head *bh, void *private)
881 dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data 883 dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data
882 + ol_dqblk_block_offset(sb, od->dq_local_off)); 884 + ol_dqblk_block_offset(sb, od->dq_local_off));
883 885
884 dqblk->dqb_id = cpu_to_le64(od->dq_dquot.dq_id); 886 dqblk->dqb_id = cpu_to_le64(from_kqid(&init_user_ns,
887 od->dq_dquot.dq_id));
885 spin_lock(&dq_data_lock); 888 spin_lock(&dq_data_lock);
886 dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace - 889 dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace -
887 od->dq_origspace); 890 od->dq_origspace);
@@ -891,7 +894,7 @@ static void olq_set_dquot(struct buffer_head *bh, void *private)
891 trace_olq_set_dquot( 894 trace_olq_set_dquot(
892 (unsigned long long)le64_to_cpu(dqblk->dqb_spacemod), 895 (unsigned long long)le64_to_cpu(dqblk->dqb_spacemod),
893 (unsigned long long)le64_to_cpu(dqblk->dqb_inodemod), 896 (unsigned long long)le64_to_cpu(dqblk->dqb_inodemod),
894 od->dq_dquot.dq_id); 897 from_kqid(&init_user_ns, od->dq_dquot.dq_id));
895} 898}
896 899
897/* Write dquot to local quota file */ 900/* Write dquot to local quota file */
@@ -900,7 +903,7 @@ int ocfs2_local_write_dquot(struct dquot *dquot)
900 struct super_block *sb = dquot->dq_sb; 903 struct super_block *sb = dquot->dq_sb;
901 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); 904 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
902 struct buffer_head *bh; 905 struct buffer_head *bh;
903 struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_type]; 906 struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_id.type];
904 int status; 907 int status;
905 908
906 status = ocfs2_read_quota_phys_block(lqinode, od->dq_local_phys_blk, 909 status = ocfs2_read_quota_phys_block(lqinode, od->dq_local_phys_blk,
@@ -1221,7 +1224,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private)
1221int ocfs2_create_local_dquot(struct dquot *dquot) 1224int ocfs2_create_local_dquot(struct dquot *dquot)
1222{ 1225{
1223 struct super_block *sb = dquot->dq_sb; 1226 struct super_block *sb = dquot->dq_sb;
1224 int type = dquot->dq_type; 1227 int type = dquot->dq_id.type;
1225 struct inode *lqinode = sb_dqopt(sb)->files[type]; 1228 struct inode *lqinode = sb_dqopt(sb)->files[type];
1226 struct ocfs2_quota_chunk *chunk; 1229 struct ocfs2_quota_chunk *chunk;
1227 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); 1230 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
@@ -1275,7 +1278,7 @@ out:
1275int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot) 1278int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
1276{ 1279{
1277 int status; 1280 int status;
1278 int type = dquot->dq_type; 1281 int type = dquot->dq_id.type;
1279 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); 1282 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
1280 struct super_block *sb = dquot->dq_sb; 1283 struct super_block *sb = dquot->dq_sb;
1281 struct ocfs2_local_disk_chunk *dchunk; 1284 struct ocfs2_local_disk_chunk *dchunk;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 68f4541c2db9..0e91ec22a940 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1818,6 +1818,11 @@ static int ocfs2_initialize_mem_caches(void)
1818 1818
1819static void ocfs2_free_mem_caches(void) 1819static void ocfs2_free_mem_caches(void)
1820{ 1820{
1821 /*
1822 * Make sure all delayed rcu free inodes are flushed before we
1823 * destroy cache.
1824 */
1825 rcu_barrier();
1821 if (ocfs2_inode_cachep) 1826 if (ocfs2_inode_cachep)
1822 kmem_cache_destroy(ocfs2_inode_cachep); 1827 kmem_cache_destroy(ocfs2_inode_cachep);
1823 ocfs2_inode_cachep = NULL; 1828 ocfs2_inode_cachep = NULL;
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 2c6d95257a4d..77e3cb2962b4 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -146,8 +146,7 @@ static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe,
146 be64_to_cpu(entry->e_blocks); 146 be64_to_cpu(entry->e_blocks);
147 147
148 if (omfs_allocate_block(inode->i_sb, new_block)) { 148 if (omfs_allocate_block(inode->i_sb, new_block)) {
149 entry->e_blocks = 149 be64_add_cpu(&entry->e_blocks, 1);
150 cpu_to_be64(be64_to_cpu(entry->e_blocks) + 1);
151 terminator->e_blocks = ~(cpu_to_be64( 150 terminator->e_blocks = ~(cpu_to_be64(
152 be64_to_cpu(~terminator->e_blocks) + 1)); 151 be64_to_cpu(~terminator->e_blocks) + 1));
153 goto out; 152 goto out;
@@ -177,7 +176,7 @@ static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe,
177 be64_to_cpu(~terminator->e_blocks) + (u64) new_count)); 176 be64_to_cpu(~terminator->e_blocks) + (u64) new_count));
178 177
179 /* write in new entry */ 178 /* write in new entry */
180 oe->e_extent_count = cpu_to_be32(1 + be32_to_cpu(oe->e_extent_count)); 179 be32_add_cpu(&oe->e_extent_count, 1);
181 180
182out: 181out:
183 *ret_block = new_block; 182 *ret_block = new_block;
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index e6213b3725d1..25d715c7c87a 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -391,12 +391,16 @@ static int parse_options(char *options, struct omfs_sb_info *sbi)
391 case Opt_uid: 391 case Opt_uid:
392 if (match_int(&args[0], &option)) 392 if (match_int(&args[0], &option))
393 return 0; 393 return 0;
394 sbi->s_uid = option; 394 sbi->s_uid = make_kuid(current_user_ns(), option);
395 if (!uid_valid(sbi->s_uid))
396 return 0;
395 break; 397 break;
396 case Opt_gid: 398 case Opt_gid:
397 if (match_int(&args[0], &option)) 399 if (match_int(&args[0], &option))
398 return 0; 400 return 0;
399 sbi->s_gid = option; 401 sbi->s_gid = make_kgid(current_user_ns(), option);
402 if (!gid_valid(sbi->s_gid))
403 return 0;
400 break; 404 break;
401 case Opt_umask: 405 case Opt_umask:
402 if (match_octal(&args[0], &option)) 406 if (match_octal(&args[0], &option))
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h
index 8941f12c6b01..f0f8bc75e609 100644
--- a/fs/omfs/omfs.h
+++ b/fs/omfs/omfs.h
@@ -19,8 +19,8 @@ struct omfs_sb_info {
19 unsigned long **s_imap; 19 unsigned long **s_imap;
20 int s_imap_size; 20 int s_imap_size;
21 struct mutex s_bitmap_lock; 21 struct mutex s_bitmap_lock;
22 int s_uid; 22 kuid_t s_uid;
23 int s_gid; 23 kgid_t s_gid;
24 int s_dmask; 24 int s_dmask;
25 int s_fmask; 25 int s_fmask;
26}; 26};
diff --git a/fs/open.c b/fs/open.c
index e1f2cdb91a4d..59071f55bf7f 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -132,27 +132,27 @@ SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
132 132
133static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) 133static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
134{ 134{
135 struct inode * inode; 135 struct inode *inode;
136 struct dentry *dentry; 136 struct dentry *dentry;
137 struct file * file; 137 struct fd f;
138 int error; 138 int error;
139 139
140 error = -EINVAL; 140 error = -EINVAL;
141 if (length < 0) 141 if (length < 0)
142 goto out; 142 goto out;
143 error = -EBADF; 143 error = -EBADF;
144 file = fget(fd); 144 f = fdget(fd);
145 if (!file) 145 if (!f.file)
146 goto out; 146 goto out;
147 147
148 /* explicitly opened as large or we are on 64-bit box */ 148 /* explicitly opened as large or we are on 64-bit box */
149 if (file->f_flags & O_LARGEFILE) 149 if (f.file->f_flags & O_LARGEFILE)
150 small = 0; 150 small = 0;
151 151
152 dentry = file->f_path.dentry; 152 dentry = f.file->f_path.dentry;
153 inode = dentry->d_inode; 153 inode = dentry->d_inode;
154 error = -EINVAL; 154 error = -EINVAL;
155 if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE)) 155 if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE))
156 goto out_putf; 156 goto out_putf;
157 157
158 error = -EINVAL; 158 error = -EINVAL;
@@ -165,14 +165,14 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
165 goto out_putf; 165 goto out_putf;
166 166
167 sb_start_write(inode->i_sb); 167 sb_start_write(inode->i_sb);
168 error = locks_verify_truncate(inode, file, length); 168 error = locks_verify_truncate(inode, f.file, length);
169 if (!error) 169 if (!error)
170 error = security_path_truncate(&file->f_path); 170 error = security_path_truncate(&f.file->f_path);
171 if (!error) 171 if (!error)
172 error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); 172 error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
173 sb_end_write(inode->i_sb); 173 sb_end_write(inode->i_sb);
174out_putf: 174out_putf:
175 fput(file); 175 fdput(f);
176out: 176out:
177 return error; 177 return error;
178} 178}
@@ -276,15 +276,13 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
276 276
277SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) 277SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
278{ 278{
279 struct file *file; 279 struct fd f = fdget(fd);
280 int error = -EBADF; 280 int error = -EBADF;
281 281
282 file = fget(fd); 282 if (f.file) {
283 if (file) { 283 error = do_fallocate(f.file, mode, offset, len);
284 error = do_fallocate(file, mode, offset, len); 284 fdput(f);
285 fput(file);
286 } 285 }
287
288 return error; 286 return error;
289} 287}
290 288
@@ -400,16 +398,15 @@ out:
400 398
401SYSCALL_DEFINE1(fchdir, unsigned int, fd) 399SYSCALL_DEFINE1(fchdir, unsigned int, fd)
402{ 400{
403 struct file *file; 401 struct fd f = fdget_raw(fd);
404 struct inode *inode; 402 struct inode *inode;
405 int error, fput_needed; 403 int error = -EBADF;
406 404
407 error = -EBADF; 405 error = -EBADF;
408 file = fget_raw_light(fd, &fput_needed); 406 if (!f.file)
409 if (!file)
410 goto out; 407 goto out;
411 408
412 inode = file->f_path.dentry->d_inode; 409 inode = f.file->f_path.dentry->d_inode;
413 410
414 error = -ENOTDIR; 411 error = -ENOTDIR;
415 if (!S_ISDIR(inode->i_mode)) 412 if (!S_ISDIR(inode->i_mode))
@@ -417,9 +414,9 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
417 414
418 error = inode_permission(inode, MAY_EXEC | MAY_CHDIR); 415 error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);
419 if (!error) 416 if (!error)
420 set_fs_pwd(current->fs, &file->f_path); 417 set_fs_pwd(current->fs, &f.file->f_path);
421out_putf: 418out_putf:
422 fput_light(file, fput_needed); 419 fdput(f);
423out: 420out:
424 return error; 421 return error;
425} 422}
@@ -481,7 +478,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
481 478
482 file = fget(fd); 479 file = fget(fd);
483 if (file) { 480 if (file) {
484 audit_inode(NULL, file->f_path.dentry); 481 audit_inode(NULL, file->f_path.dentry, 0);
485 err = chmod_common(&file->f_path, mode); 482 err = chmod_common(&file->f_path, mode);
486 fput(file); 483 fput(file);
487 } 484 }
@@ -534,7 +531,7 @@ static int chown_common(struct path *path, uid_t user, gid_t group)
534 newattrs.ia_valid |= 531 newattrs.ia_valid |=
535 ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; 532 ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
536 mutex_lock(&inode->i_mutex); 533 mutex_lock(&inode->i_mutex);
537 error = security_path_chown(path, user, group); 534 error = security_path_chown(path, uid, gid);
538 if (!error) 535 if (!error)
539 error = notify_change(path->dentry, &newattrs); 536 error = notify_change(path->dentry, &newattrs);
540 mutex_unlock(&inode->i_mutex); 537 mutex_unlock(&inode->i_mutex);
@@ -582,23 +579,20 @@ SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group
582 579
583SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) 580SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
584{ 581{
585 struct file * file; 582 struct fd f = fdget(fd);
586 int error = -EBADF; 583 int error = -EBADF;
587 struct dentry * dentry;
588 584
589 file = fget(fd); 585 if (!f.file)
590 if (!file)
591 goto out; 586 goto out;
592 587
593 error = mnt_want_write_file(file); 588 error = mnt_want_write_file(f.file);
594 if (error) 589 if (error)
595 goto out_fput; 590 goto out_fput;
596 dentry = file->f_path.dentry; 591 audit_inode(NULL, f.file->f_path.dentry, 0);
597 audit_inode(NULL, dentry); 592 error = chown_common(&f.file->f_path, user, group);
598 error = chown_common(&file->f_path, user, group); 593 mnt_drop_write_file(f.file);
599 mnt_drop_write_file(file);
600out_fput: 594out_fput:
601 fput(file); 595 fdput(f);
602out: 596out:
603 return error; 597 return error;
604} 598}
@@ -803,50 +797,6 @@ struct file *dentry_open(const struct path *path, int flags,
803} 797}
804EXPORT_SYMBOL(dentry_open); 798EXPORT_SYMBOL(dentry_open);
805 799
806static void __put_unused_fd(struct files_struct *files, unsigned int fd)
807{
808 struct fdtable *fdt = files_fdtable(files);
809 __clear_open_fd(fd, fdt);
810 if (fd < files->next_fd)
811 files->next_fd = fd;
812}
813
814void put_unused_fd(unsigned int fd)
815{
816 struct files_struct *files = current->files;
817 spin_lock(&files->file_lock);
818 __put_unused_fd(files, fd);
819 spin_unlock(&files->file_lock);
820}
821
822EXPORT_SYMBOL(put_unused_fd);
823
824/*
825 * Install a file pointer in the fd array.
826 *
827 * The VFS is full of places where we drop the files lock between
828 * setting the open_fds bitmap and installing the file in the file
829 * array. At any such point, we are vulnerable to a dup2() race
830 * installing a file in the array before us. We need to detect this and
831 * fput() the struct file we are about to overwrite in this case.
832 *
833 * It should never happen - if we allow dup2() do it, _really_ bad things
834 * will follow.
835 */
836
837void fd_install(unsigned int fd, struct file *file)
838{
839 struct files_struct *files = current->files;
840 struct fdtable *fdt;
841 spin_lock(&files->file_lock);
842 fdt = files_fdtable(files);
843 BUG_ON(fdt->fd[fd] != NULL);
844 rcu_assign_pointer(fdt->fd[fd], file);
845 spin_unlock(&files->file_lock);
846}
847
848EXPORT_SYMBOL(fd_install);
849
850static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) 800static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
851{ 801{
852 int lookup_flags = 0; 802 int lookup_flags = 0;
@@ -858,7 +808,7 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
858 op->mode = 0; 808 op->mode = 0;
859 809
860 /* Must never be set by userspace */ 810 /* Must never be set by userspace */
861 flags &= ~FMODE_NONOTIFY; 811 flags &= ~FMODE_NONOTIFY & ~O_CLOEXEC;
862 812
863 /* 813 /*
864 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only 814 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
@@ -909,6 +859,24 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
909} 859}
910 860
911/** 861/**
862 * file_open_name - open file and return file pointer
863 *
864 * @name: struct filename containing path to open
865 * @flags: open flags as per the open(2) second argument
866 * @mode: mode for the new file if O_CREAT is set, else ignored
867 *
868 * This is the helper to open a file from kernelspace if you really
869 * have to. But in generally you should not do this, so please move
870 * along, nothing to see here..
871 */
872struct file *file_open_name(struct filename *name, int flags, umode_t mode)
873{
874 struct open_flags op;
875 int lookup = build_open_flags(flags, mode, &op);
876 return do_filp_open(AT_FDCWD, name, &op, lookup);
877}
878
879/**
912 * filp_open - open file and return file pointer 880 * filp_open - open file and return file pointer
913 * 881 *
914 * @filename: path to open 882 * @filename: path to open
@@ -921,9 +889,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
921 */ 889 */
922struct file *filp_open(const char *filename, int flags, umode_t mode) 890struct file *filp_open(const char *filename, int flags, umode_t mode)
923{ 891{
924 struct open_flags op; 892 struct filename name = {.name = filename};
925 int lookup = build_open_flags(flags, mode, &op); 893 return file_open_name(&name, flags, mode);
926 return do_filp_open(AT_FDCWD, filename, &op, lookup);
927} 894}
928EXPORT_SYMBOL(filp_open); 895EXPORT_SYMBOL(filp_open);
929 896
@@ -945,7 +912,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
945{ 912{
946 struct open_flags op; 913 struct open_flags op;
947 int lookup = build_open_flags(flags, mode, &op); 914 int lookup = build_open_flags(flags, mode, &op);
948 char *tmp = getname(filename); 915 struct filename *tmp = getname(filename);
949 int fd = PTR_ERR(tmp); 916 int fd = PTR_ERR(tmp);
950 917
951 if (!IS_ERR(tmp)) { 918 if (!IS_ERR(tmp)) {
@@ -1038,23 +1005,7 @@ EXPORT_SYMBOL(filp_close);
1038 */ 1005 */
1039SYSCALL_DEFINE1(close, unsigned int, fd) 1006SYSCALL_DEFINE1(close, unsigned int, fd)
1040{ 1007{
1041 struct file * filp; 1008 int retval = __close_fd(current->files, fd);
1042 struct files_struct *files = current->files;
1043 struct fdtable *fdt;
1044 int retval;
1045
1046 spin_lock(&files->file_lock);
1047 fdt = files_fdtable(files);
1048 if (fd >= fdt->max_fds)
1049 goto out_unlock;
1050 filp = fdt->fd[fd];
1051 if (!filp)
1052 goto out_unlock;
1053 rcu_assign_pointer(fdt->fd[fd], NULL);
1054 __clear_close_on_exec(fd, fdt);
1055 __put_unused_fd(files, fd);
1056 spin_unlock(&files->file_lock);
1057 retval = filp_close(filp, files);
1058 1009
1059 /* can't restart close syscall because file table entry was cleared */ 1010 /* can't restart close syscall because file table entry was cleared */
1060 if (unlikely(retval == -ERESTARTSYS || 1011 if (unlikely(retval == -ERESTARTSYS ||
@@ -1064,10 +1015,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
1064 retval = -EINTR; 1015 retval = -EINTR;
1065 1016
1066 return retval; 1017 return retval;
1067
1068out_unlock:
1069 spin_unlock(&files->file_lock);
1070 return -EBADF;
1071} 1018}
1072EXPORT_SYMBOL(sys_close); 1019EXPORT_SYMBOL(sys_close);
1073 1020
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 4a3477949bca..2ad080faca34 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -463,6 +463,11 @@ static int __init init_openprom_fs(void)
463static void __exit exit_openprom_fs(void) 463static void __exit exit_openprom_fs(void)
464{ 464{
465 unregister_filesystem(&openprom_fs_type); 465 unregister_filesystem(&openprom_fs_type);
466 /*
467 * Make sure all delayed rcu free inodes are flushed before we
468 * destroy cache.
469 */
470 rcu_barrier();
466 kmem_cache_destroy(op_inode_cachep); 471 kmem_cache_destroy(op_inode_cachep);
467} 472}
468 473
diff --git a/fs/pipe.c b/fs/pipe.c
index 8d85d7068c1e..bd3479db4b62 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1064,9 +1064,8 @@ err_inode:
1064 return err; 1064 return err;
1065} 1065}
1066 1066
1067int do_pipe_flags(int *fd, int flags) 1067static int __do_pipe_flags(int *fd, struct file **files, int flags)
1068{ 1068{
1069 struct file *files[2];
1070 int error; 1069 int error;
1071 int fdw, fdr; 1070 int fdw, fdr;
1072 1071
@@ -1088,11 +1087,8 @@ int do_pipe_flags(int *fd, int flags)
1088 fdw = error; 1087 fdw = error;
1089 1088
1090 audit_fd_pair(fdr, fdw); 1089 audit_fd_pair(fdr, fdw);
1091 fd_install(fdr, files[0]);
1092 fd_install(fdw, files[1]);
1093 fd[0] = fdr; 1090 fd[0] = fdr;
1094 fd[1] = fdw; 1091 fd[1] = fdw;
1095
1096 return 0; 1092 return 0;
1097 1093
1098 err_fdr: 1094 err_fdr:
@@ -1103,21 +1099,38 @@ int do_pipe_flags(int *fd, int flags)
1103 return error; 1099 return error;
1104} 1100}
1105 1101
1102int do_pipe_flags(int *fd, int flags)
1103{
1104 struct file *files[2];
1105 int error = __do_pipe_flags(fd, files, flags);
1106 if (!error) {
1107 fd_install(fd[0], files[0]);
1108 fd_install(fd[1], files[1]);
1109 }
1110 return error;
1111}
1112
1106/* 1113/*
1107 * sys_pipe() is the normal C calling standard for creating 1114 * sys_pipe() is the normal C calling standard for creating
1108 * a pipe. It's not the way Unix traditionally does this, though. 1115 * a pipe. It's not the way Unix traditionally does this, though.
1109 */ 1116 */
1110SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) 1117SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
1111{ 1118{
1119 struct file *files[2];
1112 int fd[2]; 1120 int fd[2];
1113 int error; 1121 int error;
1114 1122
1115 error = do_pipe_flags(fd, flags); 1123 error = __do_pipe_flags(fd, files, flags);
1116 if (!error) { 1124 if (!error) {
1117 if (copy_to_user(fildes, fd, sizeof(fd))) { 1125 if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
1118 sys_close(fd[0]); 1126 fput(files[0]);
1119 sys_close(fd[1]); 1127 fput(files[1]);
1128 put_unused_fd(fd[0]);
1129 put_unused_fd(fd[1]);
1120 error = -EFAULT; 1130 error = -EFAULT;
1131 } else {
1132 fd_install(fd[0], files[0]);
1133 fd_install(fd[1], files[1]);
1121 } 1134 }
1122 } 1135 }
1123 return error; 1136 return error;
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 5e325a42e33d..8bd2135b7f82 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -78,7 +78,8 @@ posix_acl_valid(const struct posix_acl *acl)
78{ 78{
79 const struct posix_acl_entry *pa, *pe; 79 const struct posix_acl_entry *pa, *pe;
80 int state = ACL_USER_OBJ; 80 int state = ACL_USER_OBJ;
81 unsigned int id = 0; /* keep gcc happy */ 81 kuid_t prev_uid = INVALID_UID;
82 kgid_t prev_gid = INVALID_GID;
82 int needs_mask = 0; 83 int needs_mask = 0;
83 84
84 FOREACH_ACL_ENTRY(pa, acl, pe) { 85 FOREACH_ACL_ENTRY(pa, acl, pe) {
@@ -87,7 +88,6 @@ posix_acl_valid(const struct posix_acl *acl)
87 switch (pa->e_tag) { 88 switch (pa->e_tag) {
88 case ACL_USER_OBJ: 89 case ACL_USER_OBJ:
89 if (state == ACL_USER_OBJ) { 90 if (state == ACL_USER_OBJ) {
90 id = 0;
91 state = ACL_USER; 91 state = ACL_USER;
92 break; 92 break;
93 } 93 }
@@ -96,16 +96,17 @@ posix_acl_valid(const struct posix_acl *acl)
96 case ACL_USER: 96 case ACL_USER:
97 if (state != ACL_USER) 97 if (state != ACL_USER)
98 return -EINVAL; 98 return -EINVAL;
99 if (pa->e_id == ACL_UNDEFINED_ID || 99 if (!uid_valid(pa->e_uid))
100 pa->e_id < id)
101 return -EINVAL; 100 return -EINVAL;
102 id = pa->e_id + 1; 101 if (uid_valid(prev_uid) &&
102 uid_lte(pa->e_uid, prev_uid))
103 return -EINVAL;
104 prev_uid = pa->e_uid;
103 needs_mask = 1; 105 needs_mask = 1;
104 break; 106 break;
105 107
106 case ACL_GROUP_OBJ: 108 case ACL_GROUP_OBJ:
107 if (state == ACL_USER) { 109 if (state == ACL_USER) {
108 id = 0;
109 state = ACL_GROUP; 110 state = ACL_GROUP;
110 break; 111 break;
111 } 112 }
@@ -114,10 +115,12 @@ posix_acl_valid(const struct posix_acl *acl)
114 case ACL_GROUP: 115 case ACL_GROUP:
115 if (state != ACL_GROUP) 116 if (state != ACL_GROUP)
116 return -EINVAL; 117 return -EINVAL;
117 if (pa->e_id == ACL_UNDEFINED_ID || 118 if (!gid_valid(pa->e_gid))
118 pa->e_id < id) 119 return -EINVAL;
120 if (gid_valid(prev_gid) &&
121 gid_lte(pa->e_gid, prev_gid))
119 return -EINVAL; 122 return -EINVAL;
120 id = pa->e_id + 1; 123 prev_gid = pa->e_gid;
121 needs_mask = 1; 124 needs_mask = 1;
122 break; 125 break;
123 126
@@ -195,15 +198,12 @@ posix_acl_from_mode(umode_t mode, gfp_t flags)
195 return ERR_PTR(-ENOMEM); 198 return ERR_PTR(-ENOMEM);
196 199
197 acl->a_entries[0].e_tag = ACL_USER_OBJ; 200 acl->a_entries[0].e_tag = ACL_USER_OBJ;
198 acl->a_entries[0].e_id = ACL_UNDEFINED_ID;
199 acl->a_entries[0].e_perm = (mode & S_IRWXU) >> 6; 201 acl->a_entries[0].e_perm = (mode & S_IRWXU) >> 6;
200 202
201 acl->a_entries[1].e_tag = ACL_GROUP_OBJ; 203 acl->a_entries[1].e_tag = ACL_GROUP_OBJ;
202 acl->a_entries[1].e_id = ACL_UNDEFINED_ID;
203 acl->a_entries[1].e_perm = (mode & S_IRWXG) >> 3; 204 acl->a_entries[1].e_perm = (mode & S_IRWXG) >> 3;
204 205
205 acl->a_entries[2].e_tag = ACL_OTHER; 206 acl->a_entries[2].e_tag = ACL_OTHER;
206 acl->a_entries[2].e_id = ACL_UNDEFINED_ID;
207 acl->a_entries[2].e_perm = (mode & S_IRWXO); 207 acl->a_entries[2].e_perm = (mode & S_IRWXO);
208 return acl; 208 return acl;
209} 209}
@@ -224,11 +224,11 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want)
224 switch(pa->e_tag) { 224 switch(pa->e_tag) {
225 case ACL_USER_OBJ: 225 case ACL_USER_OBJ:
226 /* (May have been checked already) */ 226 /* (May have been checked already) */
227 if (inode->i_uid == current_fsuid()) 227 if (uid_eq(inode->i_uid, current_fsuid()))
228 goto check_perm; 228 goto check_perm;
229 break; 229 break;
230 case ACL_USER: 230 case ACL_USER:
231 if (pa->e_id == current_fsuid()) 231 if (uid_eq(pa->e_uid, current_fsuid()))
232 goto mask; 232 goto mask;
233 break; 233 break;
234 case ACL_GROUP_OBJ: 234 case ACL_GROUP_OBJ:
@@ -239,7 +239,7 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want)
239 } 239 }
240 break; 240 break;
241 case ACL_GROUP: 241 case ACL_GROUP:
242 if (in_group_p(pa->e_id)) { 242 if (in_group_p(pa->e_gid)) {
243 found = 1; 243 found = 1;
244 if ((pa->e_perm & want) == want) 244 if ((pa->e_perm & want) == want)
245 goto mask; 245 goto mask;
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index c1c729335924..99349efbbc2b 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -8,7 +8,7 @@ proc-y := nommu.o task_nommu.o
8proc-$(CONFIG_MMU) := mmu.o task_mmu.o 8proc-$(CONFIG_MMU) := mmu.o task_mmu.o
9 9
10proc-y += inode.o root.o base.o generic.o array.o \ 10proc-y += inode.o root.o base.o generic.o array.o \
11 proc_tty.o 11 proc_tty.o fd.o
12proc-y += cmdline.o 12proc-y += cmdline.o
13proc-y += consoles.o 13proc-y += consoles.o
14proc-y += cpuinfo.o 14proc-y += cpuinfo.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1b6c84cbdb73..144a96732dd7 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -90,6 +90,7 @@
90#endif 90#endif
91#include <trace/events/oom.h> 91#include <trace/events/oom.h>
92#include "internal.h" 92#include "internal.h"
93#include "fd.h"
93 94
94/* NOTE: 95/* NOTE:
95 * Implementing inode permission operations in /proc is almost 96 * Implementing inode permission operations in /proc is almost
@@ -136,8 +137,6 @@ struct pid_entry {
136 NULL, &proc_single_file_operations, \ 137 NULL, &proc_single_file_operations, \
137 { .proc_show = show } ) 138 { .proc_show = show } )
138 139
139static int proc_fd_permission(struct inode *inode, int mask);
140
141/* 140/*
142 * Count the number of hardlinks for the pid_entry table, excluding the . 141 * Count the number of hardlinks for the pid_entry table, excluding the .
143 * and .. links. 142 * and .. links.
@@ -874,111 +873,6 @@ static const struct file_operations proc_environ_operations = {
874 .release = mem_release, 873 .release = mem_release,
875}; 874};
876 875
877static ssize_t oom_adjust_read(struct file *file, char __user *buf,
878 size_t count, loff_t *ppos)
879{
880 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
881 char buffer[PROC_NUMBUF];
882 size_t len;
883 int oom_adjust = OOM_DISABLE;
884 unsigned long flags;
885
886 if (!task)
887 return -ESRCH;
888
889 if (lock_task_sighand(task, &flags)) {
890 oom_adjust = task->signal->oom_adj;
891 unlock_task_sighand(task, &flags);
892 }
893
894 put_task_struct(task);
895
896 len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
897
898 return simple_read_from_buffer(buf, count, ppos, buffer, len);
899}
900
901static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
902 size_t count, loff_t *ppos)
903{
904 struct task_struct *task;
905 char buffer[PROC_NUMBUF];
906 int oom_adjust;
907 unsigned long flags;
908 int err;
909
910 memset(buffer, 0, sizeof(buffer));
911 if (count > sizeof(buffer) - 1)
912 count = sizeof(buffer) - 1;
913 if (copy_from_user(buffer, buf, count)) {
914 err = -EFAULT;
915 goto out;
916 }
917
918 err = kstrtoint(strstrip(buffer), 0, &oom_adjust);
919 if (err)
920 goto out;
921 if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
922 oom_adjust != OOM_DISABLE) {
923 err = -EINVAL;
924 goto out;
925 }
926
927 task = get_proc_task(file->f_path.dentry->d_inode);
928 if (!task) {
929 err = -ESRCH;
930 goto out;
931 }
932
933 task_lock(task);
934 if (!task->mm) {
935 err = -EINVAL;
936 goto err_task_lock;
937 }
938
939 if (!lock_task_sighand(task, &flags)) {
940 err = -ESRCH;
941 goto err_task_lock;
942 }
943
944 if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
945 err = -EACCES;
946 goto err_sighand;
947 }
948
949 /*
950 * Warn that /proc/pid/oom_adj is deprecated, see
951 * Documentation/feature-removal-schedule.txt.
952 */
953 printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
954 current->comm, task_pid_nr(current), task_pid_nr(task),
955 task_pid_nr(task));
956 task->signal->oom_adj = oom_adjust;
957 /*
958 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
959 * value is always attainable.
960 */
961 if (task->signal->oom_adj == OOM_ADJUST_MAX)
962 task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX;
963 else
964 task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
965 -OOM_DISABLE;
966 trace_oom_score_adj_update(task);
967err_sighand:
968 unlock_task_sighand(task, &flags);
969err_task_lock:
970 task_unlock(task);
971 put_task_struct(task);
972out:
973 return err < 0 ? err : count;
974}
975
976static const struct file_operations proc_oom_adjust_operations = {
977 .read = oom_adjust_read,
978 .write = oom_adjust_write,
979 .llseek = generic_file_llseek,
980};
981
982static ssize_t oom_score_adj_read(struct file *file, char __user *buf, 876static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
983 size_t count, loff_t *ppos) 877 size_t count, loff_t *ppos)
984{ 878{
@@ -1052,15 +946,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1052 if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) 946 if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
1053 task->signal->oom_score_adj_min = oom_score_adj; 947 task->signal->oom_score_adj_min = oom_score_adj;
1054 trace_oom_score_adj_update(task); 948 trace_oom_score_adj_update(task);
1055 /* 949
1056 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
1057 * always attainable.
1058 */
1059 if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1060 task->signal->oom_adj = OOM_DISABLE;
1061 else
1062 task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
1063 OOM_SCORE_ADJ_MAX;
1064err_sighand: 950err_sighand:
1065 unlock_task_sighand(task, &flags); 951 unlock_task_sighand(task, &flags);
1066err_task_lock: 952err_task_lock:
@@ -1089,7 +975,8 @@ static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
1089 if (!task) 975 if (!task)
1090 return -ESRCH; 976 return -ESRCH;
1091 length = scnprintf(tmpbuf, TMPBUFLEN, "%u", 977 length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
1092 audit_get_loginuid(task)); 978 from_kuid(file->f_cred->user_ns,
979 audit_get_loginuid(task)));
1093 put_task_struct(task); 980 put_task_struct(task);
1094 return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); 981 return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1095} 982}
@@ -1101,6 +988,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1101 char *page, *tmp; 988 char *page, *tmp;
1102 ssize_t length; 989 ssize_t length;
1103 uid_t loginuid; 990 uid_t loginuid;
991 kuid_t kloginuid;
1104 992
1105 rcu_read_lock(); 993 rcu_read_lock();
1106 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { 994 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
@@ -1130,7 +1018,13 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1130 goto out_free_page; 1018 goto out_free_page;
1131 1019
1132 } 1020 }
1133 length = audit_set_loginuid(loginuid); 1021 kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
1022 if (!uid_valid(kloginuid)) {
1023 length = -EINVAL;
1024 goto out_free_page;
1025 }
1026
1027 length = audit_set_loginuid(kloginuid);
1134 if (likely(length == 0)) 1028 if (likely(length == 0))
1135 length = count; 1029 length = count;
1136 1030
@@ -1492,7 +1386,7 @@ out:
1492 return error; 1386 return error;
1493} 1387}
1494 1388
1495static const struct inode_operations proc_pid_link_inode_operations = { 1389const struct inode_operations proc_pid_link_inode_operations = {
1496 .readlink = proc_pid_readlink, 1390 .readlink = proc_pid_readlink,
1497 .follow_link = proc_pid_follow_link, 1391 .follow_link = proc_pid_follow_link,
1498 .setattr = proc_setattr, 1392 .setattr = proc_setattr,
@@ -1501,21 +1395,6 @@ static const struct inode_operations proc_pid_link_inode_operations = {
1501 1395
1502/* building an inode */ 1396/* building an inode */
1503 1397
1504static int task_dumpable(struct task_struct *task)
1505{
1506 int dumpable = 0;
1507 struct mm_struct *mm;
1508
1509 task_lock(task);
1510 mm = task->mm;
1511 if (mm)
1512 dumpable = get_dumpable(mm);
1513 task_unlock(task);
1514 if(dumpable == 1)
1515 return 1;
1516 return 0;
1517}
1518
1519struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) 1398struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
1520{ 1399{
1521 struct inode * inode; 1400 struct inode * inode;
@@ -1641,15 +1520,6 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
1641 return 0; 1520 return 0;
1642} 1521}
1643 1522
1644static int pid_delete_dentry(const struct dentry * dentry)
1645{
1646 /* Is the task we represent dead?
1647 * If so, then don't put the dentry on the lru list,
1648 * kill it immediately.
1649 */
1650 return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
1651}
1652
1653const struct dentry_operations pid_dentry_operations = 1523const struct dentry_operations pid_dentry_operations =
1654{ 1524{
1655 .d_revalidate = pid_revalidate, 1525 .d_revalidate = pid_revalidate,
@@ -1712,289 +1582,6 @@ end_instantiate:
1712 return filldir(dirent, name, len, filp->f_pos, ino, type); 1582 return filldir(dirent, name, len, filp->f_pos, ino, type);
1713} 1583}
1714 1584
1715static unsigned name_to_int(struct dentry *dentry)
1716{
1717 const char *name = dentry->d_name.name;
1718 int len = dentry->d_name.len;
1719 unsigned n = 0;
1720
1721 if (len > 1 && *name == '0')
1722 goto out;
1723 while (len-- > 0) {
1724 unsigned c = *name++ - '0';
1725 if (c > 9)
1726 goto out;
1727 if (n >= (~0U-9)/10)
1728 goto out;
1729 n *= 10;
1730 n += c;
1731 }
1732 return n;
1733out:
1734 return ~0U;
1735}
1736
1737#define PROC_FDINFO_MAX 64
1738
1739static int proc_fd_info(struct inode *inode, struct path *path, char *info)
1740{
1741 struct task_struct *task = get_proc_task(inode);
1742 struct files_struct *files = NULL;
1743 struct file *file;
1744 int fd = proc_fd(inode);
1745
1746 if (task) {
1747 files = get_files_struct(task);
1748 put_task_struct(task);
1749 }
1750 if (files) {
1751 /*
1752 * We are not taking a ref to the file structure, so we must
1753 * hold ->file_lock.
1754 */
1755 spin_lock(&files->file_lock);
1756 file = fcheck_files(files, fd);
1757 if (file) {
1758 unsigned int f_flags;
1759 struct fdtable *fdt;
1760
1761 fdt = files_fdtable(files);
1762 f_flags = file->f_flags & ~O_CLOEXEC;
1763 if (close_on_exec(fd, fdt))
1764 f_flags |= O_CLOEXEC;
1765
1766 if (path) {
1767 *path = file->f_path;
1768 path_get(&file->f_path);
1769 }
1770 if (info)
1771 snprintf(info, PROC_FDINFO_MAX,
1772 "pos:\t%lli\n"
1773 "flags:\t0%o\n",
1774 (long long) file->f_pos,
1775 f_flags);
1776 spin_unlock(&files->file_lock);
1777 put_files_struct(files);
1778 return 0;
1779 }
1780 spin_unlock(&files->file_lock);
1781 put_files_struct(files);
1782 }
1783 return -ENOENT;
1784}
1785
1786static int proc_fd_link(struct dentry *dentry, struct path *path)
1787{
1788 return proc_fd_info(dentry->d_inode, path, NULL);
1789}
1790
1791static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
1792{
1793 struct inode *inode;
1794 struct task_struct *task;
1795 int fd;
1796 struct files_struct *files;
1797 const struct cred *cred;
1798
1799 if (flags & LOOKUP_RCU)
1800 return -ECHILD;
1801
1802 inode = dentry->d_inode;
1803 task = get_proc_task(inode);
1804 fd = proc_fd(inode);
1805
1806 if (task) {
1807 files = get_files_struct(task);
1808 if (files) {
1809 struct file *file;
1810 rcu_read_lock();
1811 file = fcheck_files(files, fd);
1812 if (file) {
1813 unsigned f_mode = file->f_mode;
1814
1815 rcu_read_unlock();
1816 put_files_struct(files);
1817
1818 if (task_dumpable(task)) {
1819 rcu_read_lock();
1820 cred = __task_cred(task);
1821 inode->i_uid = cred->euid;
1822 inode->i_gid = cred->egid;
1823 rcu_read_unlock();
1824 } else {
1825 inode->i_uid = GLOBAL_ROOT_UID;
1826 inode->i_gid = GLOBAL_ROOT_GID;
1827 }
1828
1829 if (S_ISLNK(inode->i_mode)) {
1830 unsigned i_mode = S_IFLNK;
1831 if (f_mode & FMODE_READ)
1832 i_mode |= S_IRUSR | S_IXUSR;
1833 if (f_mode & FMODE_WRITE)
1834 i_mode |= S_IWUSR | S_IXUSR;
1835 inode->i_mode = i_mode;
1836 }
1837
1838 security_task_to_inode(task, inode);
1839 put_task_struct(task);
1840 return 1;
1841 }
1842 rcu_read_unlock();
1843 put_files_struct(files);
1844 }
1845 put_task_struct(task);
1846 }
1847 d_drop(dentry);
1848 return 0;
1849}
1850
1851static const struct dentry_operations tid_fd_dentry_operations =
1852{
1853 .d_revalidate = tid_fd_revalidate,
1854 .d_delete = pid_delete_dentry,
1855};
1856
1857static struct dentry *proc_fd_instantiate(struct inode *dir,
1858 struct dentry *dentry, struct task_struct *task, const void *ptr)
1859{
1860 unsigned fd = (unsigned long)ptr;
1861 struct inode *inode;
1862 struct proc_inode *ei;
1863 struct dentry *error = ERR_PTR(-ENOENT);
1864
1865 inode = proc_pid_make_inode(dir->i_sb, task);
1866 if (!inode)
1867 goto out;
1868 ei = PROC_I(inode);
1869 ei->fd = fd;
1870
1871 inode->i_mode = S_IFLNK;
1872 inode->i_op = &proc_pid_link_inode_operations;
1873 inode->i_size = 64;
1874 ei->op.proc_get_link = proc_fd_link;
1875 d_set_d_op(dentry, &tid_fd_dentry_operations);
1876 d_add(dentry, inode);
1877 /* Close the race of the process dying before we return the dentry */
1878 if (tid_fd_revalidate(dentry, 0))
1879 error = NULL;
1880
1881 out:
1882 return error;
1883}
1884
1885static struct dentry *proc_lookupfd_common(struct inode *dir,
1886 struct dentry *dentry,
1887 instantiate_t instantiate)
1888{
1889 struct task_struct *task = get_proc_task(dir);
1890 unsigned fd = name_to_int(dentry);
1891 struct dentry *result = ERR_PTR(-ENOENT);
1892
1893 if (!task)
1894 goto out_no_task;
1895 if (fd == ~0U)
1896 goto out;
1897
1898 result = instantiate(dir, dentry, task, (void *)(unsigned long)fd);
1899out:
1900 put_task_struct(task);
1901out_no_task:
1902 return result;
1903}
1904
1905static int proc_readfd_common(struct file * filp, void * dirent,
1906 filldir_t filldir, instantiate_t instantiate)
1907{
1908 struct dentry *dentry = filp->f_path.dentry;
1909 struct inode *inode = dentry->d_inode;
1910 struct task_struct *p = get_proc_task(inode);
1911 unsigned int fd, ino;
1912 int retval;
1913 struct files_struct * files;
1914
1915 retval = -ENOENT;
1916 if (!p)
1917 goto out_no_task;
1918 retval = 0;
1919
1920 fd = filp->f_pos;
1921 switch (fd) {
1922 case 0:
1923 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
1924 goto out;
1925 filp->f_pos++;
1926 case 1:
1927 ino = parent_ino(dentry);
1928 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
1929 goto out;
1930 filp->f_pos++;
1931 default:
1932 files = get_files_struct(p);
1933 if (!files)
1934 goto out;
1935 rcu_read_lock();
1936 for (fd = filp->f_pos-2;
1937 fd < files_fdtable(files)->max_fds;
1938 fd++, filp->f_pos++) {
1939 char name[PROC_NUMBUF];
1940 int len;
1941 int rv;
1942
1943 if (!fcheck_files(files, fd))
1944 continue;
1945 rcu_read_unlock();
1946
1947 len = snprintf(name, sizeof(name), "%d", fd);
1948 rv = proc_fill_cache(filp, dirent, filldir,
1949 name, len, instantiate, p,
1950 (void *)(unsigned long)fd);
1951 if (rv < 0)
1952 goto out_fd_loop;
1953 rcu_read_lock();
1954 }
1955 rcu_read_unlock();
1956out_fd_loop:
1957 put_files_struct(files);
1958 }
1959out:
1960 put_task_struct(p);
1961out_no_task:
1962 return retval;
1963}
1964
1965static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
1966 unsigned int flags)
1967{
1968 return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
1969}
1970
1971static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
1972{
1973 return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
1974}
1975
1976static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
1977 size_t len, loff_t *ppos)
1978{
1979 char tmp[PROC_FDINFO_MAX];
1980 int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp);
1981 if (!err)
1982 err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp));
1983 return err;
1984}
1985
1986static const struct file_operations proc_fdinfo_file_operations = {
1987 .open = nonseekable_open,
1988 .read = proc_fdinfo_read,
1989 .llseek = no_llseek,
1990};
1991
1992static const struct file_operations proc_fd_operations = {
1993 .read = generic_read_dir,
1994 .readdir = proc_readfd,
1995 .llseek = default_llseek,
1996};
1997
1998#ifdef CONFIG_CHECKPOINT_RESTORE 1585#ifdef CONFIG_CHECKPOINT_RESTORE
1999 1586
2000/* 1587/*
@@ -2113,7 +1700,7 @@ out:
2113} 1700}
2114 1701
2115struct map_files_info { 1702struct map_files_info {
2116 struct file *file; 1703 fmode_t mode;
2117 unsigned long len; 1704 unsigned long len;
2118 unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ 1705 unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
2119}; 1706};
@@ -2122,13 +1709,10 @@ static struct dentry *
2122proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, 1709proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
2123 struct task_struct *task, const void *ptr) 1710 struct task_struct *task, const void *ptr)
2124{ 1711{
2125 const struct file *file = ptr; 1712 fmode_t mode = (fmode_t)(unsigned long)ptr;
2126 struct proc_inode *ei; 1713 struct proc_inode *ei;
2127 struct inode *inode; 1714 struct inode *inode;
2128 1715
2129 if (!file)
2130 return ERR_PTR(-ENOENT);
2131
2132 inode = proc_pid_make_inode(dir->i_sb, task); 1716 inode = proc_pid_make_inode(dir->i_sb, task);
2133 if (!inode) 1717 if (!inode)
2134 return ERR_PTR(-ENOENT); 1718 return ERR_PTR(-ENOENT);
@@ -2140,9 +1724,9 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
2140 inode->i_size = 64; 1724 inode->i_size = 64;
2141 inode->i_mode = S_IFLNK; 1725 inode->i_mode = S_IFLNK;
2142 1726
2143 if (file->f_mode & FMODE_READ) 1727 if (mode & FMODE_READ)
2144 inode->i_mode |= S_IRUSR; 1728 inode->i_mode |= S_IRUSR;
2145 if (file->f_mode & FMODE_WRITE) 1729 if (mode & FMODE_WRITE)
2146 inode->i_mode |= S_IWUSR; 1730 inode->i_mode |= S_IWUSR;
2147 1731
2148 d_set_d_op(dentry, &tid_map_files_dentry_operations); 1732 d_set_d_op(dentry, &tid_map_files_dentry_operations);
@@ -2186,7 +1770,8 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
2186 if (!vma) 1770 if (!vma)
2187 goto out_no_vma; 1771 goto out_no_vma;
2188 1772
2189 result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file); 1773 result = proc_map_files_instantiate(dir, dentry, task,
1774 (void *)(unsigned long)vma->vm_file->f_mode);
2190 1775
2191out_no_vma: 1776out_no_vma:
2192 up_read(&mm->mmap_sem); 1777 up_read(&mm->mmap_sem);
@@ -2287,8 +1872,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
2287 if (++pos <= filp->f_pos) 1872 if (++pos <= filp->f_pos)
2288 continue; 1873 continue;
2289 1874
2290 get_file(vma->vm_file); 1875 info.mode = vma->vm_file->f_mode;
2291 info.file = vma->vm_file;
2292 info.len = snprintf(info.name, 1876 info.len = snprintf(info.name,
2293 sizeof(info.name), "%lx-%lx", 1877 sizeof(info.name), "%lx-%lx",
2294 vma->vm_start, vma->vm_end); 1878 vma->vm_start, vma->vm_end);
@@ -2303,19 +1887,11 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
2303 ret = proc_fill_cache(filp, dirent, filldir, 1887 ret = proc_fill_cache(filp, dirent, filldir,
2304 p->name, p->len, 1888 p->name, p->len,
2305 proc_map_files_instantiate, 1889 proc_map_files_instantiate,
2306 task, p->file); 1890 task,
1891 (void *)(unsigned long)p->mode);
2307 if (ret) 1892 if (ret)
2308 break; 1893 break;
2309 filp->f_pos++; 1894 filp->f_pos++;
2310 fput(p->file);
2311 }
2312 for (; i < nr_files; i++) {
2313 /*
2314 * In case of error don't forget
2315 * to put rest of file refs.
2316 */
2317 p = flex_array_get(fa, i);
2318 fput(p->file);
2319 } 1895 }
2320 if (fa) 1896 if (fa)
2321 flex_array_free(fa); 1897 flex_array_free(fa);
@@ -2337,82 +1913,6 @@ static const struct file_operations proc_map_files_operations = {
2337 1913
2338#endif /* CONFIG_CHECKPOINT_RESTORE */ 1914#endif /* CONFIG_CHECKPOINT_RESTORE */
2339 1915
2340/*
2341 * /proc/pid/fd needs a special permission handler so that a process can still
2342 * access /proc/self/fd after it has executed a setuid().
2343 */
2344static int proc_fd_permission(struct inode *inode, int mask)
2345{
2346 int rv = generic_permission(inode, mask);
2347 if (rv == 0)
2348 return 0;
2349 if (task_pid(current) == proc_pid(inode))
2350 rv = 0;
2351 return rv;
2352}
2353
2354/*
2355 * proc directories can do almost nothing..
2356 */
2357static const struct inode_operations proc_fd_inode_operations = {
2358 .lookup = proc_lookupfd,
2359 .permission = proc_fd_permission,
2360 .setattr = proc_setattr,
2361};
2362
2363static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
2364 struct dentry *dentry, struct task_struct *task, const void *ptr)
2365{
2366 unsigned fd = (unsigned long)ptr;
2367 struct inode *inode;
2368 struct proc_inode *ei;
2369 struct dentry *error = ERR_PTR(-ENOENT);
2370
2371 inode = proc_pid_make_inode(dir->i_sb, task);
2372 if (!inode)
2373 goto out;
2374 ei = PROC_I(inode);
2375 ei->fd = fd;
2376 inode->i_mode = S_IFREG | S_IRUSR;
2377 inode->i_fop = &proc_fdinfo_file_operations;
2378 d_set_d_op(dentry, &tid_fd_dentry_operations);
2379 d_add(dentry, inode);
2380 /* Close the race of the process dying before we return the dentry */
2381 if (tid_fd_revalidate(dentry, 0))
2382 error = NULL;
2383
2384 out:
2385 return error;
2386}
2387
2388static struct dentry *proc_lookupfdinfo(struct inode *dir,
2389 struct dentry *dentry,
2390 unsigned int flags)
2391{
2392 return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
2393}
2394
2395static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
2396{
2397 return proc_readfd_common(filp, dirent, filldir,
2398 proc_fdinfo_instantiate);
2399}
2400
2401static const struct file_operations proc_fdinfo_operations = {
2402 .read = generic_read_dir,
2403 .readdir = proc_readfdinfo,
2404 .llseek = default_llseek,
2405};
2406
2407/*
2408 * proc directories can do almost nothing..
2409 */
2410static const struct inode_operations proc_fdinfo_inode_operations = {
2411 .lookup = proc_lookupfdinfo,
2412 .setattr = proc_setattr,
2413};
2414
2415
2416static struct dentry *proc_pident_instantiate(struct inode *dir, 1916static struct dentry *proc_pident_instantiate(struct inode *dir,
2417 struct dentry *dentry, struct task_struct *task, const void *ptr) 1917 struct dentry *dentry, struct task_struct *task, const void *ptr)
2418{ 1918{
@@ -2758,7 +2258,8 @@ static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
2758 pid_t tgid = task_tgid_nr_ns(current, ns); 2258 pid_t tgid = task_tgid_nr_ns(current, ns);
2759 char *name = ERR_PTR(-ENOENT); 2259 char *name = ERR_PTR(-ENOENT);
2760 if (tgid) { 2260 if (tgid) {
2761 name = __getname(); 2261 /* 11 for max length of signed int in decimal + NULL term */
2262 name = kmalloc(12, GFP_KERNEL);
2762 if (!name) 2263 if (!name)
2763 name = ERR_PTR(-ENOMEM); 2264 name = ERR_PTR(-ENOMEM);
2764 else 2265 else
@@ -2773,7 +2274,7 @@ static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
2773{ 2274{
2774 char *s = nd_get_link(nd); 2275 char *s = nd_get_link(nd);
2775 if (!IS_ERR(s)) 2276 if (!IS_ERR(s))
2776 __putname(s); 2277 kfree(s);
2777} 2278}
2778 2279
2779static const struct inode_operations proc_self_inode_operations = { 2280static const struct inode_operations proc_self_inode_operations = {
@@ -2983,6 +2484,11 @@ static int proc_gid_map_open(struct inode *inode, struct file *file)
2983 return proc_id_map_open(inode, file, &proc_gid_seq_operations); 2484 return proc_id_map_open(inode, file, &proc_gid_seq_operations);
2984} 2485}
2985 2486
2487static int proc_projid_map_open(struct inode *inode, struct file *file)
2488{
2489 return proc_id_map_open(inode, file, &proc_projid_seq_operations);
2490}
2491
2986static const struct file_operations proc_uid_map_operations = { 2492static const struct file_operations proc_uid_map_operations = {
2987 .open = proc_uid_map_open, 2493 .open = proc_uid_map_open,
2988 .write = proc_uid_map_write, 2494 .write = proc_uid_map_write,
@@ -2998,6 +2504,14 @@ static const struct file_operations proc_gid_map_operations = {
2998 .llseek = seq_lseek, 2504 .llseek = seq_lseek,
2999 .release = proc_id_map_release, 2505 .release = proc_id_map_release,
3000}; 2506};
2507
2508static const struct file_operations proc_projid_map_operations = {
2509 .open = proc_projid_map_open,
2510 .write = proc_projid_map_write,
2511 .read = seq_read,
2512 .llseek = seq_lseek,
2513 .release = proc_id_map_release,
2514};
3001#endif /* CONFIG_USER_NS */ 2515#endif /* CONFIG_USER_NS */
3002 2516
3003static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, 2517static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
@@ -3084,7 +2598,6 @@ static const struct pid_entry tgid_base_stuff[] = {
3084 REG("cgroup", S_IRUGO, proc_cgroup_operations), 2598 REG("cgroup", S_IRUGO, proc_cgroup_operations),
3085#endif 2599#endif
3086 INF("oom_score", S_IRUGO, proc_oom_score), 2600 INF("oom_score", S_IRUGO, proc_oom_score),
3087 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
3088 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 2601 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
3089#ifdef CONFIG_AUDITSYSCALL 2602#ifdef CONFIG_AUDITSYSCALL
3090 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 2603 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
@@ -3105,6 +2618,7 @@ static const struct pid_entry tgid_base_stuff[] = {
3105#ifdef CONFIG_USER_NS 2618#ifdef CONFIG_USER_NS
3106 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), 2619 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
3107 REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), 2620 REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
2621 REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
3108#endif 2622#endif
3109}; 2623};
3110 2624
@@ -3450,7 +2964,6 @@ static const struct pid_entry tid_base_stuff[] = {
3450 REG("cgroup", S_IRUGO, proc_cgroup_operations), 2964 REG("cgroup", S_IRUGO, proc_cgroup_operations),
3451#endif 2965#endif
3452 INF("oom_score", S_IRUGO, proc_oom_score), 2966 INF("oom_score", S_IRUGO, proc_oom_score),
3453 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
3454 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 2967 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
3455#ifdef CONFIG_AUDITSYSCALL 2968#ifdef CONFIG_AUDITSYSCALL
3456 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 2969 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
@@ -3468,6 +2981,7 @@ static const struct pid_entry tid_base_stuff[] = {
3468#ifdef CONFIG_USER_NS 2981#ifdef CONFIG_USER_NS
3469 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), 2982 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
3470 REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), 2983 REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
2984 REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
3471#endif 2985#endif
3472}; 2986};
3473 2987
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
new file mode 100644
index 000000000000..f28a875f8779
--- /dev/null
+++ b/fs/proc/fd.c
@@ -0,0 +1,367 @@
1#include <linux/sched.h>
2#include <linux/errno.h>
3#include <linux/dcache.h>
4#include <linux/path.h>
5#include <linux/fdtable.h>
6#include <linux/namei.h>
7#include <linux/pid.h>
8#include <linux/security.h>
9#include <linux/file.h>
10#include <linux/seq_file.h>
11
12#include <linux/proc_fs.h>
13
14#include "internal.h"
15#include "fd.h"
16
/*
 * seq_file show() for /proc/<pid>/fdinfo/<fd>: print the descriptor's
 * current position and open flags.  Returns -ENOENT if the task or the
 * descriptor has gone away.
 */
static int seq_show(struct seq_file *m, void *v)
{
	struct files_struct *files = NULL;
	int f_flags = 0, ret = -ENOENT;
	struct file *file = NULL;
	struct task_struct *task;

	task = get_proc_task(m->private);
	if (!task)
		return -ENOENT;

	files = get_files_struct(task);
	put_task_struct(task);

	if (files) {
		int fd = proc_fd(m->private);

		/* file_lock keeps the fd table entry stable while we sample it */
		spin_lock(&files->file_lock);
		file = fcheck_files(files, fd);
		if (file) {
			struct fdtable *fdt = files_fdtable(files);

			f_flags = file->f_flags;
			/* close-on-exec lives in the fdtable bitmap, not f_flags */
			if (close_on_exec(fd, fdt))
				f_flags |= O_CLOEXEC;

			/* take a reference so we can print after dropping the lock */
			get_file(file);
			ret = 0;
		}
		spin_unlock(&files->file_lock);
		put_files_struct(files);
	}

	if (!ret) {
		seq_printf(m, "pos:\t%lli\nflags:\t0%o\n",
			   (long long)file->f_pos, f_flags);
		fput(file);
	}

	return ret;
}
58
59static int seq_fdinfo_open(struct inode *inode, struct file *file)
60{
61 return single_open(file, seq_show, inode);
62}
63
/* File operations for /proc/<pid>/fdinfo/<fd>: single-record seq_file. */
static const struct file_operations proc_fdinfo_file_operations = {
	.open		= seq_fdinfo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
70
/*
 * d_revalidate for fd/fdinfo dentries: confirm the task is still alive
 * and the descriptor still open, refreshing the inode's owner and (for
 * fd symlinks) its permission bits from the live struct file.
 * Returns 1 if the dentry is still valid, 0 after dropping it,
 * -ECHILD when called in RCU-walk mode.
 */
static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
{
	struct files_struct *files;
	struct task_struct *task;
	const struct cred *cred;
	struct inode *inode;
	int fd;

	/* we take references and sleepable locks below — bail out of RCU-walk */
	if (flags & LOOKUP_RCU)
		return -ECHILD;

	inode = dentry->d_inode;
	task = get_proc_task(inode);
	fd = proc_fd(inode);

	if (task) {
		files = get_files_struct(task);
		if (files) {
			struct file *file;

			rcu_read_lock();
			file = fcheck_files(files, fd);
			if (file) {
				/* sample f_mode before leaving the RCU section */
				unsigned f_mode = file->f_mode;

				rcu_read_unlock();
				put_files_struct(files);

				if (task_dumpable(task)) {
					rcu_read_lock();
					cred = __task_cred(task);
					inode->i_uid = cred->euid;
					inode->i_gid = cred->egid;
					rcu_read_unlock();
				} else {
					/* non-dumpable task: keep the inode root-owned */
					inode->i_uid = GLOBAL_ROOT_UID;
					inode->i_gid = GLOBAL_ROOT_GID;
				}

				if (S_ISLNK(inode->i_mode)) {
					/* fd symlink: mirror the file's access mode */
					unsigned i_mode = S_IFLNK;
					if (f_mode & FMODE_READ)
						i_mode |= S_IRUSR | S_IXUSR;
					if (f_mode & FMODE_WRITE)
						i_mode |= S_IWUSR | S_IXUSR;
					inode->i_mode = i_mode;
				}

				security_task_to_inode(task, inode);
				put_task_struct(task);
				return 1;
			}
			rcu_read_unlock();
			put_files_struct(files);
		}
		put_task_struct(task);
	}

	/* task gone or fd closed: invalidate this dentry */
	d_drop(dentry);
	return 0;
}
132
/* Dentry operations shared by fd and fdinfo entries. */
static const struct dentry_operations tid_fd_dentry_operations = {
	.d_revalidate	= tid_fd_revalidate,
	.d_delete	= pid_delete_dentry,
};
137
/*
 * proc_get_link callback for /proc/<pid>/fd/<n>: resolve the symlink to
 * the open file's path, taking a path reference for the caller.
 * Returns 0 on success, -ENOENT if the task or descriptor is gone.
 */
static int proc_fd_link(struct dentry *dentry, struct path *path)
{
	struct files_struct *files = NULL;
	struct task_struct *task;
	int ret = -ENOENT;

	task = get_proc_task(dentry->d_inode);
	if (task) {
		files = get_files_struct(task);
		put_task_struct(task);
	}

	if (files) {
		int fd = proc_fd(dentry->d_inode);
		struct file *fd_file;

		/* file_lock keeps the entry stable while we copy f_path */
		spin_lock(&files->file_lock);
		fd_file = fcheck_files(files, fd);
		if (fd_file) {
			*path = fd_file->f_path;
			path_get(&fd_file->f_path);
			ret = 0;
		}
		spin_unlock(&files->file_lock);
		put_files_struct(files);
	}

	return ret;
}
167
/*
 * Build the inode and dentry for one /proc/<pid>/fd/<n> symlink.
 * @ptr carries the descriptor number cast through void *.
 * Returns NULL on success, ERR_PTR(-ENOENT) on failure.
 */
static struct dentry *
proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
		    struct task_struct *task, const void *ptr)
{
	struct dentry *error = ERR_PTR(-ENOENT);
	unsigned fd = (unsigned long)ptr;
	struct proc_inode *ei;
	struct inode *inode;

	inode = proc_pid_make_inode(dir->i_sb, task);
	if (!inode)
		goto out;

	ei = PROC_I(inode);
	ei->fd = fd;

	inode->i_mode = S_IFLNK;
	inode->i_op = &proc_pid_link_inode_operations;
	inode->i_size = 64;	/* nominal size; symlink targets are short */

	ei->op.proc_get_link = proc_fd_link;

	d_set_d_op(dentry, &tid_fd_dentry_operations);
	d_add(dentry, inode);

	/* Close the race of the process dying before we return the dentry */
	if (tid_fd_revalidate(dentry, 0))
		error = NULL;
 out:
	return error;
}
199
200static struct dentry *proc_lookupfd_common(struct inode *dir,
201 struct dentry *dentry,
202 instantiate_t instantiate)
203{
204 struct task_struct *task = get_proc_task(dir);
205 struct dentry *result = ERR_PTR(-ENOENT);
206 unsigned fd = name_to_int(dentry);
207
208 if (!task)
209 goto out_no_task;
210 if (fd == ~0U)
211 goto out;
212
213 result = instantiate(dir, dentry, task, (void *)(unsigned long)fd);
214out:
215 put_task_struct(task);
216out_no_task:
217 return result;
218}
219
/*
 * Shared readdir for /proc/<pid>/fd and fdinfo: emit "." and "..",
 * then one entry per open descriptor, created through @instantiate.
 * f_pos 0 and 1 are the dot entries; descriptor N sits at f_pos N + 2.
 */
static int proc_readfd_common(struct file * filp, void * dirent,
			      filldir_t filldir, instantiate_t instantiate)
{
	struct dentry *dentry = filp->f_path.dentry;
	struct inode *inode = dentry->d_inode;
	struct task_struct *p = get_proc_task(inode);
	struct files_struct *files;
	unsigned int fd, ino;
	int retval;

	retval = -ENOENT;
	if (!p)
		goto out_no_task;
	retval = 0;

	fd = filp->f_pos;
	switch (fd) {
	case 0:
		if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
			goto out;
		filp->f_pos++;
		/* fall through */
	case 1:
		ino = parent_ino(dentry);
		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
			goto out;
		filp->f_pos++;
		/* fall through */
	default:
		files = get_files_struct(p);
		if (!files)
			goto out;
		rcu_read_lock();
		for (fd = filp->f_pos - 2;
		     fd < files_fdtable(files)->max_fds;
		     fd++, filp->f_pos++) {
			char name[PROC_NUMBUF];
			int len;
			int rv;

			if (!fcheck_files(files, fd))
				continue;
			/* leave the RCU section: proc_fill_cache may sleep */
			rcu_read_unlock();

			len = snprintf(name, sizeof(name), "%d", fd);
			rv = proc_fill_cache(filp, dirent, filldir,
					     name, len, instantiate, p,
					     (void *)(unsigned long)fd);
			if (rv < 0)
				goto out_fd_loop;
			rcu_read_lock();
		}
		rcu_read_unlock();
out_fd_loop:
		put_files_struct(files);
	}
out:
	put_task_struct(p);
out_no_task:
	return retval;
}
279
280static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
281{
282 return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
283}
284
/* Directory operations for /proc/<pid>/fd. */
const struct file_operations proc_fd_operations = {
	.read		= generic_read_dir,
	.readdir	= proc_readfd,
	.llseek		= default_llseek,
};
290
291static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
292 unsigned int flags)
293{
294 return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
295}
296
297/*
298 * /proc/pid/fd needs a special permission handler so that a process can still
299 * access /proc/self/fd after it has executed a setuid().
300 */
301int proc_fd_permission(struct inode *inode, int mask)
302{
303 int rv = generic_permission(inode, mask);
304 if (rv == 0)
305 return 0;
306 if (task_pid(current) == proc_pid(inode))
307 rv = 0;
308 return rv;
309}
310
/* Inode operations for the /proc/<pid>/fd directory. */
const struct inode_operations proc_fd_inode_operations = {
	.lookup		= proc_lookupfd,
	.permission	= proc_fd_permission,
	.setattr	= proc_setattr,
};
316
/*
 * Build the inode and dentry for one /proc/<pid>/fdinfo/<n> regular file.
 * @ptr carries the descriptor number cast through void *.
 * Returns NULL on success, ERR_PTR(-ENOENT) on failure.
 */
static struct dentry *
proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
			struct task_struct *task, const void *ptr)
{
	struct dentry *error = ERR_PTR(-ENOENT);
	unsigned fd = (unsigned long)ptr;
	struct proc_inode *ei;
	struct inode *inode;

	inode = proc_pid_make_inode(dir->i_sb, task);
	if (!inode)
		goto out;

	ei = PROC_I(inode);
	ei->fd = fd;

	inode->i_mode = S_IFREG | S_IRUSR;
	inode->i_fop = &proc_fdinfo_file_operations;

	d_set_d_op(dentry, &tid_fd_dentry_operations);
	d_add(dentry, inode);

	/* Close the race of the process dying before we return the dentry */
	if (tid_fd_revalidate(dentry, 0))
		error = NULL;
 out:
	return error;
}
345
346static struct dentry *
347proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags)
348{
349 return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
350}
351
352static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
353{
354 return proc_readfd_common(filp, dirent, filldir,
355 proc_fdinfo_instantiate);
356}
357
/* Inode operations for the /proc/<pid>/fdinfo directory. */
const struct inode_operations proc_fdinfo_inode_operations = {
	.lookup		= proc_lookupfdinfo,
	.setattr	= proc_setattr,
};
362
/* Directory operations for /proc/<pid>/fdinfo. */
const struct file_operations proc_fdinfo_operations = {
	.read		= generic_read_dir,
	.readdir	= proc_readfdinfo,
	.llseek		= default_llseek,
};
diff --git a/fs/proc/fd.h b/fs/proc/fd.h
new file mode 100644
index 000000000000..cbb1d47deda8
--- /dev/null
+++ b/fs/proc/fd.h
@@ -0,0 +1,14 @@
#ifndef __PROCFS_FD_H__
#define __PROCFS_FD_H__

#include <linux/fs.h>

/* /proc/<pid>/fd directory and inode operations (fs/proc/fd.c). */
extern const struct file_operations proc_fd_operations;
extern const struct inode_operations proc_fd_inode_operations;

/* /proc/<pid>/fdinfo directory and inode operations (fs/proc/fd.c). */
extern const struct file_operations proc_fdinfo_operations;
extern const struct inode_operations proc_fdinfo_inode_operations;

/* Permission check letting a task reach its own fd dir after setuid. */
extern int proc_fd_permission(struct inode *inode, int mask);

#endif /* __PROCFS_FD_H__ */
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index b3647fe6a608..0d80cef4cfb9 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -427,7 +427,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
427 if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { 427 if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
428 pde_get(de); 428 pde_get(de);
429 spin_unlock(&proc_subdir_lock); 429 spin_unlock(&proc_subdir_lock);
430 error = -EINVAL; 430 error = -ENOMEM;
431 inode = proc_get_inode(dir->i_sb, de); 431 inode = proc_get_inode(dir->i_sb, de);
432 goto out_unlock; 432 goto out_unlock;
433 } 433 }
@@ -605,7 +605,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
605 unsigned int len; 605 unsigned int len;
606 606
607 /* make sure name is valid */ 607 /* make sure name is valid */
608 if (!name || !strlen(name)) goto out; 608 if (!name || !strlen(name))
609 goto out;
609 610
610 if (xlate_proc_name(name, parent, &fn) != 0) 611 if (xlate_proc_name(name, parent, &fn) != 0)
611 goto out; 612 goto out;
@@ -616,20 +617,18 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
616 617
617 len = strlen(fn); 618 len = strlen(fn);
618 619
619 ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL); 620 ent = kzalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
620 if (!ent) goto out; 621 if (!ent)
622 goto out;
621 623
622 memset(ent, 0, sizeof(struct proc_dir_entry));
623 memcpy(ent->name, fn, len + 1); 624 memcpy(ent->name, fn, len + 1);
624 ent->namelen = len; 625 ent->namelen = len;
625 ent->mode = mode; 626 ent->mode = mode;
626 ent->nlink = nlink; 627 ent->nlink = nlink;
627 atomic_set(&ent->count, 1); 628 atomic_set(&ent->count, 1);
628 ent->pde_users = 0;
629 spin_lock_init(&ent->pde_unload_lock); 629 spin_lock_init(&ent->pde_unload_lock);
630 ent->pde_unload_completion = NULL;
631 INIT_LIST_HEAD(&ent->pde_openers); 630 INIT_LIST_HEAD(&ent->pde_openers);
632 out: 631out:
633 return ent; 632 return ent;
634} 633}
635 634
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 7ac817b64a71..3b22bbdee9ec 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -450,7 +450,6 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
450 return NULL; 450 return NULL;
451 if (inode->i_state & I_NEW) { 451 if (inode->i_state & I_NEW) {
452 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 452 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
453 PROC_I(inode)->fd = 0;
454 PROC_I(inode)->pde = de; 453 PROC_I(inode)->pde = de;
455 454
456 if (de->mode) { 455 if (de->mode) {
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index e1167a1c9126..cceaab07ad54 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -9,6 +9,7 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/sched.h>
12#include <linux/proc_fs.h> 13#include <linux/proc_fs.h>
13struct ctl_table_header; 14struct ctl_table_header;
14 15
@@ -65,6 +66,7 @@ extern const struct file_operations proc_clear_refs_operations;
65extern const struct file_operations proc_pagemap_operations; 66extern const struct file_operations proc_pagemap_operations;
66extern const struct file_operations proc_net_operations; 67extern const struct file_operations proc_net_operations;
67extern const struct inode_operations proc_net_inode_operations; 68extern const struct inode_operations proc_net_inode_operations;
69extern const struct inode_operations proc_pid_link_inode_operations;
68 70
69struct proc_maps_private { 71struct proc_maps_private {
70 struct pid *pid; 72 struct pid *pid;
@@ -91,6 +93,52 @@ static inline int proc_fd(struct inode *inode)
91 return PROC_I(inode)->fd; 93 return PROC_I(inode)->fd;
92} 94}
93 95
/*
 * Is @task's mm dumpable (SUID_DUMPABLE_ENABLED)?  Used to decide
 * whether proc inodes may expose the task's effective uid/gid.
 * Tasks without an mm (kernel threads, exited tasks) report 0.
 */
static inline int task_dumpable(struct task_struct *task)
{
	int dumpable = 0;
	struct mm_struct *mm;

	task_lock(task);
	mm = task->mm;
	if (mm)
		dumpable = get_dumpable(mm);
	task_unlock(task);
	if (dumpable == SUID_DUMPABLE_ENABLED)
		return 1;
	return 0;
}
110
/* d_delete callback for per-task proc dentries. */
static inline int pid_delete_dentry(const struct dentry * dentry)
{
	/* Is the task we represent dead?
	 * If so, then don't put the dentry on the lru list,
	 * kill it immediately.
	 */
	return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
}
119
/*
 * Parse a dentry name as a non-negative decimal number (pid or fd).
 * Rejects leading zeros (each number has exactly one valid name) and
 * values that could overflow; returns ~0U on any parse failure.
 */
static inline unsigned name_to_int(struct dentry *dentry)
{
	const char *name = dentry->d_name.name;
	int len = dentry->d_name.len;
	unsigned n = 0;

	if (len > 1 && *name == '0')
		goto out;
	while (len-- > 0) {
		unsigned c = *name++ - '0';
		if (c > 9)
			goto out;
		/* conservative guard: refuse any value that might wrap */
		if (n >= (~0U-9)/10)
			goto out;
		n *= 10;
		n += c;
	}
	return n;
out:
	return ~0U;
}
141
94struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino, 142struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino,
95 struct dentry *dentry); 143 struct dentry *dentry);
96int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, 144int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 7fcd0d60a968..b8730d9ebaee 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -115,7 +115,13 @@ u64 stable_page_flags(struct page *page)
115 u |= 1 << KPF_COMPOUND_TAIL; 115 u |= 1 << KPF_COMPOUND_TAIL;
116 if (PageHuge(page)) 116 if (PageHuge(page))
117 u |= 1 << KPF_HUGE; 117 u |= 1 << KPF_HUGE;
118 else if (PageTransCompound(page)) 118 /*
119 * PageTransCompound can be true for non-huge compound pages (slab
120 * pages or pages allocated by drivers with __GFP_COMP) because it
121 * just checks PG_head/PG_tail, so we need to check PageLRU to make
122 * sure a given page is a thp, not a non-huge compound page.
123 */
124 else if (PageTransCompound(page) && PageLRU(compound_trans_head(page)))
119 u |= 1 << KPF_THP; 125 u |= 1 << KPF_THP;
120 126
121 /* 127 /*
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index dfafeb2b05a0..a781bdf06694 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -142,6 +142,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
142 } 142 }
143 143
144 rb_link_node(node, parent, p); 144 rb_link_node(node, parent, p);
145 rb_insert_color(node, &head->parent->root);
145 return 0; 146 return 0;
146} 147}
147 148
@@ -168,10 +169,8 @@ static void init_header(struct ctl_table_header *head,
168 head->node = node; 169 head->node = node;
169 if (node) { 170 if (node) {
170 struct ctl_table *entry; 171 struct ctl_table *entry;
171 for (entry = table; entry->procname; entry++, node++) { 172 for (entry = table; entry->procname; entry++, node++)
172 rb_init_node(&node->node);
173 node->header = head; 173 node->header = head;
174 }
175 } 174 }
176} 175}
177 176
@@ -266,8 +265,7 @@ void sysctl_head_put(struct ctl_table_header *head)
266 265
267static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) 266static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
268{ 267{
269 if (!head) 268 BUG_ON(!head);
270 BUG();
271 spin_lock(&sysctl_lock); 269 spin_lock(&sysctl_lock);
272 if (!use_table(head)) 270 if (!use_table(head))
273 head = ERR_PTR(-ENOENT); 271 head = ERR_PTR(-ENOENT);
@@ -462,9 +460,6 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
462 460
463 err = ERR_PTR(-ENOMEM); 461 err = ERR_PTR(-ENOMEM);
464 inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); 462 inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
465 if (h)
466 sysctl_head_finish(h);
467
468 if (!inode) 463 if (!inode)
469 goto out; 464 goto out;
470 465
@@ -473,6 +468,8 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
473 d_add(dentry, inode); 468 d_add(dentry, inode);
474 469
475out: 470out:
471 if (h)
472 sysctl_head_finish(h);
476 sysctl_head_finish(head); 473 sysctl_head_finish(head);
477 return err; 474 return err;
478} 475}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 9a2d9fd7cadd..9889a92d2e01 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -61,7 +61,7 @@ static int proc_parse_options(char *options, struct pid_namespace *pid)
61 if (!*p) 61 if (!*p)
62 continue; 62 continue;
63 63
64 args[0].to = args[0].from = 0; 64 args[0].to = args[0].from = NULL;
65 token = match_token(p, tokens, args); 65 token = match_token(p, tokens, args);
66 switch (token) { 66 switch (token) {
67 case Opt_gid: 67 case Opt_gid:
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4540b8f76f16..14df8806ff29 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -54,7 +54,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
54 "VmPTE:\t%8lu kB\n" 54 "VmPTE:\t%8lu kB\n"
55 "VmSwap:\t%8lu kB\n", 55 "VmSwap:\t%8lu kB\n",
56 hiwater_vm << (PAGE_SHIFT-10), 56 hiwater_vm << (PAGE_SHIFT-10),
57 (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), 57 total_vm << (PAGE_SHIFT-10),
58 mm->locked_vm << (PAGE_SHIFT-10), 58 mm->locked_vm << (PAGE_SHIFT-10),
59 mm->pinned_vm << (PAGE_SHIFT-10), 59 mm->pinned_vm << (PAGE_SHIFT-10),
60 hiwater_rss << (PAGE_SHIFT-10), 60 hiwater_rss << (PAGE_SHIFT-10),
@@ -1158,6 +1158,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1158 struct vm_area_struct *vma = v; 1158 struct vm_area_struct *vma = v;
1159 struct numa_maps *md = &numa_priv->md; 1159 struct numa_maps *md = &numa_priv->md;
1160 struct file *file = vma->vm_file; 1160 struct file *file = vma->vm_file;
1161 struct task_struct *task = proc_priv->task;
1161 struct mm_struct *mm = vma->vm_mm; 1162 struct mm_struct *mm = vma->vm_mm;
1162 struct mm_walk walk = {}; 1163 struct mm_walk walk = {};
1163 struct mempolicy *pol; 1164 struct mempolicy *pol;
@@ -1177,9 +1178,11 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1177 walk.private = md; 1178 walk.private = md;
1178 walk.mm = mm; 1179 walk.mm = mm;
1179 1180
1180 pol = get_vma_policy(proc_priv->task, vma, vma->vm_start); 1181 task_lock(task);
1182 pol = get_vma_policy(task, vma, vma->vm_start);
1181 mpol_to_str(buffer, sizeof(buffer), pol, 0); 1183 mpol_to_str(buffer, sizeof(buffer), pol, 0);
1182 mpol_cond_put(pol); 1184 mpol_cond_put(pol);
1185 task_unlock(task);
1183 1186
1184 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 1187 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1185 1188
@@ -1189,7 +1192,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1189 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { 1192 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1190 seq_printf(m, " heap"); 1193 seq_printf(m, " heap");
1191 } else { 1194 } else {
1192 pid_t tid = vm_is_stack(proc_priv->task, vma, is_pid); 1195 pid_t tid = vm_is_stack(task, vma, is_pid);
1193 if (tid != 0) { 1196 if (tid != 0) {
1194 /* 1197 /*
1195 * Thread stack in /proc/PID/task/TID/maps or 1198 * Thread stack in /proc/PID/task/TID/maps or
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index d39bb5cce883..ca71db69da07 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -23,6 +23,7 @@ config PSTORE_FTRACE
23 bool "Persistent function tracer" 23 bool "Persistent function tracer"
24 depends on PSTORE 24 depends on PSTORE
25 depends on FUNCTION_TRACER 25 depends on FUNCTION_TRACER
26 depends on DEBUG_FS
26 help 27 help
27 With this option kernel traces function calls into a persistent 28 With this option kernel traces function calls into a persistent
28 ram buffer that can be decoded and dumped after reboot through 29 ram buffer that can be decoded and dumped after reboot through
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index a130d484b7d3..2d57e1ac0115 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -17,19 +17,113 @@
17#include <linux/percpu.h> 17#include <linux/percpu.h>
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <linux/atomic.h> 19#include <linux/atomic.h>
20#include <linux/types.h>
21#include <linux/mutex.h>
22#include <linux/ftrace.h>
23#include <linux/fs.h>
24#include <linux/debugfs.h>
25#include <linux/err.h>
26#include <linux/cache.h>
20#include <asm/barrier.h> 27#include <asm/barrier.h>
21#include "internal.h" 28#include "internal.h"
22 29
23void notrace pstore_ftrace_call(unsigned long ip, unsigned long parent_ip) 30static void notrace pstore_ftrace_call(unsigned long ip,
31 unsigned long parent_ip)
24{ 32{
33 unsigned long flags;
25 struct pstore_ftrace_record rec = {}; 34 struct pstore_ftrace_record rec = {};
26 35
27 if (unlikely(oops_in_progress)) 36 if (unlikely(oops_in_progress))
28 return; 37 return;
29 38
39 local_irq_save(flags);
40
30 rec.ip = ip; 41 rec.ip = ip;
31 rec.parent_ip = parent_ip; 42 rec.parent_ip = parent_ip;
32 pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id()); 43 pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id());
33 psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec, 44 psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec,
34 sizeof(rec), psinfo); 45 sizeof(rec), psinfo);
46
47 local_irq_restore(flags);
48}
49
50static struct ftrace_ops pstore_ftrace_ops __read_mostly = {
51 .func = pstore_ftrace_call,
52};
53
54static DEFINE_MUTEX(pstore_ftrace_lock);
55static bool pstore_ftrace_enabled;
56
57static ssize_t pstore_ftrace_knob_write(struct file *f, const char __user *buf,
58 size_t count, loff_t *ppos)
59{
60 u8 on;
61 ssize_t ret;
62
63 ret = kstrtou8_from_user(buf, count, 2, &on);
64 if (ret)
65 return ret;
66
67 mutex_lock(&pstore_ftrace_lock);
68
69 if (!on ^ pstore_ftrace_enabled)
70 goto out;
71
72 if (on)
73 ret = register_ftrace_function(&pstore_ftrace_ops);
74 else
75 ret = unregister_ftrace_function(&pstore_ftrace_ops);
76 if (ret) {
77 pr_err("%s: unable to %sregister ftrace ops: %zd\n",
78 __func__, on ? "" : "un", ret);
79 goto err;
80 }
81
82 pstore_ftrace_enabled = on;
83out:
84 ret = count;
85err:
86 mutex_unlock(&pstore_ftrace_lock);
87
88 return ret;
89}
90
91static ssize_t pstore_ftrace_knob_read(struct file *f, char __user *buf,
92 size_t count, loff_t *ppos)
93{
94 char val[] = { '0' + pstore_ftrace_enabled, '\n' };
95
96 return simple_read_from_buffer(buf, count, ppos, val, sizeof(val));
97}
98
99static const struct file_operations pstore_knob_fops = {
100 .open = simple_open,
101 .read = pstore_ftrace_knob_read,
102 .write = pstore_ftrace_knob_write,
103};
104
105void pstore_register_ftrace(void)
106{
107 struct dentry *dir;
108 struct dentry *file;
109
110 if (!psinfo->write_buf)
111 return;
112
113 dir = debugfs_create_dir("pstore", NULL);
114 if (!dir) {
115 pr_err("%s: unable to create pstore directory\n", __func__);
116 return;
117 }
118
119 file = debugfs_create_file("record_ftrace", 0600, dir, NULL,
120 &pstore_knob_fops);
121 if (!file) {
122 pr_err("%s: unable to create record_ftrace file\n", __func__);
123 goto err_file;
124 }
125
126 return;
127err_file:
128 debugfs_remove(dir);
35} 129}
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 0d0d3b7d5f12..4847f588b7d5 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -39,6 +39,12 @@ pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec)
39#endif 39#endif
40} 40}
41 41
42#ifdef CONFIG_PSTORE_FTRACE
43extern void pstore_register_ftrace(void);
44#else
45static inline void pstore_register_ftrace(void) {}
46#endif
47
42extern struct pstore_info *psinfo; 48extern struct pstore_info *psinfo;
43 49
44extern void pstore_set_kmsg_bytes(int); 50extern void pstore_set_kmsg_bytes(int);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 29996e8793a7..a40da07e93d6 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -164,7 +164,13 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
164 164
165 if (c > psinfo->bufsize) 165 if (c > psinfo->bufsize)
166 c = psinfo->bufsize; 166 c = psinfo->bufsize;
167 spin_lock_irqsave(&psinfo->buf_lock, flags); 167
168 if (oops_in_progress) {
169 if (!spin_trylock_irqsave(&psinfo->buf_lock, flags))
170 break;
171 } else {
172 spin_lock_irqsave(&psinfo->buf_lock, flags);
173 }
168 memcpy(psinfo->buf, s, c); 174 memcpy(psinfo->buf, s, c);
169 psinfo->write(PSTORE_TYPE_CONSOLE, 0, NULL, 0, c, psinfo); 175 psinfo->write(PSTORE_TYPE_CONSOLE, 0, NULL, 0, c, psinfo);
170 spin_unlock_irqrestore(&psinfo->buf_lock, flags); 176 spin_unlock_irqrestore(&psinfo->buf_lock, flags);
@@ -236,6 +242,7 @@ int pstore_register(struct pstore_info *psi)
236 242
237 kmsg_dump_register(&pstore_dumper); 243 kmsg_dump_register(&pstore_dumper);
238 pstore_register_console(); 244 pstore_register_console();
245 pstore_register_ftrace();
239 246
240 if (pstore_update_ms >= 0) { 247 if (pstore_update_ms >= 0) {
241 pstore_timer.expires = jiffies + 248 pstore_timer.expires = jiffies +
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 0b311bc18916..1a4f6da58eab 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -32,6 +32,7 @@
32#include <linux/ioport.h> 32#include <linux/ioport.h>
33#include <linux/platform_device.h> 33#include <linux/platform_device.h>
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <linux/compiler.h>
35#include <linux/pstore_ram.h> 36#include <linux/pstore_ram.h>
36 37
37#define RAMOOPS_KERNMSG_HDR "====" 38#define RAMOOPS_KERNMSG_HDR "===="
@@ -181,12 +182,11 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz)
181 return len; 182 return len;
182} 183}
183 184
184 185static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
185static int ramoops_pstore_write_buf(enum pstore_type_id type, 186 enum kmsg_dump_reason reason,
186 enum kmsg_dump_reason reason, 187 u64 *id, unsigned int part,
187 u64 *id, unsigned int part, 188 const char *buf, size_t size,
188 const char *buf, size_t size, 189 struct pstore_info *psi)
189 struct pstore_info *psi)
190{ 190{
191 struct ramoops_context *cxt = psi->data; 191 struct ramoops_context *cxt = psi->data;
192 struct persistent_ram_zone *prz = cxt->przs[cxt->dump_write_cnt]; 192 struct persistent_ram_zone *prz = cxt->przs[cxt->dump_write_cnt];
@@ -406,7 +406,7 @@ static int __devinit ramoops_probe(struct platform_device *pdev)
406 goto fail_init_fprz; 406 goto fail_init_fprz;
407 407
408 if (!cxt->przs && !cxt->cprz && !cxt->fprz) { 408 if (!cxt->przs && !cxt->cprz && !cxt->fprz) {
409 pr_err("memory size too small, minimum is %lu\n", 409 pr_err("memory size too small, minimum is %zu\n",
410 cxt->console_size + cxt->record_size + 410 cxt->console_size + cxt->record_size +
411 cxt->ftrace_size); 411 cxt->ftrace_size);
412 goto fail_cnt; 412 goto fail_cnt;
@@ -414,13 +414,14 @@ static int __devinit ramoops_probe(struct platform_device *pdev)
414 414
415 cxt->pstore.data = cxt; 415 cxt->pstore.data = cxt;
416 /* 416 /*
417 * Console can handle any buffer size, so prefer dumps buffer 417 * Console can handle any buffer size, so prefer LOG_LINE_MAX. If we
418 * size since usually it is smaller. 418 * have to handle dumps, we must have at least record_size buffer. And
419 * for ftrace, bufsize is irrelevant (if bufsize is 0, buf will be
420 * ZERO_SIZE_PTR).
419 */ 421 */
420 if (cxt->przs) 422 if (cxt->console_size)
421 cxt->pstore.bufsize = cxt->przs[0]->buffer_size; 423 cxt->pstore.bufsize = 1024; /* LOG_LINE_MAX */
422 else 424 cxt->pstore.bufsize = max(cxt->record_size, cxt->pstore.bufsize);
423 cxt->pstore.bufsize = cxt->cprz->buffer_size;
424 cxt->pstore.buf = kmalloc(cxt->pstore.bufsize, GFP_KERNEL); 425 cxt->pstore.buf = kmalloc(cxt->pstore.bufsize, GFP_KERNEL);
425 spin_lock_init(&cxt->pstore.buf_lock); 426 spin_lock_init(&cxt->pstore.buf_lock);
426 if (!cxt->pstore.buf) { 427 if (!cxt->pstore.buf) {
@@ -537,6 +538,7 @@ postcore_initcall(ramoops_init);
537static void __exit ramoops_exit(void) 538static void __exit ramoops_exit(void)
538{ 539{
539 platform_driver_unregister(&ramoops_driver); 540 platform_driver_unregister(&ramoops_driver);
541 platform_device_unregister(dummy);
540 kfree(dummy_data); 542 kfree(dummy_data);
541} 543}
542module_exit(ramoops_exit); 544module_exit(ramoops_exit);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 552e994e3aa1..43098bb5723a 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -312,8 +312,8 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
312 (ino % QNX4_INODES_PER_BLOCK); 312 (ino % QNX4_INODES_PER_BLOCK);
313 313
314 inode->i_mode = le16_to_cpu(raw_inode->di_mode); 314 inode->i_mode = le16_to_cpu(raw_inode->di_mode);
315 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->di_uid); 315 i_uid_write(inode, (uid_t)le16_to_cpu(raw_inode->di_uid));
316 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->di_gid); 316 i_gid_write(inode, (gid_t)le16_to_cpu(raw_inode->di_gid));
317 set_nlink(inode, le16_to_cpu(raw_inode->di_nlink)); 317 set_nlink(inode, le16_to_cpu(raw_inode->di_nlink));
318 inode->i_size = le32_to_cpu(raw_inode->di_size); 318 inode->i_size = le32_to_cpu(raw_inode->di_size);
319 inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->di_mtime); 319 inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->di_mtime);
@@ -391,6 +391,11 @@ static int init_inodecache(void)
391 391
392static void destroy_inodecache(void) 392static void destroy_inodecache(void)
393{ 393{
394 /*
395 * Make sure all delayed rcu free inodes are flushed before we
396 * destroy cache.
397 */
398 rcu_barrier();
394 kmem_cache_destroy(qnx4_inode_cachep); 399 kmem_cache_destroy(qnx4_inode_cachep);
395} 400}
396 401
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 2049c814bda4..b6addf560483 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -574,8 +574,8 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
574 raw_inode = ((struct qnx6_inode_entry *)page_address(page)) + offs; 574 raw_inode = ((struct qnx6_inode_entry *)page_address(page)) + offs;
575 575
576 inode->i_mode = fs16_to_cpu(sbi, raw_inode->di_mode); 576 inode->i_mode = fs16_to_cpu(sbi, raw_inode->di_mode);
577 inode->i_uid = (uid_t)fs32_to_cpu(sbi, raw_inode->di_uid); 577 i_uid_write(inode, (uid_t)fs32_to_cpu(sbi, raw_inode->di_uid));
578 inode->i_gid = (gid_t)fs32_to_cpu(sbi, raw_inode->di_gid); 578 i_gid_write(inode, (gid_t)fs32_to_cpu(sbi, raw_inode->di_gid));
579 inode->i_size = fs64_to_cpu(sbi, raw_inode->di_size); 579 inode->i_size = fs64_to_cpu(sbi, raw_inode->di_size);
580 inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_mtime); 580 inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_mtime);
581 inode->i_mtime.tv_nsec = 0; 581 inode->i_mtime.tv_nsec = 0;
@@ -651,6 +651,11 @@ static int init_inodecache(void)
651 651
652static void destroy_inodecache(void) 652static void destroy_inodecache(void)
653{ 653{
654 /*
655 * Make sure all delayed rcu free inodes are flushed before we
656 * destroy cache.
657 */
658 rcu_barrier();
654 kmem_cache_destroy(qnx6_inode_cachep); 659 kmem_cache_destroy(qnx6_inode_cachep);
655} 660}
656 661
diff --git a/fs/quota/Makefile b/fs/quota/Makefile
index 5f9e9e276af0..c66c37cdaa39 100644
--- a/fs/quota/Makefile
+++ b/fs/quota/Makefile
@@ -2,6 +2,6 @@ obj-$(CONFIG_QUOTA) += dquot.o
2obj-$(CONFIG_QFMT_V1) += quota_v1.o 2obj-$(CONFIG_QFMT_V1) += quota_v1.o
3obj-$(CONFIG_QFMT_V2) += quota_v2.o 3obj-$(CONFIG_QFMT_V2) += quota_v2.o
4obj-$(CONFIG_QUOTA_TREE) += quota_tree.o 4obj-$(CONFIG_QUOTA_TREE) += quota_tree.o
5obj-$(CONFIG_QUOTACTL) += quota.o 5obj-$(CONFIG_QUOTACTL) += quota.o kqid.o
6obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o 6obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o
7obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o 7obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index c495a3055e2a..05ae3c97f7a5 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -253,8 +253,10 @@ static qsize_t inode_get_rsv_space(struct inode *inode);
253static void __dquot_initialize(struct inode *inode, int type); 253static void __dquot_initialize(struct inode *inode, int type);
254 254
255static inline unsigned int 255static inline unsigned int
256hashfn(const struct super_block *sb, unsigned int id, int type) 256hashfn(const struct super_block *sb, struct kqid qid)
257{ 257{
258 unsigned int id = from_kqid(&init_user_ns, qid);
259 int type = qid.type;
258 unsigned long tmp; 260 unsigned long tmp;
259 261
260 tmp = (((unsigned long)sb>>L1_CACHE_SHIFT) ^ id) * (MAXQUOTAS - type); 262 tmp = (((unsigned long)sb>>L1_CACHE_SHIFT) ^ id) * (MAXQUOTAS - type);
@@ -267,7 +269,7 @@ hashfn(const struct super_block *sb, unsigned int id, int type)
267static inline void insert_dquot_hash(struct dquot *dquot) 269static inline void insert_dquot_hash(struct dquot *dquot)
268{ 270{
269 struct hlist_head *head; 271 struct hlist_head *head;
270 head = dquot_hash + hashfn(dquot->dq_sb, dquot->dq_id, dquot->dq_type); 272 head = dquot_hash + hashfn(dquot->dq_sb, dquot->dq_id);
271 hlist_add_head(&dquot->dq_hash, head); 273 hlist_add_head(&dquot->dq_hash, head);
272} 274}
273 275
@@ -277,15 +279,14 @@ static inline void remove_dquot_hash(struct dquot *dquot)
277} 279}
278 280
279static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb, 281static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb,
280 unsigned int id, int type) 282 struct kqid qid)
281{ 283{
282 struct hlist_node *node; 284 struct hlist_node *node;
283 struct dquot *dquot; 285 struct dquot *dquot;
284 286
285 hlist_for_each (node, dquot_hash+hashent) { 287 hlist_for_each (node, dquot_hash+hashent) {
286 dquot = hlist_entry(node, struct dquot, dq_hash); 288 dquot = hlist_entry(node, struct dquot, dq_hash);
287 if (dquot->dq_sb == sb && dquot->dq_id == id && 289 if (dquot->dq_sb == sb && qid_eq(dquot->dq_id, qid))
288 dquot->dq_type == type)
289 return dquot; 290 return dquot;
290 } 291 }
291 return NULL; 292 return NULL;
@@ -351,7 +352,7 @@ int dquot_mark_dquot_dirty(struct dquot *dquot)
351 spin_lock(&dq_list_lock); 352 spin_lock(&dq_list_lock);
352 if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) { 353 if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) {
353 list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)-> 354 list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)->
354 info[dquot->dq_type].dqi_dirty_list); 355 info[dquot->dq_id.type].dqi_dirty_list);
355 ret = 0; 356 ret = 0;
356 } 357 }
357 spin_unlock(&dq_list_lock); 358 spin_unlock(&dq_list_lock);
@@ -410,17 +411,17 @@ int dquot_acquire(struct dquot *dquot)
410 mutex_lock(&dquot->dq_lock); 411 mutex_lock(&dquot->dq_lock);
411 mutex_lock(&dqopt->dqio_mutex); 412 mutex_lock(&dqopt->dqio_mutex);
412 if (!test_bit(DQ_READ_B, &dquot->dq_flags)) 413 if (!test_bit(DQ_READ_B, &dquot->dq_flags))
413 ret = dqopt->ops[dquot->dq_type]->read_dqblk(dquot); 414 ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot);
414 if (ret < 0) 415 if (ret < 0)
415 goto out_iolock; 416 goto out_iolock;
416 set_bit(DQ_READ_B, &dquot->dq_flags); 417 set_bit(DQ_READ_B, &dquot->dq_flags);
417 /* Instantiate dquot if needed */ 418 /* Instantiate dquot if needed */
418 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && !dquot->dq_off) { 419 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && !dquot->dq_off) {
419 ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot); 420 ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot);
420 /* Write the info if needed */ 421 /* Write the info if needed */
421 if (info_dirty(&dqopt->info[dquot->dq_type])) { 422 if (info_dirty(&dqopt->info[dquot->dq_id.type])) {
422 ret2 = dqopt->ops[dquot->dq_type]->write_file_info( 423 ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info(
423 dquot->dq_sb, dquot->dq_type); 424 dquot->dq_sb, dquot->dq_id.type);
424 } 425 }
425 if (ret < 0) 426 if (ret < 0)
426 goto out_iolock; 427 goto out_iolock;
@@ -455,7 +456,7 @@ int dquot_commit(struct dquot *dquot)
455 /* Inactive dquot can be only if there was error during read/init 456 /* Inactive dquot can be only if there was error during read/init
456 * => we have better not writing it */ 457 * => we have better not writing it */
457 if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) 458 if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
458 ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot); 459 ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot);
459 else 460 else
460 ret = -EIO; 461 ret = -EIO;
461out_sem: 462out_sem:
@@ -477,12 +478,12 @@ int dquot_release(struct dquot *dquot)
477 if (atomic_read(&dquot->dq_count) > 1) 478 if (atomic_read(&dquot->dq_count) > 1)
478 goto out_dqlock; 479 goto out_dqlock;
479 mutex_lock(&dqopt->dqio_mutex); 480 mutex_lock(&dqopt->dqio_mutex);
480 if (dqopt->ops[dquot->dq_type]->release_dqblk) { 481 if (dqopt->ops[dquot->dq_id.type]->release_dqblk) {
481 ret = dqopt->ops[dquot->dq_type]->release_dqblk(dquot); 482 ret = dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot);
482 /* Write the info */ 483 /* Write the info */
483 if (info_dirty(&dqopt->info[dquot->dq_type])) { 484 if (info_dirty(&dqopt->info[dquot->dq_id.type])) {
484 ret2 = dqopt->ops[dquot->dq_type]->write_file_info( 485 ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info(
485 dquot->dq_sb, dquot->dq_type); 486 dquot->dq_sb, dquot->dq_id.type);
486 } 487 }
487 if (ret >= 0) 488 if (ret >= 0)
488 ret = ret2; 489 ret = ret2;
@@ -521,7 +522,7 @@ restart:
521 list_for_each_entry_safe(dquot, tmp, &inuse_list, dq_inuse) { 522 list_for_each_entry_safe(dquot, tmp, &inuse_list, dq_inuse) {
522 if (dquot->dq_sb != sb) 523 if (dquot->dq_sb != sb)
523 continue; 524 continue;
524 if (dquot->dq_type != type) 525 if (dquot->dq_id.type != type)
525 continue; 526 continue;
526 /* Wait for dquot users */ 527 /* Wait for dquot users */
527 if (atomic_read(&dquot->dq_count)) { 528 if (atomic_read(&dquot->dq_count)) {
@@ -741,7 +742,8 @@ void dqput(struct dquot *dquot)
741#ifdef CONFIG_QUOTA_DEBUG 742#ifdef CONFIG_QUOTA_DEBUG
742 if (!atomic_read(&dquot->dq_count)) { 743 if (!atomic_read(&dquot->dq_count)) {
743 quota_error(dquot->dq_sb, "trying to free free dquot of %s %d", 744 quota_error(dquot->dq_sb, "trying to free free dquot of %s %d",
744 quotatypes[dquot->dq_type], dquot->dq_id); 745 quotatypes[dquot->dq_id.type],
746 from_kqid(&init_user_ns, dquot->dq_id));
745 BUG(); 747 BUG();
746 } 748 }
747#endif 749#endif
@@ -752,7 +754,7 @@ we_slept:
752 /* We have more than one user... nothing to do */ 754 /* We have more than one user... nothing to do */
753 atomic_dec(&dquot->dq_count); 755 atomic_dec(&dquot->dq_count);
754 /* Releasing dquot during quotaoff phase? */ 756 /* Releasing dquot during quotaoff phase? */
755 if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_type) && 757 if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_id.type) &&
756 atomic_read(&dquot->dq_count) == 1) 758 atomic_read(&dquot->dq_count) == 1)
757 wake_up(&dquot->dq_wait_unused); 759 wake_up(&dquot->dq_wait_unused);
758 spin_unlock(&dq_list_lock); 760 spin_unlock(&dq_list_lock);
@@ -815,7 +817,7 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
815 INIT_LIST_HEAD(&dquot->dq_dirty); 817 INIT_LIST_HEAD(&dquot->dq_dirty);
816 init_waitqueue_head(&dquot->dq_wait_unused); 818 init_waitqueue_head(&dquot->dq_wait_unused);
817 dquot->dq_sb = sb; 819 dquot->dq_sb = sb;
818 dquot->dq_type = type; 820 dquot->dq_id = make_kqid_invalid(type);
819 atomic_set(&dquot->dq_count, 1); 821 atomic_set(&dquot->dq_count, 1);
820 822
821 return dquot; 823 return dquot;
@@ -829,35 +831,35 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
829 * a) checking for quota flags under dq_list_lock and 831 * a) checking for quota flags under dq_list_lock and
830 * b) getting a reference to dquot before we release dq_list_lock 832 * b) getting a reference to dquot before we release dq_list_lock
831 */ 833 */
832struct dquot *dqget(struct super_block *sb, unsigned int id, int type) 834struct dquot *dqget(struct super_block *sb, struct kqid qid)
833{ 835{
834 unsigned int hashent = hashfn(sb, id, type); 836 unsigned int hashent = hashfn(sb, qid);
835 struct dquot *dquot = NULL, *empty = NULL; 837 struct dquot *dquot = NULL, *empty = NULL;
836 838
837 if (!sb_has_quota_active(sb, type)) 839 if (!sb_has_quota_active(sb, qid.type))
838 return NULL; 840 return NULL;
839we_slept: 841we_slept:
840 spin_lock(&dq_list_lock); 842 spin_lock(&dq_list_lock);
841 spin_lock(&dq_state_lock); 843 spin_lock(&dq_state_lock);
842 if (!sb_has_quota_active(sb, type)) { 844 if (!sb_has_quota_active(sb, qid.type)) {
843 spin_unlock(&dq_state_lock); 845 spin_unlock(&dq_state_lock);
844 spin_unlock(&dq_list_lock); 846 spin_unlock(&dq_list_lock);
845 goto out; 847 goto out;
846 } 848 }
847 spin_unlock(&dq_state_lock); 849 spin_unlock(&dq_state_lock);
848 850
849 dquot = find_dquot(hashent, sb, id, type); 851 dquot = find_dquot(hashent, sb, qid);
850 if (!dquot) { 852 if (!dquot) {
851 if (!empty) { 853 if (!empty) {
852 spin_unlock(&dq_list_lock); 854 spin_unlock(&dq_list_lock);
853 empty = get_empty_dquot(sb, type); 855 empty = get_empty_dquot(sb, qid.type);
854 if (!empty) 856 if (!empty)
855 schedule(); /* Try to wait for a moment... */ 857 schedule(); /* Try to wait for a moment... */
856 goto we_slept; 858 goto we_slept;
857 } 859 }
858 dquot = empty; 860 dquot = empty;
859 empty = NULL; 861 empty = NULL;
860 dquot->dq_id = id; 862 dquot->dq_id = qid;
861 /* all dquots go on the inuse_list */ 863 /* all dquots go on the inuse_list */
862 put_inuse(dquot); 864 put_inuse(dquot);
863 /* hash it first so it can be found */ 865 /* hash it first so it can be found */
@@ -1129,8 +1131,7 @@ static void dquot_decr_space(struct dquot *dquot, qsize_t number)
1129 1131
1130struct dquot_warn { 1132struct dquot_warn {
1131 struct super_block *w_sb; 1133 struct super_block *w_sb;
1132 qid_t w_dq_id; 1134 struct kqid w_dq_id;
1133 short w_dq_type;
1134 short w_type; 1135 short w_type;
1135}; 1136};
1136 1137
@@ -1154,11 +1155,13 @@ static int need_print_warning(struct dquot_warn *warn)
1154 if (!flag_print_warnings) 1155 if (!flag_print_warnings)
1155 return 0; 1156 return 0;
1156 1157
1157 switch (warn->w_dq_type) { 1158 switch (warn->w_dq_id.type) {
1158 case USRQUOTA: 1159 case USRQUOTA:
1159 return current_fsuid() == warn->w_dq_id; 1160 return uid_eq(current_fsuid(), warn->w_dq_id.uid);
1160 case GRPQUOTA: 1161 case GRPQUOTA:
1161 return in_group_p(warn->w_dq_id); 1162 return in_group_p(warn->w_dq_id.gid);
1163 case PRJQUOTA: /* Never taken... Just make gcc happy */
1164 return 0;
1162 } 1165 }
1163 return 0; 1166 return 0;
1164} 1167}
@@ -1184,7 +1187,7 @@ static void print_warning(struct dquot_warn *warn)
1184 tty_write_message(tty, ": warning, "); 1187 tty_write_message(tty, ": warning, ");
1185 else 1188 else
1186 tty_write_message(tty, ": write failed, "); 1189 tty_write_message(tty, ": write failed, ");
1187 tty_write_message(tty, quotatypes[warn->w_dq_type]); 1190 tty_write_message(tty, quotatypes[warn->w_dq_id.type]);
1188 switch (warntype) { 1191 switch (warntype) {
1189 case QUOTA_NL_IHARDWARN: 1192 case QUOTA_NL_IHARDWARN:
1190 msg = " file limit reached.\r\n"; 1193 msg = " file limit reached.\r\n";
@@ -1218,7 +1221,6 @@ static void prepare_warning(struct dquot_warn *warn, struct dquot *dquot,
1218 warn->w_type = warntype; 1221 warn->w_type = warntype;
1219 warn->w_sb = dquot->dq_sb; 1222 warn->w_sb = dquot->dq_sb;
1220 warn->w_dq_id = dquot->dq_id; 1223 warn->w_dq_id = dquot->dq_id;
1221 warn->w_dq_type = dquot->dq_type;
1222} 1224}
1223 1225
1224/* 1226/*
@@ -1236,14 +1238,14 @@ static void flush_warnings(struct dquot_warn *warn)
1236#ifdef CONFIG_PRINT_QUOTA_WARNING 1238#ifdef CONFIG_PRINT_QUOTA_WARNING
1237 print_warning(&warn[i]); 1239 print_warning(&warn[i]);
1238#endif 1240#endif
1239 quota_send_warning(warn[i].w_dq_type, warn[i].w_dq_id, 1241 quota_send_warning(warn[i].w_dq_id,
1240 warn[i].w_sb->s_dev, warn[i].w_type); 1242 warn[i].w_sb->s_dev, warn[i].w_type);
1241 } 1243 }
1242} 1244}
1243 1245
1244static int ignore_hardlimit(struct dquot *dquot) 1246static int ignore_hardlimit(struct dquot *dquot)
1245{ 1247{
1246 struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; 1248 struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
1247 1249
1248 return capable(CAP_SYS_RESOURCE) && 1250 return capable(CAP_SYS_RESOURCE) &&
1249 (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || 1251 (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
@@ -1256,7 +1258,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes,
1256{ 1258{
1257 qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes; 1259 qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes;
1258 1260
1259 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || 1261 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type) ||
1260 test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1262 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1261 return 0; 1263 return 0;
1262 1264
@@ -1281,7 +1283,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes,
1281 dquot->dq_dqb.dqb_itime == 0) { 1283 dquot->dq_dqb.dqb_itime == 0) {
1282 prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN); 1284 prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN);
1283 dquot->dq_dqb.dqb_itime = get_seconds() + 1285 dquot->dq_dqb.dqb_itime = get_seconds() +
1284 sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; 1286 sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type].dqi_igrace;
1285 } 1287 }
1286 1288
1287 return 0; 1289 return 0;
@@ -1294,7 +1296,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc,
1294 qsize_t tspace; 1296 qsize_t tspace;
1295 struct super_block *sb = dquot->dq_sb; 1297 struct super_block *sb = dquot->dq_sb;
1296 1298
1297 if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) || 1299 if (!sb_has_quota_limits_enabled(sb, dquot->dq_id.type) ||
1298 test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1300 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1299 return 0; 1301 return 0;
1300 1302
@@ -1325,7 +1327,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc,
1325 if (!prealloc) { 1327 if (!prealloc) {
1326 prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN); 1328 prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN);
1327 dquot->dq_dqb.dqb_btime = get_seconds() + 1329 dquot->dq_dqb.dqb_btime = get_seconds() +
1328 sb_dqopt(sb)->info[dquot->dq_type].dqi_bgrace; 1330 sb_dqopt(sb)->info[dquot->dq_id.type].dqi_bgrace;
1329 } 1331 }
1330 else 1332 else
1331 /* 1333 /*
@@ -1344,7 +1346,7 @@ static int info_idq_free(struct dquot *dquot, qsize_t inodes)
1344 1346
1345 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || 1347 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
1346 dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit || 1348 dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit ||
1347 !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type)) 1349 !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type))
1348 return QUOTA_NL_NOWARN; 1350 return QUOTA_NL_NOWARN;
1349 1351
1350 newinodes = dquot->dq_dqb.dqb_curinodes - inodes; 1352 newinodes = dquot->dq_dqb.dqb_curinodes - inodes;
@@ -1390,7 +1392,6 @@ static int dquot_active(const struct inode *inode)
1390 */ 1392 */
1391static void __dquot_initialize(struct inode *inode, int type) 1393static void __dquot_initialize(struct inode *inode, int type)
1392{ 1394{
1393 unsigned int id = 0;
1394 int cnt; 1395 int cnt;
1395 struct dquot *got[MAXQUOTAS]; 1396 struct dquot *got[MAXQUOTAS];
1396 struct super_block *sb = inode->i_sb; 1397 struct super_block *sb = inode->i_sb;
@@ -1403,18 +1404,19 @@ static void __dquot_initialize(struct inode *inode, int type)
1403 1404
1404 /* First get references to structures we might need. */ 1405 /* First get references to structures we might need. */
1405 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1406 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1407 struct kqid qid;
1406 got[cnt] = NULL; 1408 got[cnt] = NULL;
1407 if (type != -1 && cnt != type) 1409 if (type != -1 && cnt != type)
1408 continue; 1410 continue;
1409 switch (cnt) { 1411 switch (cnt) {
1410 case USRQUOTA: 1412 case USRQUOTA:
1411 id = inode->i_uid; 1413 qid = make_kqid_uid(inode->i_uid);
1412 break; 1414 break;
1413 case GRPQUOTA: 1415 case GRPQUOTA:
1414 id = inode->i_gid; 1416 qid = make_kqid_gid(inode->i_gid);
1415 break; 1417 break;
1416 } 1418 }
1417 got[cnt] = dqget(sb, id, cnt); 1419 got[cnt] = dqget(sb, qid);
1418 } 1420 }
1419 1421
1420 down_write(&sb_dqopt(sb)->dqptr_sem); 1422 down_write(&sb_dqopt(sb)->dqptr_sem);
@@ -1897,10 +1899,10 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1897 if (!dquot_active(inode)) 1899 if (!dquot_active(inode))
1898 return 0; 1900 return 0;
1899 1901
1900 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) 1902 if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid))
1901 transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA); 1903 transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(iattr->ia_uid));
1902 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) 1904 if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))
1903 transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_gid, GRPQUOTA); 1905 transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(iattr->ia_gid));
1904 1906
1905 ret = __dquot_transfer(inode, transfer_to); 1907 ret = __dquot_transfer(inode, transfer_to);
1906 dqput_all(transfer_to); 1908 dqput_all(transfer_to);
@@ -2360,9 +2362,9 @@ static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2360 2362
2361 memset(di, 0, sizeof(*di)); 2363 memset(di, 0, sizeof(*di));
2362 di->d_version = FS_DQUOT_VERSION; 2364 di->d_version = FS_DQUOT_VERSION;
2363 di->d_flags = dquot->dq_type == USRQUOTA ? 2365 di->d_flags = dquot->dq_id.type == USRQUOTA ?
2364 FS_USER_QUOTA : FS_GROUP_QUOTA; 2366 FS_USER_QUOTA : FS_GROUP_QUOTA;
2365 di->d_id = dquot->dq_id; 2367 di->d_id = from_kqid_munged(current_user_ns(), dquot->dq_id);
2366 2368
2367 spin_lock(&dq_data_lock); 2369 spin_lock(&dq_data_lock);
2368 di->d_blk_hardlimit = stoqb(dm->dqb_bhardlimit); 2370 di->d_blk_hardlimit = stoqb(dm->dqb_bhardlimit);
@@ -2376,12 +2378,12 @@ static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2376 spin_unlock(&dq_data_lock); 2378 spin_unlock(&dq_data_lock);
2377} 2379}
2378 2380
2379int dquot_get_dqblk(struct super_block *sb, int type, qid_t id, 2381int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
2380 struct fs_disk_quota *di) 2382 struct fs_disk_quota *di)
2381{ 2383{
2382 struct dquot *dquot; 2384 struct dquot *dquot;
2383 2385
2384 dquot = dqget(sb, id, type); 2386 dquot = dqget(sb, qid);
2385 if (!dquot) 2387 if (!dquot)
2386 return -ESRCH; 2388 return -ESRCH;
2387 do_get_dqblk(dquot, di); 2389 do_get_dqblk(dquot, di);
@@ -2401,7 +2403,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2401{ 2403{
2402 struct mem_dqblk *dm = &dquot->dq_dqb; 2404 struct mem_dqblk *dm = &dquot->dq_dqb;
2403 int check_blim = 0, check_ilim = 0; 2405 int check_blim = 0, check_ilim = 0;
2404 struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; 2406 struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
2405 2407
2406 if (di->d_fieldmask & ~VFS_FS_DQ_MASK) 2408 if (di->d_fieldmask & ~VFS_FS_DQ_MASK)
2407 return -EINVAL; 2409 return -EINVAL;
@@ -2488,13 +2490,13 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2488 return 0; 2490 return 0;
2489} 2491}
2490 2492
2491int dquot_set_dqblk(struct super_block *sb, int type, qid_t id, 2493int dquot_set_dqblk(struct super_block *sb, struct kqid qid,
2492 struct fs_disk_quota *di) 2494 struct fs_disk_quota *di)
2493{ 2495{
2494 struct dquot *dquot; 2496 struct dquot *dquot;
2495 int rc; 2497 int rc;
2496 2498
2497 dquot = dqget(sb, id, type); 2499 dquot = dqget(sb, qid);
2498 if (!dquot) { 2500 if (!dquot) {
2499 rc = -ESRCH; 2501 rc = -ESRCH;
2500 goto out; 2502 goto out;
diff --git a/fs/quota/kqid.c b/fs/quota/kqid.c
new file mode 100644
index 000000000000..2f97b0e2c501
--- /dev/null
+++ b/fs/quota/kqid.c
@@ -0,0 +1,132 @@
1#include <linux/fs.h>
2#include <linux/quota.h>
3#include <linux/export.h>
4
5/**
6 * qid_eq - Test to see if to kquid values are the same
7 * @left: A qid value
8 * @right: Another quid value
9 *
10 * Return true if the two qid values are equal and false otherwise.
11 */
12bool qid_eq(struct kqid left, struct kqid right)
13{
14 if (left.type != right.type)
15 return false;
16 switch(left.type) {
17 case USRQUOTA:
18 return uid_eq(left.uid, right.uid);
19 case GRPQUOTA:
20 return gid_eq(left.gid, right.gid);
21 case PRJQUOTA:
22 return projid_eq(left.projid, right.projid);
23 default:
24 BUG();
25 }
26}
27EXPORT_SYMBOL(qid_eq);
28
29/**
30 * qid_lt - Test to see if one qid value is less than another
31 * @left: The possibly lesser qid value
32 * @right: The possibly greater qid value
33 *
34 * Return true if left is less than right and false otherwise.
35 */
36bool qid_lt(struct kqid left, struct kqid right)
37{
38 if (left.type < right.type)
39 return true;
40 if (left.type > right.type)
41 return false;
42 switch (left.type) {
43 case USRQUOTA:
44 return uid_lt(left.uid, right.uid);
45 case GRPQUOTA:
46 return gid_lt(left.gid, right.gid);
47 case PRJQUOTA:
48 return projid_lt(left.projid, right.projid);
49 default:
50 BUG();
51 }
52}
53EXPORT_SYMBOL(qid_lt);
54
55/**
56 * from_kqid - Create a qid from a kqid user-namespace pair.
57 * @targ: The user namespace we want a qid in.
58 * @kuid: The kernel internal quota identifier to start with.
59 *
60 * Map @kqid into the user-namespace specified by @targ and
61 * return the resulting qid.
62 *
63 * There is always a mapping into the initial user_namespace.
64 *
65 * If @kqid has no mapping in @targ (qid_t)-1 is returned.
66 */
67qid_t from_kqid(struct user_namespace *targ, struct kqid kqid)
68{
69 switch (kqid.type) {
70 case USRQUOTA:
71 return from_kuid(targ, kqid.uid);
72 case GRPQUOTA:
73 return from_kgid(targ, kqid.gid);
74 case PRJQUOTA:
75 return from_kprojid(targ, kqid.projid);
76 default:
77 BUG();
78 }
79}
80EXPORT_SYMBOL(from_kqid);
81
82/**
83 * from_kqid_munged - Create a qid from a kqid user-namespace pair.
84 * @targ: The user namespace we want a qid in.
85 * @kqid: The kernel internal quota identifier to start with.
86 *
87 * Map @kqid into the user-namespace specified by @targ and
88 * return the resulting qid.
89 *
90 * There is always a mapping into the initial user_namespace.
91 *
92 * Unlike from_kqid from_kqid_munged never fails and always
93 * returns a valid projid. This makes from_kqid_munged
94 * appropriate for use in places where failing to provide
95 * a qid_t is not a good option.
96 *
97 * If @kqid has no mapping in @targ the kqid.type specific
98 * overflow identifier is returned.
99 */
100qid_t from_kqid_munged(struct user_namespace *targ, struct kqid kqid)
101{
102 switch (kqid.type) {
103 case USRQUOTA:
104 return from_kuid_munged(targ, kqid.uid);
105 case GRPQUOTA:
106 return from_kgid_munged(targ, kqid.gid);
107 case PRJQUOTA:
108 return from_kprojid_munged(targ, kqid.projid);
109 default:
110 BUG();
111 }
112}
113EXPORT_SYMBOL(from_kqid_munged);
114
115/**
116 * qid_valid - Report if a valid value is stored in a kqid.
117 * @qid: The kernel internal quota identifier to test.
118 */
119bool qid_valid(struct kqid qid)
120{
121 switch (qid.type) {
122 case USRQUOTA:
123 return uid_valid(qid.uid);
124 case GRPQUOTA:
125 return gid_valid(qid.gid);
126 case PRJQUOTA:
127 return projid_valid(qid.projid);
128 default:
129 BUG();
130 }
131}
132EXPORT_SYMBOL(qid_valid);
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index d67908b407d9..16e8abb7709b 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -30,7 +30,7 @@ static struct genl_family quota_genl_family = {
30 * 30 *
31 */ 31 */
32 32
33void quota_send_warning(short type, unsigned int id, dev_t dev, 33void quota_send_warning(struct kqid qid, dev_t dev,
34 const char warntype) 34 const char warntype)
35{ 35{
36 static atomic_t seq; 36 static atomic_t seq;
@@ -56,10 +56,11 @@ void quota_send_warning(short type, unsigned int id, dev_t dev,
56 "VFS: Cannot store netlink header in quota warning.\n"); 56 "VFS: Cannot store netlink header in quota warning.\n");
57 goto err_out; 57 goto err_out;
58 } 58 }
59 ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type); 59 ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, qid.type);
60 if (ret) 60 if (ret)
61 goto attr_err_out; 61 goto attr_err_out;
62 ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id); 62 ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID,
63 from_kqid_munged(&init_user_ns, qid));
63 if (ret) 64 if (ret)
64 goto attr_err_out; 65 goto attr_err_out;
65 ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype); 66 ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
@@ -71,7 +72,8 @@ void quota_send_warning(short type, unsigned int id, dev_t dev,
71 ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev)); 72 ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev));
72 if (ret) 73 if (ret)
73 goto attr_err_out; 74 goto attr_err_out;
74 ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid()); 75 ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID,
76 from_kuid_munged(&init_user_ns, current_uid()));
75 if (ret) 77 if (ret)
76 goto attr_err_out; 78 goto attr_err_out;
77 genlmsg_end(skb, msg_head); 79 genlmsg_end(skb, msg_head);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 6f155788cbc6..af1661f7a54f 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -32,8 +32,8 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
32 /* allow to query information for dquots we "own" */ 32 /* allow to query information for dquots we "own" */
33 case Q_GETQUOTA: 33 case Q_GETQUOTA:
34 case Q_XGETQUOTA: 34 case Q_XGETQUOTA:
35 if ((type == USRQUOTA && current_euid() == id) || 35 if ((type == USRQUOTA && uid_eq(current_euid(), make_kuid(current_user_ns(), id))) ||
36 (type == GRPQUOTA && in_egroup_p(id))) 36 (type == GRPQUOTA && in_egroup_p(make_kgid(current_user_ns(), id))))
37 break; 37 break;
38 /*FALLTHROUGH*/ 38 /*FALLTHROUGH*/
39 default: 39 default:
@@ -130,13 +130,17 @@ static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src)
130static int quota_getquota(struct super_block *sb, int type, qid_t id, 130static int quota_getquota(struct super_block *sb, int type, qid_t id,
131 void __user *addr) 131 void __user *addr)
132{ 132{
133 struct kqid qid;
133 struct fs_disk_quota fdq; 134 struct fs_disk_quota fdq;
134 struct if_dqblk idq; 135 struct if_dqblk idq;
135 int ret; 136 int ret;
136 137
137 if (!sb->s_qcop->get_dqblk) 138 if (!sb->s_qcop->get_dqblk)
138 return -ENOSYS; 139 return -ENOSYS;
139 ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq); 140 qid = make_kqid(current_user_ns(), type, id);
141 if (!qid_valid(qid))
142 return -EINVAL;
143 ret = sb->s_qcop->get_dqblk(sb, qid, &fdq);
140 if (ret) 144 if (ret)
141 return ret; 145 return ret;
142 copy_to_if_dqblk(&idq, &fdq); 146 copy_to_if_dqblk(&idq, &fdq);
@@ -176,13 +180,17 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id,
176{ 180{
177 struct fs_disk_quota fdq; 181 struct fs_disk_quota fdq;
178 struct if_dqblk idq; 182 struct if_dqblk idq;
183 struct kqid qid;
179 184
180 if (copy_from_user(&idq, addr, sizeof(idq))) 185 if (copy_from_user(&idq, addr, sizeof(idq)))
181 return -EFAULT; 186 return -EFAULT;
182 if (!sb->s_qcop->set_dqblk) 187 if (!sb->s_qcop->set_dqblk)
183 return -ENOSYS; 188 return -ENOSYS;
189 qid = make_kqid(current_user_ns(), type, id);
190 if (!qid_valid(qid))
191 return -EINVAL;
184 copy_from_if_dqblk(&fdq, &idq); 192 copy_from_if_dqblk(&fdq, &idq);
185 return sb->s_qcop->set_dqblk(sb, type, id, &fdq); 193 return sb->s_qcop->set_dqblk(sb, qid, &fdq);
186} 194}
187 195
188static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) 196static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr)
@@ -213,23 +221,31 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id,
213 void __user *addr) 221 void __user *addr)
214{ 222{
215 struct fs_disk_quota fdq; 223 struct fs_disk_quota fdq;
224 struct kqid qid;
216 225
217 if (copy_from_user(&fdq, addr, sizeof(fdq))) 226 if (copy_from_user(&fdq, addr, sizeof(fdq)))
218 return -EFAULT; 227 return -EFAULT;
219 if (!sb->s_qcop->set_dqblk) 228 if (!sb->s_qcop->set_dqblk)
220 return -ENOSYS; 229 return -ENOSYS;
221 return sb->s_qcop->set_dqblk(sb, type, id, &fdq); 230 qid = make_kqid(current_user_ns(), type, id);
231 if (!qid_valid(qid))
232 return -EINVAL;
233 return sb->s_qcop->set_dqblk(sb, qid, &fdq);
222} 234}
223 235
224static int quota_getxquota(struct super_block *sb, int type, qid_t id, 236static int quota_getxquota(struct super_block *sb, int type, qid_t id,
225 void __user *addr) 237 void __user *addr)
226{ 238{
227 struct fs_disk_quota fdq; 239 struct fs_disk_quota fdq;
240 struct kqid qid;
228 int ret; 241 int ret;
229 242
230 if (!sb->s_qcop->get_dqblk) 243 if (!sb->s_qcop->get_dqblk)
231 return -ENOSYS; 244 return -ENOSYS;
232 ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq); 245 qid = make_kqid(current_user_ns(), type, id);
246 if (!qid_valid(qid))
247 return -EINVAL;
248 ret = sb->s_qcop->get_dqblk(sb, qid, &fdq);
233 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) 249 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq)))
234 return -EFAULT; 250 return -EFAULT;
235 return ret; 251 return ret;
@@ -315,11 +331,11 @@ static struct super_block *quotactl_block(const char __user *special, int cmd)
315#ifdef CONFIG_BLOCK 331#ifdef CONFIG_BLOCK
316 struct block_device *bdev; 332 struct block_device *bdev;
317 struct super_block *sb; 333 struct super_block *sb;
318 char *tmp = getname(special); 334 struct filename *tmp = getname(special);
319 335
320 if (IS_ERR(tmp)) 336 if (IS_ERR(tmp))
321 return ERR_CAST(tmp); 337 return ERR_CAST(tmp);
322 bdev = lookup_bdev(tmp); 338 bdev = lookup_bdev(tmp->name);
323 putname(tmp); 339 putname(tmp);
324 if (IS_ERR(bdev)) 340 if (IS_ERR(bdev))
325 return ERR_CAST(bdev); 341 return ERR_CAST(bdev);
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index e41c1becf096..d65877fbe8f4 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -22,9 +22,10 @@ MODULE_LICENSE("GPL");
22 22
23#define __QUOTA_QT_PARANOIA 23#define __QUOTA_QT_PARANOIA
24 24
25static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth) 25static int get_index(struct qtree_mem_dqinfo *info, struct kqid qid, int depth)
26{ 26{
27 unsigned int epb = info->dqi_usable_bs >> 2; 27 unsigned int epb = info->dqi_usable_bs >> 2;
28 qid_t id = from_kqid(&init_user_ns, qid);
28 29
29 depth = info->dqi_qtree_depth - depth - 1; 30 depth = info->dqi_qtree_depth - depth - 1;
30 while (depth--) 31 while (depth--)
@@ -244,7 +245,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
244 /* This is enough as the block is already zeroed and the entry 245 /* This is enough as the block is already zeroed and the entry
245 * list is empty... */ 246 * list is empty... */
246 info->dqi_free_entry = blk; 247 info->dqi_free_entry = blk;
247 mark_info_dirty(dquot->dq_sb, dquot->dq_type); 248 mark_info_dirty(dquot->dq_sb, dquot->dq_id.type);
248 } 249 }
249 /* Block will be full? */ 250 /* Block will be full? */
250 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { 251 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
@@ -357,7 +358,7 @@ static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
357 */ 358 */
358int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) 359int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
359{ 360{
360 int type = dquot->dq_type; 361 int type = dquot->dq_id.type;
361 struct super_block *sb = dquot->dq_sb; 362 struct super_block *sb = dquot->dq_sb;
362 ssize_t ret; 363 ssize_t ret;
363 char *ddquot = getdqbuf(info->dqi_entry_size); 364 char *ddquot = getdqbuf(info->dqi_entry_size);
@@ -538,8 +539,9 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
538 ddquot += info->dqi_entry_size; 539 ddquot += info->dqi_entry_size;
539 } 540 }
540 if (i == qtree_dqstr_in_blk(info)) { 541 if (i == qtree_dqstr_in_blk(info)) {
541 quota_error(dquot->dq_sb, "Quota for id %u referenced " 542 quota_error(dquot->dq_sb,
542 "but not present", dquot->dq_id); 543 "Quota for id %u referenced but not present",
544 from_kqid(&init_user_ns, dquot->dq_id));
543 ret = -EIO; 545 ret = -EIO;
544 goto out_buf; 546 goto out_buf;
545 } else { 547 } else {
@@ -589,7 +591,7 @@ static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info,
589 591
590int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) 592int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
591{ 593{
592 int type = dquot->dq_type; 594 int type = dquot->dq_id.type;
593 struct super_block *sb = dquot->dq_sb; 595 struct super_block *sb = dquot->dq_sb;
594 loff_t offset; 596 loff_t offset;
595 char *ddquot; 597 char *ddquot;
@@ -607,8 +609,10 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
607 offset = find_dqentry(info, dquot); 609 offset = find_dqentry(info, dquot);
608 if (offset <= 0) { /* Entry not present? */ 610 if (offset <= 0) { /* Entry not present? */
609 if (offset < 0) 611 if (offset < 0)
610 quota_error(sb, "Can't read quota structure " 612 quota_error(sb,"Can't read quota structure "
611 "for id %u", dquot->dq_id); 613 "for id %u",
614 from_kqid(&init_user_ns,
615 dquot->dq_id));
612 dquot->dq_off = 0; 616 dquot->dq_off = 0;
613 set_bit(DQ_FAKE_B, &dquot->dq_flags); 617 set_bit(DQ_FAKE_B, &dquot->dq_flags);
614 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); 618 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
@@ -626,7 +630,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
626 if (ret >= 0) 630 if (ret >= 0)
627 ret = -EIO; 631 ret = -EIO;
628 quota_error(sb, "Error while reading quota structure for id %u", 632 quota_error(sb, "Error while reading quota structure for id %u",
629 dquot->dq_id); 633 from_kqid(&init_user_ns, dquot->dq_id));
630 set_bit(DQ_FAKE_B, &dquot->dq_flags); 634 set_bit(DQ_FAKE_B, &dquot->dq_flags);
631 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); 635 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
632 kfree(ddquot); 636 kfree(ddquot);
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 34b37a67bb16..469c6848b322 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -54,7 +54,7 @@ static void v1_mem2disk_dqblk(struct v1_disk_dqblk *d, struct mem_dqblk *m)
54 54
55static int v1_read_dqblk(struct dquot *dquot) 55static int v1_read_dqblk(struct dquot *dquot)
56{ 56{
57 int type = dquot->dq_type; 57 int type = dquot->dq_id.type;
58 struct v1_disk_dqblk dqblk; 58 struct v1_disk_dqblk dqblk;
59 59
60 if (!sb_dqopt(dquot->dq_sb)->files[type]) 60 if (!sb_dqopt(dquot->dq_sb)->files[type])
@@ -63,7 +63,8 @@ static int v1_read_dqblk(struct dquot *dquot)
63 /* Set structure to 0s in case read fails/is after end of file */ 63 /* Set structure to 0s in case read fails/is after end of file */
64 memset(&dqblk, 0, sizeof(struct v1_disk_dqblk)); 64 memset(&dqblk, 0, sizeof(struct v1_disk_dqblk));
65 dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, (char *)&dqblk, 65 dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, (char *)&dqblk,
66 sizeof(struct v1_disk_dqblk), v1_dqoff(dquot->dq_id)); 66 sizeof(struct v1_disk_dqblk),
67 v1_dqoff(from_kqid(&init_user_ns, dquot->dq_id)));
67 68
68 v1_disk2mem_dqblk(&dquot->dq_dqb, &dqblk); 69 v1_disk2mem_dqblk(&dquot->dq_dqb, &dqblk);
69 if (dquot->dq_dqb.dqb_bhardlimit == 0 && 70 if (dquot->dq_dqb.dqb_bhardlimit == 0 &&
@@ -78,12 +79,13 @@ static int v1_read_dqblk(struct dquot *dquot)
78 79
79static int v1_commit_dqblk(struct dquot *dquot) 80static int v1_commit_dqblk(struct dquot *dquot)
80{ 81{
81 short type = dquot->dq_type; 82 short type = dquot->dq_id.type;
82 ssize_t ret; 83 ssize_t ret;
83 struct v1_disk_dqblk dqblk; 84 struct v1_disk_dqblk dqblk;
84 85
85 v1_mem2disk_dqblk(&dqblk, &dquot->dq_dqb); 86 v1_mem2disk_dqblk(&dqblk, &dquot->dq_dqb);
86 if (dquot->dq_id == 0) { 87 if (((type == USRQUOTA) && uid_eq(dquot->dq_id.uid, GLOBAL_ROOT_UID)) ||
88 ((type == GRPQUOTA) && gid_eq(dquot->dq_id.gid, GLOBAL_ROOT_GID))) {
87 dqblk.dqb_btime = 89 dqblk.dqb_btime =
88 sb_dqopt(dquot->dq_sb)->info[type].dqi_bgrace; 90 sb_dqopt(dquot->dq_sb)->info[type].dqi_bgrace;
89 dqblk.dqb_itime = 91 dqblk.dqb_itime =
@@ -93,7 +95,7 @@ static int v1_commit_dqblk(struct dquot *dquot)
93 if (sb_dqopt(dquot->dq_sb)->files[type]) 95 if (sb_dqopt(dquot->dq_sb)->files[type])
94 ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type, 96 ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type,
95 (char *)&dqblk, sizeof(struct v1_disk_dqblk), 97 (char *)&dqblk, sizeof(struct v1_disk_dqblk),
96 v1_dqoff(dquot->dq_id)); 98 v1_dqoff(from_kqid(&init_user_ns, dquot->dq_id)));
97 if (ret != sizeof(struct v1_disk_dqblk)) { 99 if (ret != sizeof(struct v1_disk_dqblk)) {
98 quota_error(dquot->dq_sb, "dquota write failed"); 100 quota_error(dquot->dq_sb, "dquota write failed");
99 if (ret >= 0) 101 if (ret >= 0)
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index f1ab3604db5a..02751ec695c5 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -196,7 +196,7 @@ static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot)
196 struct v2r0_disk_dqblk *d = dp; 196 struct v2r0_disk_dqblk *d = dp;
197 struct mem_dqblk *m = &dquot->dq_dqb; 197 struct mem_dqblk *m = &dquot->dq_dqb;
198 struct qtree_mem_dqinfo *info = 198 struct qtree_mem_dqinfo *info =
199 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; 199 sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv;
200 200
201 d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit); 201 d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit);
202 d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit); 202 d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit);
@@ -206,7 +206,7 @@ static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot)
206 d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit)); 206 d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit));
207 d->dqb_curspace = cpu_to_le64(m->dqb_curspace); 207 d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
208 d->dqb_btime = cpu_to_le64(m->dqb_btime); 208 d->dqb_btime = cpu_to_le64(m->dqb_btime);
209 d->dqb_id = cpu_to_le32(dquot->dq_id); 209 d->dqb_id = cpu_to_le32(from_kqid(&init_user_ns, dquot->dq_id));
210 if (qtree_entry_unused(info, dp)) 210 if (qtree_entry_unused(info, dp))
211 d->dqb_itime = cpu_to_le64(1); 211 d->dqb_itime = cpu_to_le64(1);
212} 212}
@@ -215,11 +215,13 @@ static int v2r0_is_id(void *dp, struct dquot *dquot)
215{ 215{
216 struct v2r0_disk_dqblk *d = dp; 216 struct v2r0_disk_dqblk *d = dp;
217 struct qtree_mem_dqinfo *info = 217 struct qtree_mem_dqinfo *info =
218 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; 218 sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv;
219 219
220 if (qtree_entry_unused(info, dp)) 220 if (qtree_entry_unused(info, dp))
221 return 0; 221 return 0;
222 return le32_to_cpu(d->dqb_id) == dquot->dq_id; 222 return qid_eq(make_kqid(&init_user_ns, dquot->dq_id.type,
223 le32_to_cpu(d->dqb_id)),
224 dquot->dq_id);
223} 225}
224 226
225static void v2r1_disk2memdqb(struct dquot *dquot, void *dp) 227static void v2r1_disk2memdqb(struct dquot *dquot, void *dp)
@@ -247,7 +249,7 @@ static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot)
247 struct v2r1_disk_dqblk *d = dp; 249 struct v2r1_disk_dqblk *d = dp;
248 struct mem_dqblk *m = &dquot->dq_dqb; 250 struct mem_dqblk *m = &dquot->dq_dqb;
249 struct qtree_mem_dqinfo *info = 251 struct qtree_mem_dqinfo *info =
250 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; 252 sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv;
251 253
252 d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit); 254 d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
253 d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit); 255 d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
@@ -257,7 +259,7 @@ static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot)
257 d->dqb_bsoftlimit = cpu_to_le64(v2_stoqb(m->dqb_bsoftlimit)); 259 d->dqb_bsoftlimit = cpu_to_le64(v2_stoqb(m->dqb_bsoftlimit));
258 d->dqb_curspace = cpu_to_le64(m->dqb_curspace); 260 d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
259 d->dqb_btime = cpu_to_le64(m->dqb_btime); 261 d->dqb_btime = cpu_to_le64(m->dqb_btime);
260 d->dqb_id = cpu_to_le32(dquot->dq_id); 262 d->dqb_id = cpu_to_le32(from_kqid(&init_user_ns, dquot->dq_id));
261 if (qtree_entry_unused(info, dp)) 263 if (qtree_entry_unused(info, dp))
262 d->dqb_itime = cpu_to_le64(1); 264 d->dqb_itime = cpu_to_le64(1);
263} 265}
@@ -266,26 +268,28 @@ static int v2r1_is_id(void *dp, struct dquot *dquot)
266{ 268{
267 struct v2r1_disk_dqblk *d = dp; 269 struct v2r1_disk_dqblk *d = dp;
268 struct qtree_mem_dqinfo *info = 270 struct qtree_mem_dqinfo *info =
269 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; 271 sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv;
270 272
271 if (qtree_entry_unused(info, dp)) 273 if (qtree_entry_unused(info, dp))
272 return 0; 274 return 0;
273 return le32_to_cpu(d->dqb_id) == dquot->dq_id; 275 return qid_eq(make_kqid(&init_user_ns, dquot->dq_id.type,
276 le32_to_cpu(d->dqb_id)),
277 dquot->dq_id);
274} 278}
275 279
276static int v2_read_dquot(struct dquot *dquot) 280static int v2_read_dquot(struct dquot *dquot)
277{ 281{
278 return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot); 282 return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot);
279} 283}
280 284
281static int v2_write_dquot(struct dquot *dquot) 285static int v2_write_dquot(struct dquot *dquot)
282{ 286{
283 return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot); 287 return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot);
284} 288}
285 289
286static int v2_release_dquot(struct dquot *dquot) 290static int v2_release_dquot(struct dquot *dquot)
287{ 291{
288 return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot); 292 return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot);
289} 293}
290 294
291static int v2_free_file_info(struct super_block *sb, int type) 295static int v2_free_file_info(struct super_block *sb, int type)
diff --git a/fs/read_write.c b/fs/read_write.c
index 1adfb691e4f1..d06534857e9e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -232,23 +232,18 @@ EXPORT_SYMBOL(vfs_llseek);
232SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin) 232SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin)
233{ 233{
234 off_t retval; 234 off_t retval;
235 struct file * file; 235 struct fd f = fdget(fd);
236 int fput_needed; 236 if (!f.file)
237 237 return -EBADF;
238 retval = -EBADF;
239 file = fget_light(fd, &fput_needed);
240 if (!file)
241 goto bad;
242 238
243 retval = -EINVAL; 239 retval = -EINVAL;
244 if (origin <= SEEK_MAX) { 240 if (origin <= SEEK_MAX) {
245 loff_t res = vfs_llseek(file, offset, origin); 241 loff_t res = vfs_llseek(f.file, offset, origin);
246 retval = res; 242 retval = res;
247 if (res != (loff_t)retval) 243 if (res != (loff_t)retval)
248 retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ 244 retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
249 } 245 }
250 fput_light(file, fput_needed); 246 fdput(f);
251bad:
252 return retval; 247 return retval;
253} 248}
254 249
@@ -258,20 +253,17 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
258 unsigned int, origin) 253 unsigned int, origin)
259{ 254{
260 int retval; 255 int retval;
261 struct file * file; 256 struct fd f = fdget(fd);
262 loff_t offset; 257 loff_t offset;
263 int fput_needed;
264 258
265 retval = -EBADF; 259 if (!f.file)
266 file = fget_light(fd, &fput_needed); 260 return -EBADF;
267 if (!file)
268 goto bad;
269 261
270 retval = -EINVAL; 262 retval = -EINVAL;
271 if (origin > SEEK_MAX) 263 if (origin > SEEK_MAX)
272 goto out_putf; 264 goto out_putf;
273 265
274 offset = vfs_llseek(file, ((loff_t) offset_high << 32) | offset_low, 266 offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
275 origin); 267 origin);
276 268
277 retval = (int)offset; 269 retval = (int)offset;
@@ -281,8 +273,7 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
281 retval = 0; 273 retval = 0;
282 } 274 }
283out_putf: 275out_putf:
284 fput_light(file, fput_needed); 276 fdput(f);
285bad:
286 return retval; 277 return retval;
287} 278}
288#endif 279#endif
@@ -461,34 +452,29 @@ static inline void file_pos_write(struct file *file, loff_t pos)
461 452
462SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) 453SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
463{ 454{
464 struct file *file; 455 struct fd f = fdget(fd);
465 ssize_t ret = -EBADF; 456 ssize_t ret = -EBADF;
466 int fput_needed;
467 457
468 file = fget_light(fd, &fput_needed); 458 if (f.file) {
469 if (file) { 459 loff_t pos = file_pos_read(f.file);
470 loff_t pos = file_pos_read(file); 460 ret = vfs_read(f.file, buf, count, &pos);
471 ret = vfs_read(file, buf, count, &pos); 461 file_pos_write(f.file, pos);
472 file_pos_write(file, pos); 462 fdput(f);
473 fput_light(file, fput_needed);
474 } 463 }
475
476 return ret; 464 return ret;
477} 465}
478 466
479SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, 467SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
480 size_t, count) 468 size_t, count)
481{ 469{
482 struct file *file; 470 struct fd f = fdget(fd);
483 ssize_t ret = -EBADF; 471 ssize_t ret = -EBADF;
484 int fput_needed;
485 472
486 file = fget_light(fd, &fput_needed); 473 if (f.file) {
487 if (file) { 474 loff_t pos = file_pos_read(f.file);
488 loff_t pos = file_pos_read(file); 475 ret = vfs_write(f.file, buf, count, &pos);
489 ret = vfs_write(file, buf, count, &pos); 476 file_pos_write(f.file, pos);
490 file_pos_write(file, pos); 477 fdput(f);
491 fput_light(file, fput_needed);
492 } 478 }
493 479
494 return ret; 480 return ret;
@@ -497,19 +483,18 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
497SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf, 483SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
498 size_t count, loff_t pos) 484 size_t count, loff_t pos)
499{ 485{
500 struct file *file; 486 struct fd f;
501 ssize_t ret = -EBADF; 487 ssize_t ret = -EBADF;
502 int fput_needed;
503 488
504 if (pos < 0) 489 if (pos < 0)
505 return -EINVAL; 490 return -EINVAL;
506 491
507 file = fget_light(fd, &fput_needed); 492 f = fdget(fd);
508 if (file) { 493 if (f.file) {
509 ret = -ESPIPE; 494 ret = -ESPIPE;
510 if (file->f_mode & FMODE_PREAD) 495 if (f.file->f_mode & FMODE_PREAD)
511 ret = vfs_read(file, buf, count, &pos); 496 ret = vfs_read(f.file, buf, count, &pos);
512 fput_light(file, fput_needed); 497 fdput(f);
513 } 498 }
514 499
515 return ret; 500 return ret;
@@ -526,19 +511,18 @@ SYSCALL_ALIAS(sys_pread64, SyS_pread64);
526SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf, 511SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
527 size_t count, loff_t pos) 512 size_t count, loff_t pos)
528{ 513{
529 struct file *file; 514 struct fd f;
530 ssize_t ret = -EBADF; 515 ssize_t ret = -EBADF;
531 int fput_needed;
532 516
533 if (pos < 0) 517 if (pos < 0)
534 return -EINVAL; 518 return -EINVAL;
535 519
536 file = fget_light(fd, &fput_needed); 520 f = fdget(fd);
537 if (file) { 521 if (f.file) {
538 ret = -ESPIPE; 522 ret = -ESPIPE;
539 if (file->f_mode & FMODE_PWRITE) 523 if (f.file->f_mode & FMODE_PWRITE)
540 ret = vfs_write(file, buf, count, &pos); 524 ret = vfs_write(f.file, buf, count, &pos);
541 fput_light(file, fput_needed); 525 fdput(f);
542 } 526 }
543 527
544 return ret; 528 return ret;
@@ -789,16 +773,14 @@ EXPORT_SYMBOL(vfs_writev);
789SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, 773SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
790 unsigned long, vlen) 774 unsigned long, vlen)
791{ 775{
792 struct file *file; 776 struct fd f = fdget(fd);
793 ssize_t ret = -EBADF; 777 ssize_t ret = -EBADF;
794 int fput_needed;
795 778
796 file = fget_light(fd, &fput_needed); 779 if (f.file) {
797 if (file) { 780 loff_t pos = file_pos_read(f.file);
798 loff_t pos = file_pos_read(file); 781 ret = vfs_readv(f.file, vec, vlen, &pos);
799 ret = vfs_readv(file, vec, vlen, &pos); 782 file_pos_write(f.file, pos);
800 file_pos_write(file, pos); 783 fdput(f);
801 fput_light(file, fput_needed);
802 } 784 }
803 785
804 if (ret > 0) 786 if (ret > 0)
@@ -810,16 +792,14 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
810SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, 792SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
811 unsigned long, vlen) 793 unsigned long, vlen)
812{ 794{
813 struct file *file; 795 struct fd f = fdget(fd);
814 ssize_t ret = -EBADF; 796 ssize_t ret = -EBADF;
815 int fput_needed;
816 797
817 file = fget_light(fd, &fput_needed); 798 if (f.file) {
818 if (file) { 799 loff_t pos = file_pos_read(f.file);
819 loff_t pos = file_pos_read(file); 800 ret = vfs_writev(f.file, vec, vlen, &pos);
820 ret = vfs_writev(file, vec, vlen, &pos); 801 file_pos_write(f.file, pos);
821 file_pos_write(file, pos); 802 fdput(f);
822 fput_light(file, fput_needed);
823 } 803 }
824 804
825 if (ret > 0) 805 if (ret > 0)
@@ -838,19 +818,18 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
838 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) 818 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
839{ 819{
840 loff_t pos = pos_from_hilo(pos_h, pos_l); 820 loff_t pos = pos_from_hilo(pos_h, pos_l);
841 struct file *file; 821 struct fd f;
842 ssize_t ret = -EBADF; 822 ssize_t ret = -EBADF;
843 int fput_needed;
844 823
845 if (pos < 0) 824 if (pos < 0)
846 return -EINVAL; 825 return -EINVAL;
847 826
848 file = fget_light(fd, &fput_needed); 827 f = fdget(fd);
849 if (file) { 828 if (f.file) {
850 ret = -ESPIPE; 829 ret = -ESPIPE;
851 if (file->f_mode & FMODE_PREAD) 830 if (f.file->f_mode & FMODE_PREAD)
852 ret = vfs_readv(file, vec, vlen, &pos); 831 ret = vfs_readv(f.file, vec, vlen, &pos);
853 fput_light(file, fput_needed); 832 fdput(f);
854 } 833 }
855 834
856 if (ret > 0) 835 if (ret > 0)
@@ -863,19 +842,18 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
863 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) 842 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
864{ 843{
865 loff_t pos = pos_from_hilo(pos_h, pos_l); 844 loff_t pos = pos_from_hilo(pos_h, pos_l);
866 struct file *file; 845 struct fd f;
867 ssize_t ret = -EBADF; 846 ssize_t ret = -EBADF;
868 int fput_needed;
869 847
870 if (pos < 0) 848 if (pos < 0)
871 return -EINVAL; 849 return -EINVAL;
872 850
873 file = fget_light(fd, &fput_needed); 851 f = fdget(fd);
874 if (file) { 852 if (f.file) {
875 ret = -ESPIPE; 853 ret = -ESPIPE;
876 if (file->f_mode & FMODE_PWRITE) 854 if (f.file->f_mode & FMODE_PWRITE)
877 ret = vfs_writev(file, vec, vlen, &pos); 855 ret = vfs_writev(f.file, vec, vlen, &pos);
878 fput_light(file, fput_needed); 856 fdput(f);
879 } 857 }
880 858
881 if (ret > 0) 859 if (ret > 0)
@@ -884,31 +862,31 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
884 return ret; 862 return ret;
885} 863}
886 864
887static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, 865ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
888 size_t count, loff_t max) 866 loff_t max)
889{ 867{
890 struct file * in_file, * out_file; 868 struct fd in, out;
891 struct inode * in_inode, * out_inode; 869 struct inode *in_inode, *out_inode;
892 loff_t pos; 870 loff_t pos;
893 ssize_t retval; 871 ssize_t retval;
894 int fput_needed_in, fput_needed_out, fl; 872 int fl;
895 873
896 /* 874 /*
897 * Get input file, and verify that it is ok.. 875 * Get input file, and verify that it is ok..
898 */ 876 */
899 retval = -EBADF; 877 retval = -EBADF;
900 in_file = fget_light(in_fd, &fput_needed_in); 878 in = fdget(in_fd);
901 if (!in_file) 879 if (!in.file)
902 goto out; 880 goto out;
903 if (!(in_file->f_mode & FMODE_READ)) 881 if (!(in.file->f_mode & FMODE_READ))
904 goto fput_in; 882 goto fput_in;
905 retval = -ESPIPE; 883 retval = -ESPIPE;
906 if (!ppos) 884 if (!ppos)
907 ppos = &in_file->f_pos; 885 ppos = &in.file->f_pos;
908 else 886 else
909 if (!(in_file->f_mode & FMODE_PREAD)) 887 if (!(in.file->f_mode & FMODE_PREAD))
910 goto fput_in; 888 goto fput_in;
911 retval = rw_verify_area(READ, in_file, ppos, count); 889 retval = rw_verify_area(READ, in.file, ppos, count);
912 if (retval < 0) 890 if (retval < 0)
913 goto fput_in; 891 goto fput_in;
914 count = retval; 892 count = retval;
@@ -917,15 +895,15 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
917 * Get output file, and verify that it is ok.. 895 * Get output file, and verify that it is ok..
918 */ 896 */
919 retval = -EBADF; 897 retval = -EBADF;
920 out_file = fget_light(out_fd, &fput_needed_out); 898 out = fdget(out_fd);
921 if (!out_file) 899 if (!out.file)
922 goto fput_in; 900 goto fput_in;
923 if (!(out_file->f_mode & FMODE_WRITE)) 901 if (!(out.file->f_mode & FMODE_WRITE))
924 goto fput_out; 902 goto fput_out;
925 retval = -EINVAL; 903 retval = -EINVAL;
926 in_inode = in_file->f_path.dentry->d_inode; 904 in_inode = in.file->f_path.dentry->d_inode;
927 out_inode = out_file->f_path.dentry->d_inode; 905 out_inode = out.file->f_path.dentry->d_inode;
928 retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); 906 retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count);
929 if (retval < 0) 907 if (retval < 0)
930 goto fput_out; 908 goto fput_out;
931 count = retval; 909 count = retval;
@@ -949,10 +927,10 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
949 * and the application is arguably buggy if it doesn't expect 927 * and the application is arguably buggy if it doesn't expect
950 * EAGAIN on a non-blocking file descriptor. 928 * EAGAIN on a non-blocking file descriptor.
951 */ 929 */
952 if (in_file->f_flags & O_NONBLOCK) 930 if (in.file->f_flags & O_NONBLOCK)
953 fl = SPLICE_F_NONBLOCK; 931 fl = SPLICE_F_NONBLOCK;
954#endif 932#endif
955 retval = do_splice_direct(in_file, ppos, out_file, count, fl); 933 retval = do_splice_direct(in.file, ppos, out.file, count, fl);
956 934
957 if (retval > 0) { 935 if (retval > 0) {
958 add_rchar(current, retval); 936 add_rchar(current, retval);
@@ -965,9 +943,9 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
965 retval = -EOVERFLOW; 943 retval = -EOVERFLOW;
966 944
967fput_out: 945fput_out:
968 fput_light(out_file, fput_needed_out); 946 fdput(out);
969fput_in: 947fput_in:
970 fput_light(in_file, fput_needed_in); 948 fdput(in);
971out: 949out:
972 return retval; 950 return retval;
973} 951}
diff --git a/fs/read_write.h b/fs/read_write.h
index d07b954c6e0c..d3e00ef67420 100644
--- a/fs/read_write.h
+++ b/fs/read_write.h
@@ -12,3 +12,5 @@ ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
12 unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn); 12 unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn);
13ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, 13ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
14 unsigned long nr_segs, loff_t *ppos, io_fn_t fn); 14 unsigned long nr_segs, loff_t *ppos, io_fn_t fn);
15ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
16 loff_t max);
diff --git a/fs/readdir.c b/fs/readdir.c
index 39e3370d79cf..5e69ef533b77 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -106,22 +106,20 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
106 struct old_linux_dirent __user *, dirent, unsigned int, count) 106 struct old_linux_dirent __user *, dirent, unsigned int, count)
107{ 107{
108 int error; 108 int error;
109 struct file * file; 109 struct fd f = fdget(fd);
110 struct readdir_callback buf; 110 struct readdir_callback buf;
111 int fput_needed;
112 111
113 file = fget_light(fd, &fput_needed); 112 if (!f.file)
114 if (!file)
115 return -EBADF; 113 return -EBADF;
116 114
117 buf.result = 0; 115 buf.result = 0;
118 buf.dirent = dirent; 116 buf.dirent = dirent;
119 117
120 error = vfs_readdir(file, fillonedir, &buf); 118 error = vfs_readdir(f.file, fillonedir, &buf);
121 if (buf.result) 119 if (buf.result)
122 error = buf.result; 120 error = buf.result;
123 121
124 fput_light(file, fput_needed); 122 fdput(f);
125 return error; 123 return error;
126} 124}
127 125
@@ -191,17 +189,16 @@ efault:
191SYSCALL_DEFINE3(getdents, unsigned int, fd, 189SYSCALL_DEFINE3(getdents, unsigned int, fd,
192 struct linux_dirent __user *, dirent, unsigned int, count) 190 struct linux_dirent __user *, dirent, unsigned int, count)
193{ 191{
194 struct file * file; 192 struct fd f;
195 struct linux_dirent __user * lastdirent; 193 struct linux_dirent __user * lastdirent;
196 struct getdents_callback buf; 194 struct getdents_callback buf;
197 int fput_needed;
198 int error; 195 int error;
199 196
200 if (!access_ok(VERIFY_WRITE, dirent, count)) 197 if (!access_ok(VERIFY_WRITE, dirent, count))
201 return -EFAULT; 198 return -EFAULT;
202 199
203 file = fget_light(fd, &fput_needed); 200 f = fdget(fd);
204 if (!file) 201 if (!f.file)
205 return -EBADF; 202 return -EBADF;
206 203
207 buf.current_dir = dirent; 204 buf.current_dir = dirent;
@@ -209,17 +206,17 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
209 buf.count = count; 206 buf.count = count;
210 buf.error = 0; 207 buf.error = 0;
211 208
212 error = vfs_readdir(file, filldir, &buf); 209 error = vfs_readdir(f.file, filldir, &buf);
213 if (error >= 0) 210 if (error >= 0)
214 error = buf.error; 211 error = buf.error;
215 lastdirent = buf.previous; 212 lastdirent = buf.previous;
216 if (lastdirent) { 213 if (lastdirent) {
217 if (put_user(file->f_pos, &lastdirent->d_off)) 214 if (put_user(f.file->f_pos, &lastdirent->d_off))
218 error = -EFAULT; 215 error = -EFAULT;
219 else 216 else
220 error = count - buf.count; 217 error = count - buf.count;
221 } 218 }
222 fput_light(file, fput_needed); 219 fdput(f);
223 return error; 220 return error;
224} 221}
225 222
@@ -272,17 +269,16 @@ efault:
272SYSCALL_DEFINE3(getdents64, unsigned int, fd, 269SYSCALL_DEFINE3(getdents64, unsigned int, fd,
273 struct linux_dirent64 __user *, dirent, unsigned int, count) 270 struct linux_dirent64 __user *, dirent, unsigned int, count)
274{ 271{
275 struct file * file; 272 struct fd f;
276 struct linux_dirent64 __user * lastdirent; 273 struct linux_dirent64 __user * lastdirent;
277 struct getdents_callback64 buf; 274 struct getdents_callback64 buf;
278 int fput_needed;
279 int error; 275 int error;
280 276
281 if (!access_ok(VERIFY_WRITE, dirent, count)) 277 if (!access_ok(VERIFY_WRITE, dirent, count))
282 return -EFAULT; 278 return -EFAULT;
283 279
284 file = fget_light(fd, &fput_needed); 280 f = fdget(fd);
285 if (!file) 281 if (!f.file)
286 return -EBADF; 282 return -EBADF;
287 283
288 buf.current_dir = dirent; 284 buf.current_dir = dirent;
@@ -290,17 +286,17 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
290 buf.count = count; 286 buf.count = count;
291 buf.error = 0; 287 buf.error = 0;
292 288
293 error = vfs_readdir(file, filldir64, &buf); 289 error = vfs_readdir(f.file, filldir64, &buf);
294 if (error >= 0) 290 if (error >= 0)
295 error = buf.error; 291 error = buf.error;
296 lastdirent = buf.previous; 292 lastdirent = buf.previous;
297 if (lastdirent) { 293 if (lastdirent) {
298 typeof(lastdirent->d_off) d_off = file->f_pos; 294 typeof(lastdirent->d_off) d_off = f.file->f_pos;
299 if (__put_user(d_off, &lastdirent->d_off)) 295 if (__put_user(d_off, &lastdirent->d_off))
300 error = -EFAULT; 296 error = -EFAULT;
301 else 297 else
302 error = count - buf.count; 298 error = count - buf.count;
303 } 299 }
304 fput_light(file, fput_needed); 300 fdput(f);
305 return error; 301 return error;
306} 302}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 855da58db145..f27f01a98aa2 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1155,8 +1155,8 @@ static void init_inode(struct inode *inode, struct treepath *path)
1155 set_inode_sd_version(inode, STAT_DATA_V1); 1155 set_inode_sd_version(inode, STAT_DATA_V1);
1156 inode->i_mode = sd_v1_mode(sd); 1156 inode->i_mode = sd_v1_mode(sd);
1157 set_nlink(inode, sd_v1_nlink(sd)); 1157 set_nlink(inode, sd_v1_nlink(sd));
1158 inode->i_uid = sd_v1_uid(sd); 1158 i_uid_write(inode, sd_v1_uid(sd));
1159 inode->i_gid = sd_v1_gid(sd); 1159 i_gid_write(inode, sd_v1_gid(sd));
1160 inode->i_size = sd_v1_size(sd); 1160 inode->i_size = sd_v1_size(sd);
1161 inode->i_atime.tv_sec = sd_v1_atime(sd); 1161 inode->i_atime.tv_sec = sd_v1_atime(sd);
1162 inode->i_mtime.tv_sec = sd_v1_mtime(sd); 1162 inode->i_mtime.tv_sec = sd_v1_mtime(sd);
@@ -1200,9 +1200,9 @@ static void init_inode(struct inode *inode, struct treepath *path)
1200 1200
1201 inode->i_mode = sd_v2_mode(sd); 1201 inode->i_mode = sd_v2_mode(sd);
1202 set_nlink(inode, sd_v2_nlink(sd)); 1202 set_nlink(inode, sd_v2_nlink(sd));
1203 inode->i_uid = sd_v2_uid(sd); 1203 i_uid_write(inode, sd_v2_uid(sd));
1204 inode->i_size = sd_v2_size(sd); 1204 inode->i_size = sd_v2_size(sd);
1205 inode->i_gid = sd_v2_gid(sd); 1205 i_gid_write(inode, sd_v2_gid(sd));
1206 inode->i_mtime.tv_sec = sd_v2_mtime(sd); 1206 inode->i_mtime.tv_sec = sd_v2_mtime(sd);
1207 inode->i_atime.tv_sec = sd_v2_atime(sd); 1207 inode->i_atime.tv_sec = sd_v2_atime(sd);
1208 inode->i_ctime.tv_sec = sd_v2_ctime(sd); 1208 inode->i_ctime.tv_sec = sd_v2_ctime(sd);
@@ -1258,9 +1258,9 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size)
1258 1258
1259 set_sd_v2_mode(sd_v2, inode->i_mode); 1259 set_sd_v2_mode(sd_v2, inode->i_mode);
1260 set_sd_v2_nlink(sd_v2, inode->i_nlink); 1260 set_sd_v2_nlink(sd_v2, inode->i_nlink);
1261 set_sd_v2_uid(sd_v2, inode->i_uid); 1261 set_sd_v2_uid(sd_v2, i_uid_read(inode));
1262 set_sd_v2_size(sd_v2, size); 1262 set_sd_v2_size(sd_v2, size);
1263 set_sd_v2_gid(sd_v2, inode->i_gid); 1263 set_sd_v2_gid(sd_v2, i_gid_read(inode));
1264 set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec); 1264 set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec);
1265 set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec); 1265 set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec);
1266 set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec); 1266 set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec);
@@ -1280,8 +1280,8 @@ static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
1280 struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd; 1280 struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd;
1281 1281
1282 set_sd_v1_mode(sd_v1, inode->i_mode); 1282 set_sd_v1_mode(sd_v1, inode->i_mode);
1283 set_sd_v1_uid(sd_v1, inode->i_uid); 1283 set_sd_v1_uid(sd_v1, i_uid_read(inode));
1284 set_sd_v1_gid(sd_v1, inode->i_gid); 1284 set_sd_v1_gid(sd_v1, i_gid_read(inode));
1285 set_sd_v1_nlink(sd_v1, inode->i_nlink); 1285 set_sd_v1_nlink(sd_v1, inode->i_nlink);
1286 set_sd_v1_size(sd_v1, size); 1286 set_sd_v1_size(sd_v1, size);
1287 set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec); 1287 set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec);
@@ -1573,8 +1573,10 @@ struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
1573 reiserfs_warning(sb, "reiserfs-13077", 1573 reiserfs_warning(sb, "reiserfs-13077",
1574 "nfsd/reiserfs, fhtype=%d, len=%d - odd", 1574 "nfsd/reiserfs, fhtype=%d, len=%d - odd",
1575 fh_type, fh_len); 1575 fh_type, fh_len);
1576 fh_type = 5; 1576 fh_type = fh_len;
1577 } 1577 }
1578 if (fh_len < 2)
1579 return NULL;
1578 1580
1579 return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1], 1581 return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1],
1580 (fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0); 1582 (fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0);
@@ -1583,6 +1585,8 @@ struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
1583struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid, 1585struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
1584 int fh_len, int fh_type) 1586 int fh_len, int fh_type)
1585{ 1587{
1588 if (fh_type > fh_len)
1589 fh_type = fh_len;
1586 if (fh_type < 4) 1590 if (fh_type < 4)
1587 return NULL; 1591 return NULL;
1588 1592
@@ -1869,7 +1873,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1869 goto out_bad_inode; 1873 goto out_bad_inode;
1870 } 1874 }
1871 if (old_format_only(sb)) { 1875 if (old_format_only(sb)) {
1872 if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) { 1876 if (i_uid_read(inode) & ~0xffff || i_gid_read(inode) & ~0xffff) {
1873 pathrelse(&path_to_key); 1877 pathrelse(&path_to_key);
1874 /* i_uid or i_gid is too big to be stored in stat data v3.5 */ 1878 /* i_uid or i_gid is too big to be stored in stat data v3.5 */
1875 err = -EINVAL; 1879 err = -EINVAL;
@@ -3140,16 +3144,16 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3140 } 3144 }
3141 } 3145 }
3142 3146
3143 if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) || 3147 if ((((attr->ia_valid & ATTR_UID) && (from_kuid(&init_user_ns, attr->ia_uid) & ~0xffff)) ||
3144 ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) && 3148 ((attr->ia_valid & ATTR_GID) && (from_kgid(&init_user_ns, attr->ia_gid) & ~0xffff))) &&
3145 (get_inode_sd_version(inode) == STAT_DATA_V1)) { 3149 (get_inode_sd_version(inode) == STAT_DATA_V1)) {
3146 /* stat data of format v3.5 has 16 bit uid and gid */ 3150 /* stat data of format v3.5 has 16 bit uid and gid */
3147 error = -EINVAL; 3151 error = -EINVAL;
3148 goto out; 3152 goto out;
3149 } 3153 }
3150 3154
3151 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 3155 if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
3152 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 3156 (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
3153 struct reiserfs_transaction_handle th; 3157 struct reiserfs_transaction_handle th;
3154 int jbegin_count = 3158 int jbegin_count =
3155 2 * 3159 2 *
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 7a37dabf5a96..1078ae179993 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -608,6 +608,11 @@ static int init_inodecache(void)
608 608
609static void destroy_inodecache(void) 609static void destroy_inodecache(void)
610{ 610{
611 /*
612 * Make sure all delayed rcu free inodes are flushed before we
613 * destroy cache.
614 */
615 rcu_barrier();
611 kmem_cache_destroy(reiserfs_inode_cachep); 616 kmem_cache_destroy(reiserfs_inode_cachep);
612} 617}
613 618
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index d319963aeb11..c196369fe408 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -896,7 +896,7 @@ static int create_privroot(struct dentry *dentry) { return 0; }
896#endif 896#endif
897 897
898/* Actual operations that are exported to VFS-land */ 898/* Actual operations that are exported to VFS-land */
899const struct xattr_handler *reiserfs_xattr_handlers[] = { 899static const struct xattr_handler *reiserfs_xattr_handlers[] = {
900#ifdef CONFIG_REISERFS_FS_XATTR 900#ifdef CONFIG_REISERFS_FS_XATTR
901 &reiserfs_xattr_user_handler, 901 &reiserfs_xattr_user_handler,
902 &reiserfs_xattr_trusted_handler, 902 &reiserfs_xattr_trusted_handler,
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 44474f9b990d..d7c01ef64eda 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -30,7 +30,7 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value,
30 return -EPERM; 30 return -EPERM;
31 31
32 if (value) { 32 if (value) {
33 acl = posix_acl_from_xattr(value, size); 33 acl = posix_acl_from_xattr(&init_user_ns, value, size);
34 if (IS_ERR(acl)) { 34 if (IS_ERR(acl)) {
35 return PTR_ERR(acl); 35 return PTR_ERR(acl);
36 } else if (acl) { 36 } else if (acl) {
@@ -77,7 +77,7 @@ posix_acl_get(struct dentry *dentry, const char *name, void *buffer,
77 return PTR_ERR(acl); 77 return PTR_ERR(acl);
78 if (acl == NULL) 78 if (acl == NULL)
79 return -ENODATA; 79 return -ENODATA;
80 error = posix_acl_to_xattr(acl, buffer, size); 80 error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
81 posix_acl_release(acl); 81 posix_acl_release(acl);
82 82
83 return error; 83 return error;
@@ -121,15 +121,23 @@ static struct posix_acl *posix_acl_from_disk(const void *value, size_t size)
121 case ACL_OTHER: 121 case ACL_OTHER:
122 value = (char *)value + 122 value = (char *)value +
123 sizeof(reiserfs_acl_entry_short); 123 sizeof(reiserfs_acl_entry_short);
124 acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
125 break; 124 break;
126 125
127 case ACL_USER: 126 case ACL_USER:
127 value = (char *)value + sizeof(reiserfs_acl_entry);
128 if ((char *)value > end)
129 goto fail;
130 acl->a_entries[n].e_uid =
131 make_kuid(&init_user_ns,
132 le32_to_cpu(entry->e_id));
133 break;
128 case ACL_GROUP: 134 case ACL_GROUP:
129 value = (char *)value + sizeof(reiserfs_acl_entry); 135 value = (char *)value + sizeof(reiserfs_acl_entry);
130 if ((char *)value > end) 136 if ((char *)value > end)
131 goto fail; 137 goto fail;
132 acl->a_entries[n].e_id = le32_to_cpu(entry->e_id); 138 acl->a_entries[n].e_gid =
139 make_kgid(&init_user_ns,
140 le32_to_cpu(entry->e_id));
133 break; 141 break;
134 142
135 default: 143 default:
@@ -164,13 +172,19 @@ static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size)
164 ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION); 172 ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION);
165 e = (char *)ext_acl + sizeof(reiserfs_acl_header); 173 e = (char *)ext_acl + sizeof(reiserfs_acl_header);
166 for (n = 0; n < acl->a_count; n++) { 174 for (n = 0; n < acl->a_count; n++) {
175 const struct posix_acl_entry *acl_e = &acl->a_entries[n];
167 reiserfs_acl_entry *entry = (reiserfs_acl_entry *) e; 176 reiserfs_acl_entry *entry = (reiserfs_acl_entry *) e;
168 entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); 177 entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
169 entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); 178 entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
170 switch (acl->a_entries[n].e_tag) { 179 switch (acl->a_entries[n].e_tag) {
171 case ACL_USER: 180 case ACL_USER:
181 entry->e_id = cpu_to_le32(
182 from_kuid(&init_user_ns, acl_e->e_uid));
183 e += sizeof(reiserfs_acl_entry);
184 break;
172 case ACL_GROUP: 185 case ACL_GROUP:
173 entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); 186 entry->e_id = cpu_to_le32(
187 from_kgid(&init_user_ns, acl_e->e_gid));
174 e += sizeof(reiserfs_acl_entry); 188 e += sizeof(reiserfs_acl_entry);
175 break; 189 break;
176 190
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 77c5f2173983..fd7c5f60b46b 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -648,6 +648,11 @@ error_register:
648static void __exit exit_romfs_fs(void) 648static void __exit exit_romfs_fs(void)
649{ 649{
650 unregister_filesystem(&romfs_fs_type); 650 unregister_filesystem(&romfs_fs_type);
651 /*
652 * Make sure all delayed rcu free inodes are flushed before we
653 * destroy cache.
654 */
655 rcu_barrier();
651 kmem_cache_destroy(romfs_inode_cachep); 656 kmem_cache_destroy(romfs_inode_cachep);
652} 657}
653 658
diff --git a/fs/select.c b/fs/select.c
index db14c781335e..2ef72d965036 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -220,8 +220,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
220 struct poll_table_entry *entry = poll_get_entry(pwq); 220 struct poll_table_entry *entry = poll_get_entry(pwq);
221 if (!entry) 221 if (!entry)
222 return; 222 return;
223 get_file(filp); 223 entry->filp = get_file(filp);
224 entry->filp = filp;
225 entry->wait_address = wait_address; 224 entry->wait_address = wait_address;
226 entry->key = p->_key; 225 entry->key = p->_key;
227 init_waitqueue_func_entry(&entry->wait, pollwake); 226 init_waitqueue_func_entry(&entry->wait, pollwake);
@@ -429,8 +428,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
429 for (i = 0; i < n; ++rinp, ++routp, ++rexp) { 428 for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
430 unsigned long in, out, ex, all_bits, bit = 1, mask, j; 429 unsigned long in, out, ex, all_bits, bit = 1, mask, j;
431 unsigned long res_in = 0, res_out = 0, res_ex = 0; 430 unsigned long res_in = 0, res_out = 0, res_ex = 0;
432 const struct file_operations *f_op = NULL;
433 struct file *file = NULL;
434 431
435 in = *inp++; out = *outp++; ex = *exp++; 432 in = *inp++; out = *outp++; ex = *exp++;
436 all_bits = in | out | ex; 433 all_bits = in | out | ex;
@@ -440,20 +437,21 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
440 } 437 }
441 438
442 for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) { 439 for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
443 int fput_needed; 440 struct fd f;
444 if (i >= n) 441 if (i >= n)
445 break; 442 break;
446 if (!(bit & all_bits)) 443 if (!(bit & all_bits))
447 continue; 444 continue;
448 file = fget_light(i, &fput_needed); 445 f = fdget(i);
449 if (file) { 446 if (f.file) {
450 f_op = file->f_op; 447 const struct file_operations *f_op;
448 f_op = f.file->f_op;
451 mask = DEFAULT_POLLMASK; 449 mask = DEFAULT_POLLMASK;
452 if (f_op && f_op->poll) { 450 if (f_op && f_op->poll) {
453 wait_key_set(wait, in, out, bit); 451 wait_key_set(wait, in, out, bit);
454 mask = (*f_op->poll)(file, wait); 452 mask = (*f_op->poll)(f.file, wait);
455 } 453 }
456 fput_light(file, fput_needed); 454 fdput(f);
457 if ((mask & POLLIN_SET) && (in & bit)) { 455 if ((mask & POLLIN_SET) && (in & bit)) {
458 res_in |= bit; 456 res_in |= bit;
459 retval++; 457 retval++;
@@ -726,20 +724,17 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
726 mask = 0; 724 mask = 0;
727 fd = pollfd->fd; 725 fd = pollfd->fd;
728 if (fd >= 0) { 726 if (fd >= 0) {
729 int fput_needed; 727 struct fd f = fdget(fd);
730 struct file * file;
731
732 file = fget_light(fd, &fput_needed);
733 mask = POLLNVAL; 728 mask = POLLNVAL;
734 if (file != NULL) { 729 if (f.file) {
735 mask = DEFAULT_POLLMASK; 730 mask = DEFAULT_POLLMASK;
736 if (file->f_op && file->f_op->poll) { 731 if (f.file->f_op && f.file->f_op->poll) {
737 pwait->_key = pollfd->events|POLLERR|POLLHUP; 732 pwait->_key = pollfd->events|POLLERR|POLLHUP;
738 mask = file->f_op->poll(file, pwait); 733 mask = f.file->f_op->poll(f.file, pwait);
739 } 734 }
740 /* Mask out unneeded events. */ 735 /* Mask out unneeded events. */
741 mask &= pollfd->events | POLLERR | POLLHUP; 736 mask &= pollfd->events | POLLERR | POLLHUP;
742 fput_light(file, fput_needed); 737 fdput(f);
743 } 738 }
744 } 739 }
745 pollfd->revents = mask; 740 pollfd->revents = mask;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 14cf9de1dbe1..99dffab4c4e4 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -9,6 +9,7 @@
9#include <linux/export.h> 9#include <linux/export.h>
10#include <linux/seq_file.h> 10#include <linux/seq_file.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/cred.h>
12 13
13#include <asm/uaccess.h> 14#include <asm/uaccess.h>
14#include <asm/page.h> 15#include <asm/page.h>
@@ -56,6 +57,9 @@ int seq_open(struct file *file, const struct seq_operations *op)
56 memset(p, 0, sizeof(*p)); 57 memset(p, 0, sizeof(*p));
57 mutex_init(&p->lock); 58 mutex_init(&p->lock);
58 p->op = op; 59 p->op = op;
60#ifdef CONFIG_USER_NS
61 p->user_ns = file->f_cred->user_ns;
62#endif
59 63
60 /* 64 /*
61 * Wrappers around seq_open(e.g. swaps_open) need to be 65 * Wrappers around seq_open(e.g. swaps_open) need to be
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 9f35a37173de..8bee4e570911 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -269,13 +269,12 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
269 if (ufd < 0) 269 if (ufd < 0)
270 kfree(ctx); 270 kfree(ctx);
271 } else { 271 } else {
272 int fput_needed; 272 struct fd f = fdget(ufd);
273 struct file *file = fget_light(ufd, &fput_needed); 273 if (!f.file)
274 if (!file)
275 return -EBADF; 274 return -EBADF;
276 ctx = file->private_data; 275 ctx = f.file->private_data;
277 if (file->f_op != &signalfd_fops) { 276 if (f.file->f_op != &signalfd_fops) {
278 fput_light(file, fput_needed); 277 fdput(f);
279 return -EINVAL; 278 return -EINVAL;
280 } 279 }
281 spin_lock_irq(&current->sighand->siglock); 280 spin_lock_irq(&current->sighand->siglock);
@@ -283,7 +282,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
283 spin_unlock_irq(&current->sighand->siglock); 282 spin_unlock_irq(&current->sighand->siglock);
284 283
285 wake_up(&current->sighand->signalfd_wqh); 284 wake_up(&current->sighand->signalfd_wqh);
286 fput_light(file, fput_needed); 285 fdput(f);
287 } 286 }
288 287
289 return ufd; 288 return ufd;
diff --git a/fs/splice.c b/fs/splice.c
index 41514dd89462..13e5b4776e7a 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1666,9 +1666,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1666SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, 1666SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
1667 unsigned long, nr_segs, unsigned int, flags) 1667 unsigned long, nr_segs, unsigned int, flags)
1668{ 1668{
1669 struct file *file; 1669 struct fd f;
1670 long error; 1670 long error;
1671 int fput;
1672 1671
1673 if (unlikely(nr_segs > UIO_MAXIOV)) 1672 if (unlikely(nr_segs > UIO_MAXIOV))
1674 return -EINVAL; 1673 return -EINVAL;
@@ -1676,14 +1675,14 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
1676 return 0; 1675 return 0;
1677 1676
1678 error = -EBADF; 1677 error = -EBADF;
1679 file = fget_light(fd, &fput); 1678 f = fdget(fd);
1680 if (file) { 1679 if (f.file) {
1681 if (file->f_mode & FMODE_WRITE) 1680 if (f.file->f_mode & FMODE_WRITE)
1682 error = vmsplice_to_pipe(file, iov, nr_segs, flags); 1681 error = vmsplice_to_pipe(f.file, iov, nr_segs, flags);
1683 else if (file->f_mode & FMODE_READ) 1682 else if (f.file->f_mode & FMODE_READ)
1684 error = vmsplice_to_user(file, iov, nr_segs, flags); 1683 error = vmsplice_to_user(f.file, iov, nr_segs, flags);
1685 1684
1686 fput_light(file, fput); 1685 fdput(f);
1687 } 1686 }
1688 1687
1689 return error; 1688 return error;
@@ -1693,30 +1692,27 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1693 int, fd_out, loff_t __user *, off_out, 1692 int, fd_out, loff_t __user *, off_out,
1694 size_t, len, unsigned int, flags) 1693 size_t, len, unsigned int, flags)
1695{ 1694{
1695 struct fd in, out;
1696 long error; 1696 long error;
1697 struct file *in, *out;
1698 int fput_in, fput_out;
1699 1697
1700 if (unlikely(!len)) 1698 if (unlikely(!len))
1701 return 0; 1699 return 0;
1702 1700
1703 error = -EBADF; 1701 error = -EBADF;
1704 in = fget_light(fd_in, &fput_in); 1702 in = fdget(fd_in);
1705 if (in) { 1703 if (in.file) {
1706 if (in->f_mode & FMODE_READ) { 1704 if (in.file->f_mode & FMODE_READ) {
1707 out = fget_light(fd_out, &fput_out); 1705 out = fdget(fd_out);
1708 if (out) { 1706 if (out.file) {
1709 if (out->f_mode & FMODE_WRITE) 1707 if (out.file->f_mode & FMODE_WRITE)
1710 error = do_splice(in, off_in, 1708 error = do_splice(in.file, off_in,
1711 out, off_out, 1709 out.file, off_out,
1712 len, flags); 1710 len, flags);
1713 fput_light(out, fput_out); 1711 fdput(out);
1714 } 1712 }
1715 } 1713 }
1716 1714 fdput(in);
1717 fput_light(in, fput_in);
1718 } 1715 }
1719
1720 return error; 1716 return error;
1721} 1717}
1722 1718
@@ -2027,26 +2023,25 @@ static long do_tee(struct file *in, struct file *out, size_t len,
2027 2023
2028SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) 2024SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
2029{ 2025{
2030 struct file *in; 2026 struct fd in;
2031 int error, fput_in; 2027 int error;
2032 2028
2033 if (unlikely(!len)) 2029 if (unlikely(!len))
2034 return 0; 2030 return 0;
2035 2031
2036 error = -EBADF; 2032 error = -EBADF;
2037 in = fget_light(fdin, &fput_in); 2033 in = fdget(fdin);
2038 if (in) { 2034 if (in.file) {
2039 if (in->f_mode & FMODE_READ) { 2035 if (in.file->f_mode & FMODE_READ) {
2040 int fput_out; 2036 struct fd out = fdget(fdout);
2041 struct file *out = fget_light(fdout, &fput_out); 2037 if (out.file) {
2042 2038 if (out.file->f_mode & FMODE_WRITE)
2043 if (out) { 2039 error = do_tee(in.file, out.file,
2044 if (out->f_mode & FMODE_WRITE) 2040 len, flags);
2045 error = do_tee(in, out, len, flags); 2041 fdput(out);
2046 fput_light(out, fput_out);
2047 } 2042 }
2048 } 2043 }
2049 fput_light(in, fput_in); 2044 fdput(in);
2050 } 2045 }
2051 2046
2052 return error; 2047 return error;
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 81afbccfa843..a1ce5ce60632 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -56,16 +56,20 @@
56static int squashfs_new_inode(struct super_block *sb, struct inode *inode, 56static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
57 struct squashfs_base_inode *sqsh_ino) 57 struct squashfs_base_inode *sqsh_ino)
58{ 58{
59 uid_t i_uid;
60 gid_t i_gid;
59 int err; 61 int err;
60 62
61 err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &inode->i_uid); 63 err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &i_uid);
62 if (err) 64 if (err)
63 return err; 65 return err;
64 66
65 err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid), &inode->i_gid); 67 err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid), &i_gid);
66 if (err) 68 if (err)
67 return err; 69 return err;
68 70
71 i_uid_write(inode, i_uid);
72 i_gid_write(inode, i_gid);
69 inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); 73 inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
70 inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime); 74 inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime);
71 inode->i_atime.tv_sec = inode->i_mtime.tv_sec; 75 inode->i_atime.tv_sec = inode->i_mtime.tv_sec;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 29cd014ed3a1..260e3928d4f5 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -425,6 +425,11 @@ static int __init init_inodecache(void)
425 425
426static void destroy_inodecache(void) 426static void destroy_inodecache(void)
427{ 427{
428 /*
429 * Make sure all delayed rcu free inodes are flushed before we
430 * destroy cache.
431 */
432 rcu_barrier();
428 kmem_cache_destroy(squashfs_inode_cachep); 433 kmem_cache_destroy(squashfs_inode_cachep);
429} 434}
430 435
diff --git a/fs/stat.c b/fs/stat.c
index b6ff11825fc8..eae494630a36 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -57,13 +57,13 @@ EXPORT_SYMBOL(vfs_getattr);
57 57
58int vfs_fstat(unsigned int fd, struct kstat *stat) 58int vfs_fstat(unsigned int fd, struct kstat *stat)
59{ 59{
60 int fput_needed; 60 struct fd f = fdget_raw(fd);
61 struct file *f = fget_light(fd, &fput_needed);
62 int error = -EBADF; 61 int error = -EBADF;
63 62
64 if (f) { 63 if (f.file) {
65 error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat); 64 error = vfs_getattr(f.file->f_path.mnt, f.file->f_path.dentry,
66 fput_light(f, fput_needed); 65 stat);
66 fdput(f);
67 } 67 }
68 return error; 68 return error;
69} 69}
@@ -326,7 +326,7 @@ SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf,
326 326
327 327
328/* ---------- LFS-64 ----------- */ 328/* ---------- LFS-64 ----------- */
329#ifdef __ARCH_WANT_STAT64 329#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64)
330 330
331#ifndef INIT_STRUCT_STAT64_PADDING 331#ifndef INIT_STRUCT_STAT64_PADDING
332# define INIT_STRUCT_STAT64_PADDING(st) memset(&st, 0, sizeof(st)) 332# define INIT_STRUCT_STAT64_PADDING(st) memset(&st, 0, sizeof(st))
@@ -415,7 +415,7 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
415 return error; 415 return error;
416 return cp_new_stat64(&stat, statbuf); 416 return cp_new_stat64(&stat, statbuf);
417} 417}
418#endif /* __ARCH_WANT_STAT64 */ 418#endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */
419 419
420/* Caller is here responsible for sufficient locking (ie. inode->i_lock) */ 420/* Caller is here responsible for sufficient locking (ie. inode->i_lock) */
421void __inode_add_bytes(struct inode *inode, loff_t bytes) 421void __inode_add_bytes(struct inode *inode, loff_t bytes)
diff --git a/fs/statfs.c b/fs/statfs.c
index 95ad5c0e586c..f8e832e6f0a2 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -87,12 +87,11 @@ int user_statfs(const char __user *pathname, struct kstatfs *st)
87 87
88int fd_statfs(int fd, struct kstatfs *st) 88int fd_statfs(int fd, struct kstatfs *st)
89{ 89{
90 int fput_needed; 90 struct fd f = fdget(fd);
91 struct file *file = fget_light(fd, &fput_needed);
92 int error = -EBADF; 91 int error = -EBADF;
93 if (file) { 92 if (f.file) {
94 error = vfs_statfs(&file->f_path, st); 93 error = vfs_statfs(&f.file->f_path, st);
95 fput_light(file, fput_needed); 94 fdput(f);
96 } 95 }
97 return error; 96 return error;
98} 97}
diff --git a/fs/super.c b/fs/super.c
index 0902cfa6a12e..12f123712161 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -186,15 +186,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
186 spin_lock_init(&s->s_inode_lru_lock); 186 spin_lock_init(&s->s_inode_lru_lock);
187 INIT_LIST_HEAD(&s->s_mounts); 187 INIT_LIST_HEAD(&s->s_mounts);
188 init_rwsem(&s->s_umount); 188 init_rwsem(&s->s_umount);
189 mutex_init(&s->s_lock);
190 lockdep_set_class(&s->s_umount, &type->s_umount_key); 189 lockdep_set_class(&s->s_umount, &type->s_umount_key);
191 /* 190 /*
192 * The locking rules for s_lock are up to the
193 * filesystem. For example ext3fs has different
194 * lock ordering than usbfs:
195 */
196 lockdep_set_class(&s->s_lock, &type->s_lock_key);
197 /*
198 * sget() can have s_umount recursion. 191 * sget() can have s_umount recursion.
199 * 192 *
200 * When it cannot find a suitable sb, it allocates a new 193 * When it cannot find a suitable sb, it allocates a new
@@ -307,12 +300,6 @@ void deactivate_locked_super(struct super_block *s)
307 300
308 /* caches are now gone, we can safely kill the shrinker now */ 301 /* caches are now gone, we can safely kill the shrinker now */
309 unregister_shrinker(&s->s_shrink); 302 unregister_shrinker(&s->s_shrink);
310
311 /*
312 * We need to call rcu_barrier so all the delayed rcu free
313 * inodes are flushed before we release the fs module.
314 */
315 rcu_barrier();
316 put_filesystem(fs); 303 put_filesystem(fs);
317 put_super(s); 304 put_super(s);
318 } else { 305 } else {
@@ -400,22 +387,6 @@ bool grab_super_passive(struct super_block *sb)
400 return false; 387 return false;
401} 388}
402 389
403/*
404 * Superblock locking. We really ought to get rid of these two.
405 */
406void lock_super(struct super_block * sb)
407{
408 mutex_lock(&sb->s_lock);
409}
410
411void unlock_super(struct super_block * sb)
412{
413 mutex_unlock(&sb->s_lock);
414}
415
416EXPORT_SYMBOL(lock_super);
417EXPORT_SYMBOL(unlock_super);
418
419/** 390/**
420 * generic_shutdown_super - common helper for ->kill_sb() 391 * generic_shutdown_super - common helper for ->kill_sb()
421 * @sb: superblock to kill 392 * @sb: superblock to kill
@@ -871,7 +842,7 @@ int get_anon_bdev(dev_t *p)
871 else if (error) 842 else if (error)
872 return -EAGAIN; 843 return -EAGAIN;
873 844
874 if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) { 845 if ((dev & MAX_IDR_MASK) == (1 << MINORBITS)) {
875 spin_lock(&unnamed_dev_lock); 846 spin_lock(&unnamed_dev_lock);
876 ida_remove(&unnamed_dev_ida, dev); 847 ida_remove(&unnamed_dev_ida, dev);
877 if (unnamed_dev_start > dev) 848 if (unnamed_dev_start > dev)
diff --git a/fs/sync.c b/fs/sync.c
index eb8722dc556f..14eefeb44636 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -148,21 +148,19 @@ void emergency_sync(void)
148 */ 148 */
149SYSCALL_DEFINE1(syncfs, int, fd) 149SYSCALL_DEFINE1(syncfs, int, fd)
150{ 150{
151 struct file *file; 151 struct fd f = fdget(fd);
152 struct super_block *sb; 152 struct super_block *sb;
153 int ret; 153 int ret;
154 int fput_needed;
155 154
156 file = fget_light(fd, &fput_needed); 155 if (!f.file)
157 if (!file)
158 return -EBADF; 156 return -EBADF;
159 sb = file->f_dentry->d_sb; 157 sb = f.file->f_dentry->d_sb;
160 158
161 down_read(&sb->s_umount); 159 down_read(&sb->s_umount);
162 ret = sync_filesystem(sb); 160 ret = sync_filesystem(sb);
163 up_read(&sb->s_umount); 161 up_read(&sb->s_umount);
164 162
165 fput_light(file, fput_needed); 163 fdput(f);
166 return ret; 164 return ret;
167} 165}
168 166
@@ -201,14 +199,12 @@ EXPORT_SYMBOL(vfs_fsync);
201 199
202static int do_fsync(unsigned int fd, int datasync) 200static int do_fsync(unsigned int fd, int datasync)
203{ 201{
204 struct file *file; 202 struct fd f = fdget(fd);
205 int ret = -EBADF; 203 int ret = -EBADF;
206 int fput_needed;
207 204
208 file = fget_light(fd, &fput_needed); 205 if (f.file) {
209 if (file) { 206 ret = vfs_fsync(f.file, datasync);
210 ret = vfs_fsync(file, datasync); 207 fdput(f);
211 fput_light(file, fput_needed);
212 } 208 }
213 return ret; 209 return ret;
214} 210}
@@ -291,10 +287,9 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
291 unsigned int flags) 287 unsigned int flags)
292{ 288{
293 int ret; 289 int ret;
294 struct file *file; 290 struct fd f;
295 struct address_space *mapping; 291 struct address_space *mapping;
296 loff_t endbyte; /* inclusive */ 292 loff_t endbyte; /* inclusive */
297 int fput_needed;
298 umode_t i_mode; 293 umode_t i_mode;
299 294
300 ret = -EINVAL; 295 ret = -EINVAL;
@@ -333,17 +328,17 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
333 endbyte--; /* inclusive */ 328 endbyte--; /* inclusive */
334 329
335 ret = -EBADF; 330 ret = -EBADF;
336 file = fget_light(fd, &fput_needed); 331 f = fdget(fd);
337 if (!file) 332 if (!f.file)
338 goto out; 333 goto out;
339 334
340 i_mode = file->f_path.dentry->d_inode->i_mode; 335 i_mode = f.file->f_path.dentry->d_inode->i_mode;
341 ret = -ESPIPE; 336 ret = -ESPIPE;
342 if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) && 337 if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
343 !S_ISLNK(i_mode)) 338 !S_ISLNK(i_mode))
344 goto out_put; 339 goto out_put;
345 340
346 mapping = file->f_mapping; 341 mapping = f.file->f_mapping;
347 if (!mapping) { 342 if (!mapping) {
348 ret = -EINVAL; 343 ret = -EINVAL;
349 goto out_put; 344 goto out_put;
@@ -366,7 +361,7 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
366 ret = filemap_fdatawait_range(mapping, offset, endbyte); 361 ret = filemap_fdatawait_range(mapping, offset, endbyte);
367 362
368out_put: 363out_put:
369 fput_light(file, fput_needed); 364 fdput(f);
370out: 365out:
371 return ret; 366 return ret;
372} 367}
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index a7ac78f8e67a..3c9eb5624f5e 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -113,7 +113,7 @@ int sysfs_create_link(struct kobject *kobj, struct kobject *target,
113 * @target: object we're pointing to. 113 * @target: object we're pointing to.
114 * @name: name of the symlink. 114 * @name: name of the symlink.
115 * 115 *
116 * This function does the same as sysf_create_link(), but it 116 * This function does the same as sysfs_create_link(), but it
117 * doesn't warn if the link already exists. 117 * doesn't warn if the link already exists.
118 */ 118 */
119int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, 119int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target,
diff --git a/fs/sysv/balloc.c b/fs/sysv/balloc.c
index 9a6ad96acf27..921c053fc052 100644
--- a/fs/sysv/balloc.c
+++ b/fs/sysv/balloc.c
@@ -60,12 +60,12 @@ void sysv_free_block(struct super_block * sb, sysv_zone_t nr)
60 return; 60 return;
61 } 61 }
62 62
63 lock_super(sb); 63 mutex_lock(&sbi->s_lock);
64 count = fs16_to_cpu(sbi, *sbi->s_bcache_count); 64 count = fs16_to_cpu(sbi, *sbi->s_bcache_count);
65 65
66 if (count > sbi->s_flc_size) { 66 if (count > sbi->s_flc_size) {
67 printk("sysv_free_block: flc_count > flc_size\n"); 67 printk("sysv_free_block: flc_count > flc_size\n");
68 unlock_super(sb); 68 mutex_unlock(&sbi->s_lock);
69 return; 69 return;
70 } 70 }
71 /* If the free list head in super-block is full, it is copied 71 /* If the free list head in super-block is full, it is copied
@@ -77,7 +77,7 @@ void sysv_free_block(struct super_block * sb, sysv_zone_t nr)
77 bh = sb_getblk(sb, block); 77 bh = sb_getblk(sb, block);
78 if (!bh) { 78 if (!bh) {
79 printk("sysv_free_block: getblk() failed\n"); 79 printk("sysv_free_block: getblk() failed\n");
80 unlock_super(sb); 80 mutex_unlock(&sbi->s_lock);
81 return; 81 return;
82 } 82 }
83 memset(bh->b_data, 0, sb->s_blocksize); 83 memset(bh->b_data, 0, sb->s_blocksize);
@@ -93,7 +93,7 @@ void sysv_free_block(struct super_block * sb, sysv_zone_t nr)
93 *sbi->s_bcache_count = cpu_to_fs16(sbi, count); 93 *sbi->s_bcache_count = cpu_to_fs16(sbi, count);
94 fs32_add(sbi, sbi->s_free_blocks, 1); 94 fs32_add(sbi, sbi->s_free_blocks, 1);
95 dirty_sb(sb); 95 dirty_sb(sb);
96 unlock_super(sb); 96 mutex_unlock(&sbi->s_lock);
97} 97}
98 98
99sysv_zone_t sysv_new_block(struct super_block * sb) 99sysv_zone_t sysv_new_block(struct super_block * sb)
@@ -104,7 +104,7 @@ sysv_zone_t sysv_new_block(struct super_block * sb)
104 struct buffer_head * bh; 104 struct buffer_head * bh;
105 unsigned count; 105 unsigned count;
106 106
107 lock_super(sb); 107 mutex_lock(&sbi->s_lock);
108 count = fs16_to_cpu(sbi, *sbi->s_bcache_count); 108 count = fs16_to_cpu(sbi, *sbi->s_bcache_count);
109 109
110 if (count == 0) /* Applies only to Coherent FS */ 110 if (count == 0) /* Applies only to Coherent FS */
@@ -147,11 +147,11 @@ sysv_zone_t sysv_new_block(struct super_block * sb)
147 /* Now the free list head in the superblock is valid again. */ 147 /* Now the free list head in the superblock is valid again. */
148 fs32_add(sbi, sbi->s_free_blocks, -1); 148 fs32_add(sbi, sbi->s_free_blocks, -1);
149 dirty_sb(sb); 149 dirty_sb(sb);
150 unlock_super(sb); 150 mutex_unlock(&sbi->s_lock);
151 return nr; 151 return nr;
152 152
153Enospc: 153Enospc:
154 unlock_super(sb); 154 mutex_unlock(&sbi->s_lock);
155 return 0; 155 return 0;
156} 156}
157 157
@@ -173,7 +173,7 @@ unsigned long sysv_count_free_blocks(struct super_block * sb)
173 if (sbi->s_type == FSTYPE_AFS) 173 if (sbi->s_type == FSTYPE_AFS)
174 return 0; 174 return 0;
175 175
176 lock_super(sb); 176 mutex_lock(&sbi->s_lock);
177 sb_count = fs32_to_cpu(sbi, *sbi->s_free_blocks); 177 sb_count = fs32_to_cpu(sbi, *sbi->s_free_blocks);
178 178
179 if (0) 179 if (0)
@@ -211,7 +211,7 @@ unsigned long sysv_count_free_blocks(struct super_block * sb)
211 if (count != sb_count) 211 if (count != sb_count)
212 goto Ecount; 212 goto Ecount;
213done: 213done:
214 unlock_super(sb); 214 mutex_unlock(&sbi->s_lock);
215 return count; 215 return count;
216 216
217Einval: 217Einval:
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 8233b02eccae..f9db4eb31db4 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -118,7 +118,7 @@ void sysv_free_inode(struct inode * inode)
118 "%s\n", inode->i_sb->s_id); 118 "%s\n", inode->i_sb->s_id);
119 return; 119 return;
120 } 120 }
121 lock_super(sb); 121 mutex_lock(&sbi->s_lock);
122 count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count); 122 count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count);
123 if (count < sbi->s_fic_size) { 123 if (count < sbi->s_fic_size) {
124 *sv_sb_fic_inode(sb,count++) = cpu_to_fs16(sbi, ino); 124 *sv_sb_fic_inode(sb,count++) = cpu_to_fs16(sbi, ino);
@@ -128,7 +128,7 @@ void sysv_free_inode(struct inode * inode)
128 dirty_sb(sb); 128 dirty_sb(sb);
129 memset(raw_inode, 0, sizeof(struct sysv_inode)); 129 memset(raw_inode, 0, sizeof(struct sysv_inode));
130 mark_buffer_dirty(bh); 130 mark_buffer_dirty(bh);
131 unlock_super(sb); 131 mutex_unlock(&sbi->s_lock);
132 brelse(bh); 132 brelse(bh);
133} 133}
134 134
@@ -147,13 +147,13 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode)
147 if (!inode) 147 if (!inode)
148 return ERR_PTR(-ENOMEM); 148 return ERR_PTR(-ENOMEM);
149 149
150 lock_super(sb); 150 mutex_lock(&sbi->s_lock);
151 count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count); 151 count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count);
152 if (count == 0 || (*sv_sb_fic_inode(sb,count-1) == 0)) { 152 if (count == 0 || (*sv_sb_fic_inode(sb,count-1) == 0)) {
153 count = refill_free_cache(sb); 153 count = refill_free_cache(sb);
154 if (count == 0) { 154 if (count == 0) {
155 iput(inode); 155 iput(inode);
156 unlock_super(sb); 156 mutex_unlock(&sbi->s_lock);
157 return ERR_PTR(-ENOSPC); 157 return ERR_PTR(-ENOSPC);
158 } 158 }
159 } 159 }
@@ -174,7 +174,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode)
174 sysv_write_inode(inode, &wbc); /* ensure inode not allocated again */ 174 sysv_write_inode(inode, &wbc); /* ensure inode not allocated again */
175 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ 175 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */
176 /* That's it. */ 176 /* That's it. */
177 unlock_super(sb); 177 mutex_unlock(&sbi->s_lock);
178 return inode; 178 return inode;
179} 179}
180 180
@@ -185,7 +185,7 @@ unsigned long sysv_count_free_inodes(struct super_block * sb)
185 struct sysv_inode * raw_inode; 185 struct sysv_inode * raw_inode;
186 int ino, count, sb_count; 186 int ino, count, sb_count;
187 187
188 lock_super(sb); 188 mutex_lock(&sbi->s_lock);
189 189
190 sb_count = fs16_to_cpu(sbi, *sbi->s_sb_total_free_inodes); 190 sb_count = fs16_to_cpu(sbi, *sbi->s_sb_total_free_inodes);
191 191
@@ -213,7 +213,7 @@ unsigned long sysv_count_free_inodes(struct super_block * sb)
213 if (count != sb_count) 213 if (count != sb_count)
214 goto Einval; 214 goto Einval;
215out: 215out:
216 unlock_super(sb); 216 mutex_unlock(&sbi->s_lock);
217 return count; 217 return count;
218 218
219Einval: 219Einval:
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 80e1e2b18df1..c327d4ee1235 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -36,7 +36,7 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
36 struct sysv_sb_info *sbi = SYSV_SB(sb); 36 struct sysv_sb_info *sbi = SYSV_SB(sb);
37 unsigned long time = get_seconds(), old_time; 37 unsigned long time = get_seconds(), old_time;
38 38
39 lock_super(sb); 39 mutex_lock(&sbi->s_lock);
40 40
41 /* 41 /*
42 * If we are going to write out the super block, 42 * If we are going to write out the super block,
@@ -51,7 +51,7 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
51 mark_buffer_dirty(sbi->s_bh2); 51 mark_buffer_dirty(sbi->s_bh2);
52 } 52 }
53 53
54 unlock_super(sb); 54 mutex_unlock(&sbi->s_lock);
55 55
56 return 0; 56 return 0;
57} 57}
@@ -202,8 +202,8 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino)
202 } 202 }
203 /* SystemV FS: kludge permissions if ino==SYSV_ROOT_INO ?? */ 203 /* SystemV FS: kludge permissions if ino==SYSV_ROOT_INO ?? */
204 inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode); 204 inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode);
205 inode->i_uid = (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid); 205 i_uid_write(inode, (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid));
206 inode->i_gid = (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid); 206 i_gid_write(inode, (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid));
207 set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink)); 207 set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink));
208 inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size); 208 inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size);
209 inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime); 209 inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime);
@@ -256,8 +256,8 @@ static int __sysv_write_inode(struct inode *inode, int wait)
256 } 256 }
257 257
258 raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode); 258 raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode);
259 raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(inode->i_uid)); 259 raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(i_uid_read(inode)));
260 raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(inode->i_gid)); 260 raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(i_gid_read(inode)));
261 raw_inode->i_nlink = cpu_to_fs16(sbi, inode->i_nlink); 261 raw_inode->i_nlink = cpu_to_fs16(sbi, inode->i_nlink);
262 raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size); 262 raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size);
263 raw_inode->i_atime = cpu_to_fs32(sbi, inode->i_atime.tv_sec); 263 raw_inode->i_atime = cpu_to_fs32(sbi, inode->i_atime.tv_sec);
@@ -360,5 +360,10 @@ int __init sysv_init_icache(void)
360 360
361void sysv_destroy_icache(void) 361void sysv_destroy_icache(void)
362{ 362{
363 /*
364 * Make sure all delayed rcu free inodes are flushed before we
365 * destroy cache.
366 */
367 rcu_barrier();
363 kmem_cache_destroy(sysv_inode_cachep); 368 kmem_cache_destroy(sysv_inode_cachep);
364} 369}
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 7491c33b6468..a38e87bdd78d 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -368,6 +368,7 @@ static int sysv_fill_super(struct super_block *sb, void *data, int silent)
368 368
369 sbi->s_sb = sb; 369 sbi->s_sb = sb;
370 sbi->s_block_base = 0; 370 sbi->s_block_base = 0;
371 mutex_init(&sbi->s_lock);
371 sb->s_fs_info = sbi; 372 sb->s_fs_info = sbi;
372 373
373 sb_set_blocksize(sb, BLOCK_SIZE); 374 sb_set_blocksize(sb, BLOCK_SIZE);
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 0bc35fdc58e2..69d488986cce 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -58,6 +58,7 @@ struct sysv_sb_info {
58 u32 s_nzones; /* same as s_sbd->s_fsize */ 58 u32 s_nzones; /* same as s_sbd->s_fsize */
59 u16 s_namelen; /* max length of dir entry */ 59 u16 s_namelen; /* max length of dir entry */
60 int s_forced_ro; 60 int s_forced_ro;
61 struct mutex s_lock;
61}; 62};
62 63
63/* 64/*
diff --git a/fs/timerfd.c b/fs/timerfd.c
index dffeb3795af1..d03822bbf190 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -234,19 +234,17 @@ static const struct file_operations timerfd_fops = {
234 .llseek = noop_llseek, 234 .llseek = noop_llseek,
235}; 235};
236 236
237static struct file *timerfd_fget(int fd) 237static int timerfd_fget(int fd, struct fd *p)
238{ 238{
239 struct file *file; 239 struct fd f = fdget(fd);
240 240 if (!f.file)
241 file = fget(fd); 241 return -EBADF;
242 if (!file) 242 if (f.file->f_op != &timerfd_fops) {
243 return ERR_PTR(-EBADF); 243 fdput(f);
244 if (file->f_op != &timerfd_fops) { 244 return -EINVAL;
245 fput(file);
246 return ERR_PTR(-EINVAL);
247 } 245 }
248 246 *p = f;
249 return file; 247 return 0;
250} 248}
251 249
252SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) 250SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
@@ -284,7 +282,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
284 const struct itimerspec __user *, utmr, 282 const struct itimerspec __user *, utmr,
285 struct itimerspec __user *, otmr) 283 struct itimerspec __user *, otmr)
286{ 284{
287 struct file *file; 285 struct fd f;
288 struct timerfd_ctx *ctx; 286 struct timerfd_ctx *ctx;
289 struct itimerspec ktmr, kotmr; 287 struct itimerspec ktmr, kotmr;
290 int ret; 288 int ret;
@@ -297,10 +295,10 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
297 !timespec_valid(&ktmr.it_interval)) 295 !timespec_valid(&ktmr.it_interval))
298 return -EINVAL; 296 return -EINVAL;
299 297
300 file = timerfd_fget(ufd); 298 ret = timerfd_fget(ufd, &f);
301 if (IS_ERR(file)) 299 if (ret)
302 return PTR_ERR(file); 300 return ret;
303 ctx = file->private_data; 301 ctx = f.file->private_data;
304 302
305 timerfd_setup_cancel(ctx, flags); 303 timerfd_setup_cancel(ctx, flags);
306 304
@@ -334,7 +332,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
334 ret = timerfd_setup(ctx, flags, &ktmr); 332 ret = timerfd_setup(ctx, flags, &ktmr);
335 333
336 spin_unlock_irq(&ctx->wqh.lock); 334 spin_unlock_irq(&ctx->wqh.lock);
337 fput(file); 335 fdput(f);
338 if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr))) 336 if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr)))
339 return -EFAULT; 337 return -EFAULT;
340 338
@@ -343,14 +341,13 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
343 341
344SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) 342SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
345{ 343{
346 struct file *file; 344 struct fd f;
347 struct timerfd_ctx *ctx; 345 struct timerfd_ctx *ctx;
348 struct itimerspec kotmr; 346 struct itimerspec kotmr;
349 347 int ret = timerfd_fget(ufd, &f);
350 file = timerfd_fget(ufd); 348 if (ret)
351 if (IS_ERR(file)) 349 return ret;
352 return PTR_ERR(file); 350 ctx = f.file->private_data;
353 ctx = file->private_data;
354 351
355 spin_lock_irq(&ctx->wqh.lock); 352 spin_lock_irq(&ctx->wqh.lock);
356 if (ctx->expired && ctx->tintv.tv64) { 353 if (ctx->expired && ctx->tintv.tv64) {
@@ -362,7 +359,7 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
362 kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); 359 kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
363 kotmr.it_interval = ktime_to_timespec(ctx->tintv); 360 kotmr.it_interval = ktime_to_timespec(ctx->tintv);
364 spin_unlock_irq(&ctx->wqh.lock); 361 spin_unlock_irq(&ctx->wqh.lock);
365 fput(file); 362 fdput(f);
366 363
367 return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0; 364 return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0;
368} 365}
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index bc4f94b28706..e8e01d74dc05 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -272,8 +272,8 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs)
272 */ 272 */
273static int can_use_rp(struct ubifs_info *c) 273static int can_use_rp(struct ubifs_info *c)
274{ 274{
275 if (current_fsuid() == c->rp_uid || capable(CAP_SYS_RESOURCE) || 275 if (uid_eq(current_fsuid(), c->rp_uid) || capable(CAP_SYS_RESOURCE) ||
276 (c->rp_gid != 0 && in_group_p(c->rp_gid))) 276 (!gid_eq(c->rp_gid, GLOBAL_ROOT_GID) && in_group_p(c->rp_gid)))
277 return 1; 277 return 1;
278 return 0; 278 return 0;
279} 279}
@@ -342,9 +342,8 @@ static int do_budget_space(struct ubifs_info *c)
342 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - 342 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
343 c->lst.taken_empty_lebs; 343 c->lst.taken_empty_lebs;
344 if (unlikely(rsvd_idx_lebs > lebs)) { 344 if (unlikely(rsvd_idx_lebs > lebs)) {
345 dbg_budg("out of indexing space: min_idx_lebs %d (old %d), " 345 dbg_budg("out of indexing space: min_idx_lebs %d (old %d), rsvd_idx_lebs %d",
346 "rsvd_idx_lebs %d", min_idx_lebs, c->bi.min_idx_lebs, 346 min_idx_lebs, c->bi.min_idx_lebs, rsvd_idx_lebs);
347 rsvd_idx_lebs);
348 return -ENOSPC; 347 return -ENOSPC;
349 } 348 }
350 349
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 8eda717cb99b..ff8229340cd5 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -293,8 +293,8 @@ int ubifs_bg_thread(void *info)
293 int err; 293 int err;
294 struct ubifs_info *c = info; 294 struct ubifs_info *c = info;
295 295
296 dbg_msg("background thread \"%s\" started, PID %d", 296 ubifs_msg("background thread \"%s\" started, PID %d",
297 c->bgt_name, current->pid); 297 c->bgt_name, current->pid);
298 set_freezable(); 298 set_freezable();
299 299
300 while (1) { 300 while (1) {
@@ -328,7 +328,7 @@ int ubifs_bg_thread(void *info)
328 cond_resched(); 328 cond_resched();
329 } 329 }
330 330
331 dbg_msg("background thread \"%s\" stops", c->bgt_name); 331 ubifs_msg("background thread \"%s\" stops", c->bgt_name);
332 return 0; 332 return 0;
333} 333}
334 334
@@ -514,7 +514,7 @@ struct idx_node {
514 struct list_head list; 514 struct list_head list;
515 int iip; 515 int iip;
516 union ubifs_key upper_key; 516 union ubifs_key upper_key;
517 struct ubifs_idx_node idx __attribute__((aligned(8))); 517 struct ubifs_idx_node idx __aligned(8);
518}; 518};
519 519
520/** 520/**
diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
index 11e4132f314a..2bfa0953335d 100644
--- a/fs/ubifs/compress.c
+++ b/fs/ubifs/compress.c
@@ -112,8 +112,7 @@ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
112 if (compr->comp_mutex) 112 if (compr->comp_mutex)
113 mutex_unlock(compr->comp_mutex); 113 mutex_unlock(compr->comp_mutex);
114 if (unlikely(err)) { 114 if (unlikely(err)) {
115 ubifs_warn("cannot compress %d bytes, compressor %s, " 115 ubifs_warn("cannot compress %d bytes, compressor %s, error %d, leave data uncompressed",
116 "error %d, leave data uncompressed",
117 in_len, compr->name, err); 116 in_len, compr->name, err);
118 goto no_compr; 117 goto no_compr;
119 } 118 }
@@ -176,8 +175,8 @@ int ubifs_decompress(const void *in_buf, int in_len, void *out_buf,
176 if (compr->decomp_mutex) 175 if (compr->decomp_mutex)
177 mutex_unlock(compr->decomp_mutex); 176 mutex_unlock(compr->decomp_mutex);
178 if (err) 177 if (err)
179 ubifs_err("cannot decompress %d bytes, compressor %s, " 178 ubifs_err("cannot decompress %d bytes, compressor %s, error %d",
180 "error %d", in_len, compr->name, err); 179 in_len, compr->name, err);
181 180
182 return err; 181 return err;
183} 182}
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index bb3167257aab..62911637e12f 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -219,15 +219,15 @@ const char *dbg_jhead(int jhead)
219 219
220static void dump_ch(const struct ubifs_ch *ch) 220static void dump_ch(const struct ubifs_ch *ch)
221{ 221{
222 printk(KERN_ERR "\tmagic %#x\n", le32_to_cpu(ch->magic)); 222 pr_err("\tmagic %#x\n", le32_to_cpu(ch->magic));
223 printk(KERN_ERR "\tcrc %#x\n", le32_to_cpu(ch->crc)); 223 pr_err("\tcrc %#x\n", le32_to_cpu(ch->crc));
224 printk(KERN_ERR "\tnode_type %d (%s)\n", ch->node_type, 224 pr_err("\tnode_type %d (%s)\n", ch->node_type,
225 dbg_ntype(ch->node_type)); 225 dbg_ntype(ch->node_type));
226 printk(KERN_ERR "\tgroup_type %d (%s)\n", ch->group_type, 226 pr_err("\tgroup_type %d (%s)\n", ch->group_type,
227 dbg_gtype(ch->group_type)); 227 dbg_gtype(ch->group_type));
228 printk(KERN_ERR "\tsqnum %llu\n", 228 pr_err("\tsqnum %llu\n",
229 (unsigned long long)le64_to_cpu(ch->sqnum)); 229 (unsigned long long)le64_to_cpu(ch->sqnum));
230 printk(KERN_ERR "\tlen %u\n", le32_to_cpu(ch->len)); 230 pr_err("\tlen %u\n", le32_to_cpu(ch->len));
231} 231}
232 232
233void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) 233void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
@@ -238,43 +238,43 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
238 struct ubifs_dent_node *dent, *pdent = NULL; 238 struct ubifs_dent_node *dent, *pdent = NULL;
239 int count = 2; 239 int count = 2;
240 240
241 printk(KERN_ERR "Dump in-memory inode:"); 241 pr_err("Dump in-memory inode:");
242 printk(KERN_ERR "\tinode %lu\n", inode->i_ino); 242 pr_err("\tinode %lu\n", inode->i_ino);
243 printk(KERN_ERR "\tsize %llu\n", 243 pr_err("\tsize %llu\n",
244 (unsigned long long)i_size_read(inode)); 244 (unsigned long long)i_size_read(inode));
245 printk(KERN_ERR "\tnlink %u\n", inode->i_nlink); 245 pr_err("\tnlink %u\n", inode->i_nlink);
246 printk(KERN_ERR "\tuid %u\n", (unsigned int)inode->i_uid); 246 pr_err("\tuid %u\n", (unsigned int)i_uid_read(inode));
247 printk(KERN_ERR "\tgid %u\n", (unsigned int)inode->i_gid); 247 pr_err("\tgid %u\n", (unsigned int)i_gid_read(inode));
248 printk(KERN_ERR "\tatime %u.%u\n", 248 pr_err("\tatime %u.%u\n",
249 (unsigned int)inode->i_atime.tv_sec, 249 (unsigned int)inode->i_atime.tv_sec,
250 (unsigned int)inode->i_atime.tv_nsec); 250 (unsigned int)inode->i_atime.tv_nsec);
251 printk(KERN_ERR "\tmtime %u.%u\n", 251 pr_err("\tmtime %u.%u\n",
252 (unsigned int)inode->i_mtime.tv_sec, 252 (unsigned int)inode->i_mtime.tv_sec,
253 (unsigned int)inode->i_mtime.tv_nsec); 253 (unsigned int)inode->i_mtime.tv_nsec);
254 printk(KERN_ERR "\tctime %u.%u\n", 254 pr_err("\tctime %u.%u\n",
255 (unsigned int)inode->i_ctime.tv_sec, 255 (unsigned int)inode->i_ctime.tv_sec,
256 (unsigned int)inode->i_ctime.tv_nsec); 256 (unsigned int)inode->i_ctime.tv_nsec);
257 printk(KERN_ERR "\tcreat_sqnum %llu\n", ui->creat_sqnum); 257 pr_err("\tcreat_sqnum %llu\n", ui->creat_sqnum);
258 printk(KERN_ERR "\txattr_size %u\n", ui->xattr_size); 258 pr_err("\txattr_size %u\n", ui->xattr_size);
259 printk(KERN_ERR "\txattr_cnt %u\n", ui->xattr_cnt); 259 pr_err("\txattr_cnt %u\n", ui->xattr_cnt);
260 printk(KERN_ERR "\txattr_names %u\n", ui->xattr_names); 260 pr_err("\txattr_names %u\n", ui->xattr_names);
261 printk(KERN_ERR "\tdirty %u\n", ui->dirty); 261 pr_err("\tdirty %u\n", ui->dirty);
262 printk(KERN_ERR "\txattr %u\n", ui->xattr); 262 pr_err("\txattr %u\n", ui->xattr);
263 printk(KERN_ERR "\tbulk_read %u\n", ui->xattr); 263 pr_err("\tbulk_read %u\n", ui->xattr);
264 printk(KERN_ERR "\tsynced_i_size %llu\n", 264 pr_err("\tsynced_i_size %llu\n",
265 (unsigned long long)ui->synced_i_size); 265 (unsigned long long)ui->synced_i_size);
266 printk(KERN_ERR "\tui_size %llu\n", 266 pr_err("\tui_size %llu\n",
267 (unsigned long long)ui->ui_size); 267 (unsigned long long)ui->ui_size);
268 printk(KERN_ERR "\tflags %d\n", ui->flags); 268 pr_err("\tflags %d\n", ui->flags);
269 printk(KERN_ERR "\tcompr_type %d\n", ui->compr_type); 269 pr_err("\tcompr_type %d\n", ui->compr_type);
270 printk(KERN_ERR "\tlast_page_read %lu\n", ui->last_page_read); 270 pr_err("\tlast_page_read %lu\n", ui->last_page_read);
271 printk(KERN_ERR "\tread_in_a_row %lu\n", ui->read_in_a_row); 271 pr_err("\tread_in_a_row %lu\n", ui->read_in_a_row);
272 printk(KERN_ERR "\tdata_len %d\n", ui->data_len); 272 pr_err("\tdata_len %d\n", ui->data_len);
273 273
274 if (!S_ISDIR(inode->i_mode)) 274 if (!S_ISDIR(inode->i_mode))
275 return; 275 return;
276 276
277 printk(KERN_ERR "List of directory entries:\n"); 277 pr_err("List of directory entries:\n");
278 ubifs_assert(!mutex_is_locked(&c->tnc_mutex)); 278 ubifs_assert(!mutex_is_locked(&c->tnc_mutex));
279 279
280 lowest_dent_key(c, &key, inode->i_ino); 280 lowest_dent_key(c, &key, inode->i_ino);
@@ -282,11 +282,11 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
282 dent = ubifs_tnc_next_ent(c, &key, &nm); 282 dent = ubifs_tnc_next_ent(c, &key, &nm);
283 if (IS_ERR(dent)) { 283 if (IS_ERR(dent)) {
284 if (PTR_ERR(dent) != -ENOENT) 284 if (PTR_ERR(dent) != -ENOENT)
285 printk(KERN_ERR "error %ld\n", PTR_ERR(dent)); 285 pr_err("error %ld\n", PTR_ERR(dent));
286 break; 286 break;
287 } 287 }
288 288
289 printk(KERN_ERR "\t%d: %s (%s)\n", 289 pr_err("\t%d: %s (%s)\n",
290 count++, dent->name, get_dent_type(dent->type)); 290 count++, dent->name, get_dent_type(dent->type));
291 291
292 nm.name = dent->name; 292 nm.name = dent->name;
@@ -305,12 +305,9 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node)
305 const struct ubifs_ch *ch = node; 305 const struct ubifs_ch *ch = node;
306 char key_buf[DBG_KEY_BUF_LEN]; 306 char key_buf[DBG_KEY_BUF_LEN];
307 307
308 if (dbg_is_tst_rcvry(c))
309 return;
310
311 /* If the magic is incorrect, just hexdump the first bytes */ 308 /* If the magic is incorrect, just hexdump the first bytes */
312 if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) { 309 if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) {
313 printk(KERN_ERR "Not a node, first %zu bytes:", UBIFS_CH_SZ); 310 pr_err("Not a node, first %zu bytes:", UBIFS_CH_SZ);
314 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 32, 1, 311 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 32, 1,
315 (void *)node, UBIFS_CH_SZ, 1); 312 (void *)node, UBIFS_CH_SZ, 1);
316 return; 313 return;
@@ -324,8 +321,7 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node)
324 { 321 {
325 const struct ubifs_pad_node *pad = node; 322 const struct ubifs_pad_node *pad = node;
326 323
327 printk(KERN_ERR "\tpad_len %u\n", 324 pr_err("\tpad_len %u\n", le32_to_cpu(pad->pad_len));
328 le32_to_cpu(pad->pad_len));
329 break; 325 break;
330 } 326 }
331 case UBIFS_SB_NODE: 327 case UBIFS_SB_NODE:
@@ -333,112 +329,77 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node)
333 const struct ubifs_sb_node *sup = node; 329 const struct ubifs_sb_node *sup = node;
334 unsigned int sup_flags = le32_to_cpu(sup->flags); 330 unsigned int sup_flags = le32_to_cpu(sup->flags);
335 331
336 printk(KERN_ERR "\tkey_hash %d (%s)\n", 332 pr_err("\tkey_hash %d (%s)\n",
337 (int)sup->key_hash, get_key_hash(sup->key_hash)); 333 (int)sup->key_hash, get_key_hash(sup->key_hash));
338 printk(KERN_ERR "\tkey_fmt %d (%s)\n", 334 pr_err("\tkey_fmt %d (%s)\n",
339 (int)sup->key_fmt, get_key_fmt(sup->key_fmt)); 335 (int)sup->key_fmt, get_key_fmt(sup->key_fmt));
340 printk(KERN_ERR "\tflags %#x\n", sup_flags); 336 pr_err("\tflags %#x\n", sup_flags);
341 printk(KERN_ERR "\t big_lpt %u\n", 337 pr_err("\t big_lpt %u\n",
342 !!(sup_flags & UBIFS_FLG_BIGLPT)); 338 !!(sup_flags & UBIFS_FLG_BIGLPT));
343 printk(KERN_ERR "\t space_fixup %u\n", 339 pr_err("\t space_fixup %u\n",
344 !!(sup_flags & UBIFS_FLG_SPACE_FIXUP)); 340 !!(sup_flags & UBIFS_FLG_SPACE_FIXUP));
345 printk(KERN_ERR "\tmin_io_size %u\n", 341 pr_err("\tmin_io_size %u\n", le32_to_cpu(sup->min_io_size));
346 le32_to_cpu(sup->min_io_size)); 342 pr_err("\tleb_size %u\n", le32_to_cpu(sup->leb_size));
347 printk(KERN_ERR "\tleb_size %u\n", 343 pr_err("\tleb_cnt %u\n", le32_to_cpu(sup->leb_cnt));
348 le32_to_cpu(sup->leb_size)); 344 pr_err("\tmax_leb_cnt %u\n", le32_to_cpu(sup->max_leb_cnt));
349 printk(KERN_ERR "\tleb_cnt %u\n", 345 pr_err("\tmax_bud_bytes %llu\n",
350 le32_to_cpu(sup->leb_cnt));
351 printk(KERN_ERR "\tmax_leb_cnt %u\n",
352 le32_to_cpu(sup->max_leb_cnt));
353 printk(KERN_ERR "\tmax_bud_bytes %llu\n",
354 (unsigned long long)le64_to_cpu(sup->max_bud_bytes)); 346 (unsigned long long)le64_to_cpu(sup->max_bud_bytes));
355 printk(KERN_ERR "\tlog_lebs %u\n", 347 pr_err("\tlog_lebs %u\n", le32_to_cpu(sup->log_lebs));
356 le32_to_cpu(sup->log_lebs)); 348 pr_err("\tlpt_lebs %u\n", le32_to_cpu(sup->lpt_lebs));
357 printk(KERN_ERR "\tlpt_lebs %u\n", 349 pr_err("\torph_lebs %u\n", le32_to_cpu(sup->orph_lebs));
358 le32_to_cpu(sup->lpt_lebs)); 350 pr_err("\tjhead_cnt %u\n", le32_to_cpu(sup->jhead_cnt));
359 printk(KERN_ERR "\torph_lebs %u\n", 351 pr_err("\tfanout %u\n", le32_to_cpu(sup->fanout));
360 le32_to_cpu(sup->orph_lebs)); 352 pr_err("\tlsave_cnt %u\n", le32_to_cpu(sup->lsave_cnt));
361 printk(KERN_ERR "\tjhead_cnt %u\n", 353 pr_err("\tdefault_compr %u\n",
362 le32_to_cpu(sup->jhead_cnt));
363 printk(KERN_ERR "\tfanout %u\n",
364 le32_to_cpu(sup->fanout));
365 printk(KERN_ERR "\tlsave_cnt %u\n",
366 le32_to_cpu(sup->lsave_cnt));
367 printk(KERN_ERR "\tdefault_compr %u\n",
368 (int)le16_to_cpu(sup->default_compr)); 354 (int)le16_to_cpu(sup->default_compr));
369 printk(KERN_ERR "\trp_size %llu\n", 355 pr_err("\trp_size %llu\n",
370 (unsigned long long)le64_to_cpu(sup->rp_size)); 356 (unsigned long long)le64_to_cpu(sup->rp_size));
371 printk(KERN_ERR "\trp_uid %u\n", 357 pr_err("\trp_uid %u\n", le32_to_cpu(sup->rp_uid));
372 le32_to_cpu(sup->rp_uid)); 358 pr_err("\trp_gid %u\n", le32_to_cpu(sup->rp_gid));
373 printk(KERN_ERR "\trp_gid %u\n", 359 pr_err("\tfmt_version %u\n", le32_to_cpu(sup->fmt_version));
374 le32_to_cpu(sup->rp_gid)); 360 pr_err("\ttime_gran %u\n", le32_to_cpu(sup->time_gran));
375 printk(KERN_ERR "\tfmt_version %u\n", 361 pr_err("\tUUID %pUB\n", sup->uuid);
376 le32_to_cpu(sup->fmt_version));
377 printk(KERN_ERR "\ttime_gran %u\n",
378 le32_to_cpu(sup->time_gran));
379 printk(KERN_ERR "\tUUID %pUB\n",
380 sup->uuid);
381 break; 362 break;
382 } 363 }
383 case UBIFS_MST_NODE: 364 case UBIFS_MST_NODE:
384 { 365 {
385 const struct ubifs_mst_node *mst = node; 366 const struct ubifs_mst_node *mst = node;
386 367
387 printk(KERN_ERR "\thighest_inum %llu\n", 368 pr_err("\thighest_inum %llu\n",
388 (unsigned long long)le64_to_cpu(mst->highest_inum)); 369 (unsigned long long)le64_to_cpu(mst->highest_inum));
389 printk(KERN_ERR "\tcommit number %llu\n", 370 pr_err("\tcommit number %llu\n",
390 (unsigned long long)le64_to_cpu(mst->cmt_no)); 371 (unsigned long long)le64_to_cpu(mst->cmt_no));
391 printk(KERN_ERR "\tflags %#x\n", 372 pr_err("\tflags %#x\n", le32_to_cpu(mst->flags));
392 le32_to_cpu(mst->flags)); 373 pr_err("\tlog_lnum %u\n", le32_to_cpu(mst->log_lnum));
393 printk(KERN_ERR "\tlog_lnum %u\n", 374 pr_err("\troot_lnum %u\n", le32_to_cpu(mst->root_lnum));
394 le32_to_cpu(mst->log_lnum)); 375 pr_err("\troot_offs %u\n", le32_to_cpu(mst->root_offs));
395 printk(KERN_ERR "\troot_lnum %u\n", 376 pr_err("\troot_len %u\n", le32_to_cpu(mst->root_len));
396 le32_to_cpu(mst->root_lnum)); 377 pr_err("\tgc_lnum %u\n", le32_to_cpu(mst->gc_lnum));
397 printk(KERN_ERR "\troot_offs %u\n", 378 pr_err("\tihead_lnum %u\n", le32_to_cpu(mst->ihead_lnum));
398 le32_to_cpu(mst->root_offs)); 379 pr_err("\tihead_offs %u\n", le32_to_cpu(mst->ihead_offs));
399 printk(KERN_ERR "\troot_len %u\n", 380 pr_err("\tindex_size %llu\n",
400 le32_to_cpu(mst->root_len));
401 printk(KERN_ERR "\tgc_lnum %u\n",
402 le32_to_cpu(mst->gc_lnum));
403 printk(KERN_ERR "\tihead_lnum %u\n",
404 le32_to_cpu(mst->ihead_lnum));
405 printk(KERN_ERR "\tihead_offs %u\n",
406 le32_to_cpu(mst->ihead_offs));
407 printk(KERN_ERR "\tindex_size %llu\n",
408 (unsigned long long)le64_to_cpu(mst->index_size)); 381 (unsigned long long)le64_to_cpu(mst->index_size));
409 printk(KERN_ERR "\tlpt_lnum %u\n", 382 pr_err("\tlpt_lnum %u\n", le32_to_cpu(mst->lpt_lnum));
410 le32_to_cpu(mst->lpt_lnum)); 383 pr_err("\tlpt_offs %u\n", le32_to_cpu(mst->lpt_offs));
411 printk(KERN_ERR "\tlpt_offs %u\n", 384 pr_err("\tnhead_lnum %u\n", le32_to_cpu(mst->nhead_lnum));
412 le32_to_cpu(mst->lpt_offs)); 385 pr_err("\tnhead_offs %u\n", le32_to_cpu(mst->nhead_offs));
413 printk(KERN_ERR "\tnhead_lnum %u\n", 386 pr_err("\tltab_lnum %u\n", le32_to_cpu(mst->ltab_lnum));
414 le32_to_cpu(mst->nhead_lnum)); 387 pr_err("\tltab_offs %u\n", le32_to_cpu(mst->ltab_offs));
415 printk(KERN_ERR "\tnhead_offs %u\n", 388 pr_err("\tlsave_lnum %u\n", le32_to_cpu(mst->lsave_lnum));
416 le32_to_cpu(mst->nhead_offs)); 389 pr_err("\tlsave_offs %u\n", le32_to_cpu(mst->lsave_offs));
417 printk(KERN_ERR "\tltab_lnum %u\n", 390 pr_err("\tlscan_lnum %u\n", le32_to_cpu(mst->lscan_lnum));
418 le32_to_cpu(mst->ltab_lnum)); 391 pr_err("\tleb_cnt %u\n", le32_to_cpu(mst->leb_cnt));
419 printk(KERN_ERR "\tltab_offs %u\n", 392 pr_err("\tempty_lebs %u\n", le32_to_cpu(mst->empty_lebs));
420 le32_to_cpu(mst->ltab_offs)); 393 pr_err("\tidx_lebs %u\n", le32_to_cpu(mst->idx_lebs));
421 printk(KERN_ERR "\tlsave_lnum %u\n", 394 pr_err("\ttotal_free %llu\n",
422 le32_to_cpu(mst->lsave_lnum));
423 printk(KERN_ERR "\tlsave_offs %u\n",
424 le32_to_cpu(mst->lsave_offs));
425 printk(KERN_ERR "\tlscan_lnum %u\n",
426 le32_to_cpu(mst->lscan_lnum));
427 printk(KERN_ERR "\tleb_cnt %u\n",
428 le32_to_cpu(mst->leb_cnt));
429 printk(KERN_ERR "\tempty_lebs %u\n",
430 le32_to_cpu(mst->empty_lebs));
431 printk(KERN_ERR "\tidx_lebs %u\n",
432 le32_to_cpu(mst->idx_lebs));
433 printk(KERN_ERR "\ttotal_free %llu\n",
434 (unsigned long long)le64_to_cpu(mst->total_free)); 395 (unsigned long long)le64_to_cpu(mst->total_free));
435 printk(KERN_ERR "\ttotal_dirty %llu\n", 396 pr_err("\ttotal_dirty %llu\n",
436 (unsigned long long)le64_to_cpu(mst->total_dirty)); 397 (unsigned long long)le64_to_cpu(mst->total_dirty));
437 printk(KERN_ERR "\ttotal_used %llu\n", 398 pr_err("\ttotal_used %llu\n",
438 (unsigned long long)le64_to_cpu(mst->total_used)); 399 (unsigned long long)le64_to_cpu(mst->total_used));
439 printk(KERN_ERR "\ttotal_dead %llu\n", 400 pr_err("\ttotal_dead %llu\n",
440 (unsigned long long)le64_to_cpu(mst->total_dead)); 401 (unsigned long long)le64_to_cpu(mst->total_dead));
441 printk(KERN_ERR "\ttotal_dark %llu\n", 402 pr_err("\ttotal_dark %llu\n",
442 (unsigned long long)le64_to_cpu(mst->total_dark)); 403 (unsigned long long)le64_to_cpu(mst->total_dark));
443 break; 404 break;
444 } 405 }
@@ -446,12 +407,9 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node)
446 { 407 {
447 const struct ubifs_ref_node *ref = node; 408 const struct ubifs_ref_node *ref = node;
448 409
449 printk(KERN_ERR "\tlnum %u\n", 410 pr_err("\tlnum %u\n", le32_to_cpu(ref->lnum));
450 le32_to_cpu(ref->lnum)); 411 pr_err("\toffs %u\n", le32_to_cpu(ref->offs));
451 printk(KERN_ERR "\toffs %u\n", 412 pr_err("\tjhead %u\n", le32_to_cpu(ref->jhead));
452 le32_to_cpu(ref->offs));
453 printk(KERN_ERR "\tjhead %u\n",
454 le32_to_cpu(ref->jhead));
455 break; 413 break;
456 } 414 }
457 case UBIFS_INO_NODE: 415 case UBIFS_INO_NODE:
@@ -459,41 +417,32 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node)
459 const struct ubifs_ino_node *ino = node; 417 const struct ubifs_ino_node *ino = node;
460 418
461 key_read(c, &ino->key, &key); 419 key_read(c, &ino->key, &key);
462 printk(KERN_ERR "\tkey %s\n", 420 pr_err("\tkey %s\n",
463 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); 421 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
464 printk(KERN_ERR "\tcreat_sqnum %llu\n", 422 pr_err("\tcreat_sqnum %llu\n",
465 (unsigned long long)le64_to_cpu(ino->creat_sqnum)); 423 (unsigned long long)le64_to_cpu(ino->creat_sqnum));
466 printk(KERN_ERR "\tsize %llu\n", 424 pr_err("\tsize %llu\n",
467 (unsigned long long)le64_to_cpu(ino->size)); 425 (unsigned long long)le64_to_cpu(ino->size));
468 printk(KERN_ERR "\tnlink %u\n", 426 pr_err("\tnlink %u\n", le32_to_cpu(ino->nlink));
469 le32_to_cpu(ino->nlink)); 427 pr_err("\tatime %lld.%u\n",
470 printk(KERN_ERR "\tatime %lld.%u\n",
471 (long long)le64_to_cpu(ino->atime_sec), 428 (long long)le64_to_cpu(ino->atime_sec),
472 le32_to_cpu(ino->atime_nsec)); 429 le32_to_cpu(ino->atime_nsec));
473 printk(KERN_ERR "\tmtime %lld.%u\n", 430 pr_err("\tmtime %lld.%u\n",
474 (long long)le64_to_cpu(ino->mtime_sec), 431 (long long)le64_to_cpu(ino->mtime_sec),
475 le32_to_cpu(ino->mtime_nsec)); 432 le32_to_cpu(ino->mtime_nsec));
476 printk(KERN_ERR "\tctime %lld.%u\n", 433 pr_err("\tctime %lld.%u\n",
477 (long long)le64_to_cpu(ino->ctime_sec), 434 (long long)le64_to_cpu(ino->ctime_sec),
478 le32_to_cpu(ino->ctime_nsec)); 435 le32_to_cpu(ino->ctime_nsec));
479 printk(KERN_ERR "\tuid %u\n", 436 pr_err("\tuid %u\n", le32_to_cpu(ino->uid));
480 le32_to_cpu(ino->uid)); 437 pr_err("\tgid %u\n", le32_to_cpu(ino->gid));
481 printk(KERN_ERR "\tgid %u\n", 438 pr_err("\tmode %u\n", le32_to_cpu(ino->mode));
482 le32_to_cpu(ino->gid)); 439 pr_err("\tflags %#x\n", le32_to_cpu(ino->flags));
483 printk(KERN_ERR "\tmode %u\n", 440 pr_err("\txattr_cnt %u\n", le32_to_cpu(ino->xattr_cnt));
484 le32_to_cpu(ino->mode)); 441 pr_err("\txattr_size %u\n", le32_to_cpu(ino->xattr_size));
485 printk(KERN_ERR "\tflags %#x\n", 442 pr_err("\txattr_names %u\n", le32_to_cpu(ino->xattr_names));
486 le32_to_cpu(ino->flags)); 443 pr_err("\tcompr_type %#x\n",
487 printk(KERN_ERR "\txattr_cnt %u\n",
488 le32_to_cpu(ino->xattr_cnt));
489 printk(KERN_ERR "\txattr_size %u\n",
490 le32_to_cpu(ino->xattr_size));
491 printk(KERN_ERR "\txattr_names %u\n",
492 le32_to_cpu(ino->xattr_names));
493 printk(KERN_ERR "\tcompr_type %#x\n",
494 (int)le16_to_cpu(ino->compr_type)); 444 (int)le16_to_cpu(ino->compr_type));
495 printk(KERN_ERR "\tdata len %u\n", 445 pr_err("\tdata len %u\n", le32_to_cpu(ino->data_len));
496 le32_to_cpu(ino->data_len));
497 break; 446 break;
498 } 447 }
499 case UBIFS_DENT_NODE: 448 case UBIFS_DENT_NODE:
@@ -503,22 +452,21 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node)
503 int nlen = le16_to_cpu(dent->nlen); 452 int nlen = le16_to_cpu(dent->nlen);
504 453
505 key_read(c, &dent->key, &key); 454 key_read(c, &dent->key, &key);
506 printk(KERN_ERR "\tkey %s\n", 455 pr_err("\tkey %s\n",
507 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); 456 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
508 printk(KERN_ERR "\tinum %llu\n", 457 pr_err("\tinum %llu\n",
509 (unsigned long long)le64_to_cpu(dent->inum)); 458 (unsigned long long)le64_to_cpu(dent->inum));
510 printk(KERN_ERR "\ttype %d\n", (int)dent->type); 459 pr_err("\ttype %d\n", (int)dent->type);
511 printk(KERN_ERR "\tnlen %d\n", nlen); 460 pr_err("\tnlen %d\n", nlen);
512 printk(KERN_ERR "\tname "); 461 pr_err("\tname ");
513 462
514 if (nlen > UBIFS_MAX_NLEN) 463 if (nlen > UBIFS_MAX_NLEN)
515 printk(KERN_ERR "(bad name length, not printing, " 464 pr_err("(bad name length, not printing, bad or corrupted node)");
516 "bad or corrupted node)");
517 else { 465 else {
518 for (i = 0; i < nlen && dent->name[i]; i++) 466 for (i = 0; i < nlen && dent->name[i]; i++)
519 printk(KERN_CONT "%c", dent->name[i]); 467 pr_cont("%c", dent->name[i]);
520 } 468 }
521 printk(KERN_CONT "\n"); 469 pr_cont("\n");
522 470
523 break; 471 break;
524 } 472 }
@@ -528,15 +476,13 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node)
528 int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ; 476 int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ;
529 477
530 key_read(c, &dn->key, &key); 478 key_read(c, &dn->key, &key);
531 printk(KERN_ERR "\tkey %s\n", 479 pr_err("\tkey %s\n",
532 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); 480 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
533 printk(KERN_ERR "\tsize %u\n", 481 pr_err("\tsize %u\n", le32_to_cpu(dn->size));
534 le32_to_cpu(dn->size)); 482 pr_err("\tcompr_typ %d\n",
535 printk(KERN_ERR "\tcompr_typ %d\n",
536 (int)le16_to_cpu(dn->compr_type)); 483 (int)le16_to_cpu(dn->compr_type));
537 printk(KERN_ERR "\tdata size %d\n", 484 pr_err("\tdata size %d\n", dlen);
538 dlen); 485 pr_err("\tdata:\n");
539 printk(KERN_ERR "\tdata:\n");
540 print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1, 486 print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1,
541 (void *)&dn->data, dlen, 0); 487 (void *)&dn->data, dlen, 0);
542 break; 488 break;
@@ -545,11 +491,10 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node)
545 { 491 {
546 const struct ubifs_trun_node *trun = node; 492 const struct ubifs_trun_node *trun = node;
547 493
548 printk(KERN_ERR "\tinum %u\n", 494 pr_err("\tinum %u\n", le32_to_cpu(trun->inum));
549 le32_to_cpu(trun->inum)); 495 pr_err("\told_size %llu\n",
550 printk(KERN_ERR "\told_size %llu\n",
551 (unsigned long long)le64_to_cpu(trun->old_size)); 496 (unsigned long long)le64_to_cpu(trun->old_size));
552 printk(KERN_ERR "\tnew_size %llu\n", 497 pr_err("\tnew_size %llu\n",
553 (unsigned long long)le64_to_cpu(trun->new_size)); 498 (unsigned long long)le64_to_cpu(trun->new_size));
554 break; 499 break;
555 } 500 }
@@ -558,17 +503,16 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node)
558 const struct ubifs_idx_node *idx = node; 503 const struct ubifs_idx_node *idx = node;
559 504
560 n = le16_to_cpu(idx->child_cnt); 505 n = le16_to_cpu(idx->child_cnt);
561 printk(KERN_ERR "\tchild_cnt %d\n", n); 506 pr_err("\tchild_cnt %d\n", n);
562 printk(KERN_ERR "\tlevel %d\n", 507 pr_err("\tlevel %d\n", (int)le16_to_cpu(idx->level));
563 (int)le16_to_cpu(idx->level)); 508 pr_err("\tBranches:\n");
564 printk(KERN_ERR "\tBranches:\n");
565 509
566 for (i = 0; i < n && i < c->fanout - 1; i++) { 510 for (i = 0; i < n && i < c->fanout - 1; i++) {
567 const struct ubifs_branch *br; 511 const struct ubifs_branch *br;
568 512
569 br = ubifs_idx_branch(c, idx, i); 513 br = ubifs_idx_branch(c, idx, i);
570 key_read(c, &br->key, &key); 514 key_read(c, &br->key, &key);
571 printk(KERN_ERR "\t%d: LEB %d:%d len %d key %s\n", 515 pr_err("\t%d: LEB %d:%d len %d key %s\n",
572 i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs), 516 i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs),
573 le32_to_cpu(br->len), 517 le32_to_cpu(br->len),
574 dbg_snprintf_key(c, &key, key_buf, 518 dbg_snprintf_key(c, &key, key_buf,
@@ -582,20 +526,20 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node)
582 { 526 {
583 const struct ubifs_orph_node *orph = node; 527 const struct ubifs_orph_node *orph = node;
584 528
585 printk(KERN_ERR "\tcommit number %llu\n", 529 pr_err("\tcommit number %llu\n",
586 (unsigned long long) 530 (unsigned long long)
587 le64_to_cpu(orph->cmt_no) & LLONG_MAX); 531 le64_to_cpu(orph->cmt_no) & LLONG_MAX);
588 printk(KERN_ERR "\tlast node flag %llu\n", 532 pr_err("\tlast node flag %llu\n",
589 (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63); 533 (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63);
590 n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3; 534 n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3;
591 printk(KERN_ERR "\t%d orphan inode numbers:\n", n); 535 pr_err("\t%d orphan inode numbers:\n", n);
592 for (i = 0; i < n; i++) 536 for (i = 0; i < n; i++)
593 printk(KERN_ERR "\t ino %llu\n", 537 pr_err("\t ino %llu\n",
594 (unsigned long long)le64_to_cpu(orph->inos[i])); 538 (unsigned long long)le64_to_cpu(orph->inos[i]));
595 break; 539 break;
596 } 540 }
597 default: 541 default:
598 printk(KERN_ERR "node type %d was not recognized\n", 542 pr_err("node type %d was not recognized\n",
599 (int)ch->node_type); 543 (int)ch->node_type);
600 } 544 }
601 spin_unlock(&dbg_lock); 545 spin_unlock(&dbg_lock);
@@ -604,16 +548,16 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node)
604void ubifs_dump_budget_req(const struct ubifs_budget_req *req) 548void ubifs_dump_budget_req(const struct ubifs_budget_req *req)
605{ 549{
606 spin_lock(&dbg_lock); 550 spin_lock(&dbg_lock);
607 printk(KERN_ERR "Budgeting request: new_ino %d, dirtied_ino %d\n", 551 pr_err("Budgeting request: new_ino %d, dirtied_ino %d\n",
608 req->new_ino, req->dirtied_ino); 552 req->new_ino, req->dirtied_ino);
609 printk(KERN_ERR "\tnew_ino_d %d, dirtied_ino_d %d\n", 553 pr_err("\tnew_ino_d %d, dirtied_ino_d %d\n",
610 req->new_ino_d, req->dirtied_ino_d); 554 req->new_ino_d, req->dirtied_ino_d);
611 printk(KERN_ERR "\tnew_page %d, dirtied_page %d\n", 555 pr_err("\tnew_page %d, dirtied_page %d\n",
612 req->new_page, req->dirtied_page); 556 req->new_page, req->dirtied_page);
613 printk(KERN_ERR "\tnew_dent %d, mod_dent %d\n", 557 pr_err("\tnew_dent %d, mod_dent %d\n",
614 req->new_dent, req->mod_dent); 558 req->new_dent, req->mod_dent);
615 printk(KERN_ERR "\tidx_growth %d\n", req->idx_growth); 559 pr_err("\tidx_growth %d\n", req->idx_growth);
616 printk(KERN_ERR "\tdata_growth %d dd_growth %d\n", 560 pr_err("\tdata_growth %d dd_growth %d\n",
617 req->data_growth, req->dd_growth); 561 req->data_growth, req->dd_growth);
618 spin_unlock(&dbg_lock); 562 spin_unlock(&dbg_lock);
619} 563}
@@ -621,14 +565,12 @@ void ubifs_dump_budget_req(const struct ubifs_budget_req *req)
621void ubifs_dump_lstats(const struct ubifs_lp_stats *lst) 565void ubifs_dump_lstats(const struct ubifs_lp_stats *lst)
622{ 566{
623 spin_lock(&dbg_lock); 567 spin_lock(&dbg_lock);
624 printk(KERN_ERR "(pid %d) Lprops statistics: empty_lebs %d, " 568 pr_err("(pid %d) Lprops statistics: empty_lebs %d, idx_lebs %d\n",
625 "idx_lebs %d\n", current->pid, lst->empty_lebs, lst->idx_lebs); 569 current->pid, lst->empty_lebs, lst->idx_lebs);
626 printk(KERN_ERR "\ttaken_empty_lebs %d, total_free %lld, " 570 pr_err("\ttaken_empty_lebs %d, total_free %lld, total_dirty %lld\n",
627 "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free, 571 lst->taken_empty_lebs, lst->total_free, lst->total_dirty);
628 lst->total_dirty); 572 pr_err("\ttotal_used %lld, total_dark %lld, total_dead %lld\n",
629 printk(KERN_ERR "\ttotal_used %lld, total_dark %lld, " 573 lst->total_used, lst->total_dark, lst->total_dead);
630 "total_dead %lld\n", lst->total_used, lst->total_dark,
631 lst->total_dead);
632 spin_unlock(&dbg_lock); 574 spin_unlock(&dbg_lock);
633} 575}
634 576
@@ -642,21 +584,17 @@ void ubifs_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)
642 584
643 spin_lock(&c->space_lock); 585 spin_lock(&c->space_lock);
644 spin_lock(&dbg_lock); 586 spin_lock(&dbg_lock);
645 printk(KERN_ERR "(pid %d) Budgeting info: data budget sum %lld, " 587 pr_err("(pid %d) Budgeting info: data budget sum %lld, total budget sum %lld\n",
646 "total budget sum %lld\n", current->pid, 588 current->pid, bi->data_growth + bi->dd_growth,
647 bi->data_growth + bi->dd_growth,
648 bi->data_growth + bi->dd_growth + bi->idx_growth); 589 bi->data_growth + bi->dd_growth + bi->idx_growth);
649 printk(KERN_ERR "\tbudg_data_growth %lld, budg_dd_growth %lld, " 590 pr_err("\tbudg_data_growth %lld, budg_dd_growth %lld, budg_idx_growth %lld\n",
650 "budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth, 591 bi->data_growth, bi->dd_growth, bi->idx_growth);
651 bi->idx_growth); 592 pr_err("\tmin_idx_lebs %d, old_idx_sz %llu, uncommitted_idx %lld\n",
652 printk(KERN_ERR "\tmin_idx_lebs %d, old_idx_sz %llu, " 593 bi->min_idx_lebs, bi->old_idx_sz, bi->uncommitted_idx);
653 "uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz, 594 pr_err("\tpage_budget %d, inode_budget %d, dent_budget %d\n",
654 bi->uncommitted_idx);
655 printk(KERN_ERR "\tpage_budget %d, inode_budget %d, dent_budget %d\n",
656 bi->page_budget, bi->inode_budget, bi->dent_budget); 595 bi->page_budget, bi->inode_budget, bi->dent_budget);
657 printk(KERN_ERR "\tnospace %u, nospace_rp %u\n", 596 pr_err("\tnospace %u, nospace_rp %u\n", bi->nospace, bi->nospace_rp);
658 bi->nospace, bi->nospace_rp); 597 pr_err("\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
659 printk(KERN_ERR "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
660 c->dark_wm, c->dead_wm, c->max_idx_node_sz); 598 c->dark_wm, c->dead_wm, c->max_idx_node_sz);
661 599
662 if (bi != &c->bi) 600 if (bi != &c->bi)
@@ -667,38 +605,37 @@ void ubifs_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)
667 */ 605 */
668 goto out_unlock; 606 goto out_unlock;
669 607
670 printk(KERN_ERR "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n", 608 pr_err("\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n",
671 c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt); 609 c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt);
672 printk(KERN_ERR "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, " 610 pr_err("\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, clean_zn_cnt %ld\n",
673 "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt), 611 atomic_long_read(&c->dirty_pg_cnt),
674 atomic_long_read(&c->dirty_zn_cnt), 612 atomic_long_read(&c->dirty_zn_cnt),
675 atomic_long_read(&c->clean_zn_cnt)); 613 atomic_long_read(&c->clean_zn_cnt));
676 printk(KERN_ERR "\tgc_lnum %d, ihead_lnum %d\n", 614 pr_err("\tgc_lnum %d, ihead_lnum %d\n", c->gc_lnum, c->ihead_lnum);
677 c->gc_lnum, c->ihead_lnum);
678 615
679 /* If we are in R/O mode, journal heads do not exist */ 616 /* If we are in R/O mode, journal heads do not exist */
680 if (c->jheads) 617 if (c->jheads)
681 for (i = 0; i < c->jhead_cnt; i++) 618 for (i = 0; i < c->jhead_cnt; i++)
682 printk(KERN_ERR "\tjhead %s\t LEB %d\n", 619 pr_err("\tjhead %s\t LEB %d\n",
683 dbg_jhead(c->jheads[i].wbuf.jhead), 620 dbg_jhead(c->jheads[i].wbuf.jhead),
684 c->jheads[i].wbuf.lnum); 621 c->jheads[i].wbuf.lnum);
685 for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) { 622 for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
686 bud = rb_entry(rb, struct ubifs_bud, rb); 623 bud = rb_entry(rb, struct ubifs_bud, rb);
687 printk(KERN_ERR "\tbud LEB %d\n", bud->lnum); 624 pr_err("\tbud LEB %d\n", bud->lnum);
688 } 625 }
689 list_for_each_entry(bud, &c->old_buds, list) 626 list_for_each_entry(bud, &c->old_buds, list)
690 printk(KERN_ERR "\told bud LEB %d\n", bud->lnum); 627 pr_err("\told bud LEB %d\n", bud->lnum);
691 list_for_each_entry(idx_gc, &c->idx_gc, list) 628 list_for_each_entry(idx_gc, &c->idx_gc, list)
692 printk(KERN_ERR "\tGC'ed idx LEB %d unmap %d\n", 629 pr_err("\tGC'ed idx LEB %d unmap %d\n",
693 idx_gc->lnum, idx_gc->unmap); 630 idx_gc->lnum, idx_gc->unmap);
694 printk(KERN_ERR "\tcommit state %d\n", c->cmt_state); 631 pr_err("\tcommit state %d\n", c->cmt_state);
695 632
696 /* Print budgeting predictions */ 633 /* Print budgeting predictions */
697 available = ubifs_calc_available(c, c->bi.min_idx_lebs); 634 available = ubifs_calc_available(c, c->bi.min_idx_lebs);
698 outstanding = c->bi.data_growth + c->bi.dd_growth; 635 outstanding = c->bi.data_growth + c->bi.dd_growth;
699 free = ubifs_get_free_space_nolock(c); 636 free = ubifs_get_free_space_nolock(c);
700 printk(KERN_ERR "Budgeting predictions:\n"); 637 pr_err("Budgeting predictions:\n");
701 printk(KERN_ERR "\tavailable: %lld, outstanding %lld, free %lld\n", 638 pr_err("\tavailable: %lld, outstanding %lld, free %lld\n",
702 available, outstanding, free); 639 available, outstanding, free);
703out_unlock: 640out_unlock:
704 spin_unlock(&dbg_lock); 641 spin_unlock(&dbg_lock);
@@ -718,21 +655,19 @@ void ubifs_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
718 dark = ubifs_calc_dark(c, spc); 655 dark = ubifs_calc_dark(c, spc);
719 656
720 if (lp->flags & LPROPS_INDEX) 657 if (lp->flags & LPROPS_INDEX)
721 printk(KERN_ERR "LEB %-7d free %-8d dirty %-8d used %-8d " 658 pr_err("LEB %-7d free %-8d dirty %-8d used %-8d free + dirty %-8d flags %#x (",
722 "free + dirty %-8d flags %#x (", lp->lnum, lp->free, 659 lp->lnum, lp->free, lp->dirty, c->leb_size - spc, spc,
723 lp->dirty, c->leb_size - spc, spc, lp->flags); 660 lp->flags);
724 else 661 else
725 printk(KERN_ERR "LEB %-7d free %-8d dirty %-8d used %-8d " 662 pr_err("LEB %-7d free %-8d dirty %-8d used %-8d free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d flags %#-4x (",
726 "free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d " 663 lp->lnum, lp->free, lp->dirty, c->leb_size - spc, spc,
727 "flags %#-4x (", lp->lnum, lp->free, lp->dirty, 664 dark, dead, (int)(spc / UBIFS_MAX_NODE_SZ), lp->flags);
728 c->leb_size - spc, spc, dark, dead,
729 (int)(spc / UBIFS_MAX_NODE_SZ), lp->flags);
730 665
731 if (lp->flags & LPROPS_TAKEN) { 666 if (lp->flags & LPROPS_TAKEN) {
732 if (lp->flags & LPROPS_INDEX) 667 if (lp->flags & LPROPS_INDEX)
733 printk(KERN_CONT "index, taken"); 668 pr_cont("index, taken");
734 else 669 else
735 printk(KERN_CONT "taken"); 670 pr_cont("taken");
736 } else { 671 } else {
737 const char *s; 672 const char *s;
738 673
@@ -769,7 +704,7 @@ void ubifs_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
769 break; 704 break;
770 } 705 }
771 } 706 }
772 printk(KERN_CONT "%s", s); 707 pr_cont("%s", s);
773 } 708 }
774 709
775 for (rb = rb_first((struct rb_root *)&c->buds); rb; rb = rb_next(rb)) { 710 for (rb = rb_first((struct rb_root *)&c->buds); rb; rb = rb_next(rb)) {
@@ -784,19 +719,18 @@ void ubifs_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
784 */ 719 */
785 if (c->jheads && 720 if (c->jheads &&
786 lp->lnum == c->jheads[i].wbuf.lnum) { 721 lp->lnum == c->jheads[i].wbuf.lnum) {
787 printk(KERN_CONT ", jhead %s", 722 pr_cont(", jhead %s", dbg_jhead(i));
788 dbg_jhead(i));
789 head = 1; 723 head = 1;
790 } 724 }
791 } 725 }
792 if (!head) 726 if (!head)
793 printk(KERN_CONT ", bud of jhead %s", 727 pr_cont(", bud of jhead %s",
794 dbg_jhead(bud->jhead)); 728 dbg_jhead(bud->jhead));
795 } 729 }
796 } 730 }
797 if (lp->lnum == c->gc_lnum) 731 if (lp->lnum == c->gc_lnum)
798 printk(KERN_CONT ", GC LEB"); 732 pr_cont(", GC LEB");
799 printk(KERN_CONT ")\n"); 733 pr_cont(")\n");
800} 734}
801 735
802void ubifs_dump_lprops(struct ubifs_info *c) 736void ubifs_dump_lprops(struct ubifs_info *c)
@@ -805,8 +739,7 @@ void ubifs_dump_lprops(struct ubifs_info *c)
805 struct ubifs_lprops lp; 739 struct ubifs_lprops lp;
806 struct ubifs_lp_stats lst; 740 struct ubifs_lp_stats lst;
807 741
808 printk(KERN_ERR "(pid %d) start dumping LEB properties\n", 742 pr_err("(pid %d) start dumping LEB properties\n", current->pid);
809 current->pid);
810 ubifs_get_lp_stats(c, &lst); 743 ubifs_get_lp_stats(c, &lst);
811 ubifs_dump_lstats(&lst); 744 ubifs_dump_lstats(&lst);
812 745
@@ -817,8 +750,7 @@ void ubifs_dump_lprops(struct ubifs_info *c)
817 750
818 ubifs_dump_lprop(c, &lp); 751 ubifs_dump_lprop(c, &lp);
819 } 752 }
820 printk(KERN_ERR "(pid %d) finish dumping LEB properties\n", 753 pr_err("(pid %d) finish dumping LEB properties\n", current->pid);
821 current->pid);
822} 754}
823 755
824void ubifs_dump_lpt_info(struct ubifs_info *c) 756void ubifs_dump_lpt_info(struct ubifs_info *c)
@@ -826,37 +758,36 @@ void ubifs_dump_lpt_info(struct ubifs_info *c)
826 int i; 758 int i;
827 759
828 spin_lock(&dbg_lock); 760 spin_lock(&dbg_lock);
829 printk(KERN_ERR "(pid %d) dumping LPT information\n", current->pid); 761 pr_err("(pid %d) dumping LPT information\n", current->pid);
830 printk(KERN_ERR "\tlpt_sz: %lld\n", c->lpt_sz); 762 pr_err("\tlpt_sz: %lld\n", c->lpt_sz);
831 printk(KERN_ERR "\tpnode_sz: %d\n", c->pnode_sz); 763 pr_err("\tpnode_sz: %d\n", c->pnode_sz);
832 printk(KERN_ERR "\tnnode_sz: %d\n", c->nnode_sz); 764 pr_err("\tnnode_sz: %d\n", c->nnode_sz);
833 printk(KERN_ERR "\tltab_sz: %d\n", c->ltab_sz); 765 pr_err("\tltab_sz: %d\n", c->ltab_sz);
834 printk(KERN_ERR "\tlsave_sz: %d\n", c->lsave_sz); 766 pr_err("\tlsave_sz: %d\n", c->lsave_sz);
835 printk(KERN_ERR "\tbig_lpt: %d\n", c->big_lpt); 767 pr_err("\tbig_lpt: %d\n", c->big_lpt);
836 printk(KERN_ERR "\tlpt_hght: %d\n", c->lpt_hght); 768 pr_err("\tlpt_hght: %d\n", c->lpt_hght);
837 printk(KERN_ERR "\tpnode_cnt: %d\n", c->pnode_cnt); 769 pr_err("\tpnode_cnt: %d\n", c->pnode_cnt);
838 printk(KERN_ERR "\tnnode_cnt: %d\n", c->nnode_cnt); 770 pr_err("\tnnode_cnt: %d\n", c->nnode_cnt);
839 printk(KERN_ERR "\tdirty_pn_cnt: %d\n", c->dirty_pn_cnt); 771 pr_err("\tdirty_pn_cnt: %d\n", c->dirty_pn_cnt);
840 printk(KERN_ERR "\tdirty_nn_cnt: %d\n", c->dirty_nn_cnt); 772 pr_err("\tdirty_nn_cnt: %d\n", c->dirty_nn_cnt);
841 printk(KERN_ERR "\tlsave_cnt: %d\n", c->lsave_cnt); 773 pr_err("\tlsave_cnt: %d\n", c->lsave_cnt);
842 printk(KERN_ERR "\tspace_bits: %d\n", c->space_bits); 774 pr_err("\tspace_bits: %d\n", c->space_bits);
843 printk(KERN_ERR "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits); 775 pr_err("\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits);
844 printk(KERN_ERR "\tlpt_offs_bits: %d\n", c->lpt_offs_bits); 776 pr_err("\tlpt_offs_bits: %d\n", c->lpt_offs_bits);
845 printk(KERN_ERR "\tlpt_spc_bits: %d\n", c->lpt_spc_bits); 777 pr_err("\tlpt_spc_bits: %d\n", c->lpt_spc_bits);
846 printk(KERN_ERR "\tpcnt_bits: %d\n", c->pcnt_bits); 778 pr_err("\tpcnt_bits: %d\n", c->pcnt_bits);
847 printk(KERN_ERR "\tlnum_bits: %d\n", c->lnum_bits); 779 pr_err("\tlnum_bits: %d\n", c->lnum_bits);
848 printk(KERN_ERR "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); 780 pr_err("\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs);
849 printk(KERN_ERR "\tLPT head is at %d:%d\n", 781 pr_err("\tLPT head is at %d:%d\n",
850 c->nhead_lnum, c->nhead_offs); 782 c->nhead_lnum, c->nhead_offs);
851 printk(KERN_ERR "\tLPT ltab is at %d:%d\n", 783 pr_err("\tLPT ltab is at %d:%d\n", c->ltab_lnum, c->ltab_offs);
852 c->ltab_lnum, c->ltab_offs);
853 if (c->big_lpt) 784 if (c->big_lpt)
854 printk(KERN_ERR "\tLPT lsave is at %d:%d\n", 785 pr_err("\tLPT lsave is at %d:%d\n",
855 c->lsave_lnum, c->lsave_offs); 786 c->lsave_lnum, c->lsave_offs);
856 for (i = 0; i < c->lpt_lebs; i++) 787 for (i = 0; i < c->lpt_lebs; i++)
857 printk(KERN_ERR "\tLPT LEB %d free %d dirty %d tgc %d " 788 pr_err("\tLPT LEB %d free %d dirty %d tgc %d cmt %d\n",
858 "cmt %d\n", i + c->lpt_first, c->ltab[i].free, 789 i + c->lpt_first, c->ltab[i].free, c->ltab[i].dirty,
859 c->ltab[i].dirty, c->ltab[i].tgc, c->ltab[i].cmt); 790 c->ltab[i].tgc, c->ltab[i].cmt);
860 spin_unlock(&dbg_lock); 791 spin_unlock(&dbg_lock);
861} 792}
862 793
@@ -865,13 +796,13 @@ void ubifs_dump_sleb(const struct ubifs_info *c,
865{ 796{
866 struct ubifs_scan_node *snod; 797 struct ubifs_scan_node *snod;
867 798
868 printk(KERN_ERR "(pid %d) start dumping scanned data from LEB %d:%d\n", 799 pr_err("(pid %d) start dumping scanned data from LEB %d:%d\n",
869 current->pid, sleb->lnum, offs); 800 current->pid, sleb->lnum, offs);
870 801
871 list_for_each_entry(snod, &sleb->nodes, list) { 802 list_for_each_entry(snod, &sleb->nodes, list) {
872 cond_resched(); 803 cond_resched();
873 printk(KERN_ERR "Dumping node at LEB %d:%d len %d\n", sleb->lnum, 804 pr_err("Dumping node at LEB %d:%d len %d\n",
874 snod->offs, snod->len); 805 sleb->lnum, snod->offs, snod->len);
875 ubifs_dump_node(c, snod->node); 806 ubifs_dump_node(c, snod->node);
876 } 807 }
877} 808}
@@ -882,11 +813,7 @@ void ubifs_dump_leb(const struct ubifs_info *c, int lnum)
882 struct ubifs_scan_node *snod; 813 struct ubifs_scan_node *snod;
883 void *buf; 814 void *buf;
884 815
885 if (dbg_is_tst_rcvry(c)) 816 pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum);
886 return;
887
888 printk(KERN_ERR "(pid %d) start dumping LEB %d\n",
889 current->pid, lnum);
890 817
891 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); 818 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
892 if (!buf) { 819 if (!buf) {
@@ -900,18 +827,17 @@ void ubifs_dump_leb(const struct ubifs_info *c, int lnum)
900 goto out; 827 goto out;
901 } 828 }
902 829
903 printk(KERN_ERR "LEB %d has %d nodes ending at %d\n", lnum, 830 pr_err("LEB %d has %d nodes ending at %d\n", lnum,
904 sleb->nodes_cnt, sleb->endpt); 831 sleb->nodes_cnt, sleb->endpt);
905 832
906 list_for_each_entry(snod, &sleb->nodes, list) { 833 list_for_each_entry(snod, &sleb->nodes, list) {
907 cond_resched(); 834 cond_resched();
908 printk(KERN_ERR "Dumping node at LEB %d:%d len %d\n", lnum, 835 pr_err("Dumping node at LEB %d:%d len %d\n", lnum,
909 snod->offs, snod->len); 836 snod->offs, snod->len);
910 ubifs_dump_node(c, snod->node); 837 ubifs_dump_node(c, snod->node);
911 } 838 }
912 839
913 printk(KERN_ERR "(pid %d) finish dumping LEB %d\n", 840 pr_err("(pid %d) finish dumping LEB %d\n", current->pid, lnum);
914 current->pid, lnum);
915 ubifs_scan_destroy(sleb); 841 ubifs_scan_destroy(sleb);
916 842
917out: 843out:
@@ -932,33 +858,28 @@ void ubifs_dump_znode(const struct ubifs_info *c,
932 else 858 else
933 zbr = &c->zroot; 859 zbr = &c->zroot;
934 860
935 printk(KERN_ERR "znode %p, LEB %d:%d len %d parent %p iip %d level %d" 861 pr_err("znode %p, LEB %d:%d len %d parent %p iip %d level %d child_cnt %d flags %lx\n",
936 " child_cnt %d flags %lx\n", znode, zbr->lnum, zbr->offs, 862 znode, zbr->lnum, zbr->offs, zbr->len, znode->parent, znode->iip,
937 zbr->len, znode->parent, znode->iip, znode->level, 863 znode->level, znode->child_cnt, znode->flags);
938 znode->child_cnt, znode->flags);
939 864
940 if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) { 865 if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) {
941 spin_unlock(&dbg_lock); 866 spin_unlock(&dbg_lock);
942 return; 867 return;
943 } 868 }
944 869
945 printk(KERN_ERR "zbranches:\n"); 870 pr_err("zbranches:\n");
946 for (n = 0; n < znode->child_cnt; n++) { 871 for (n = 0; n < znode->child_cnt; n++) {
947 zbr = &znode->zbranch[n]; 872 zbr = &znode->zbranch[n];
948 if (znode->level > 0) 873 if (znode->level > 0)
949 printk(KERN_ERR "\t%d: znode %p LEB %d:%d len %d key " 874 pr_err("\t%d: znode %p LEB %d:%d len %d key %s\n",
950 "%s\n", n, zbr->znode, zbr->lnum, 875 n, zbr->znode, zbr->lnum, zbr->offs, zbr->len,
951 zbr->offs, zbr->len, 876 dbg_snprintf_key(c, &zbr->key, key_buf,
952 dbg_snprintf_key(c, &zbr->key, 877 DBG_KEY_BUF_LEN));
953 key_buf,
954 DBG_KEY_BUF_LEN));
955 else 878 else
956 printk(KERN_ERR "\t%d: LNC %p LEB %d:%d len %d key " 879 pr_err("\t%d: LNC %p LEB %d:%d len %d key %s\n",
957 "%s\n", n, zbr->znode, zbr->lnum, 880 n, zbr->znode, zbr->lnum, zbr->offs, zbr->len,
958 zbr->offs, zbr->len, 881 dbg_snprintf_key(c, &zbr->key, key_buf,
959 dbg_snprintf_key(c, &zbr->key, 882 DBG_KEY_BUF_LEN));
960 key_buf,
961 DBG_KEY_BUF_LEN));
962 } 883 }
963 spin_unlock(&dbg_lock); 884 spin_unlock(&dbg_lock);
964} 885}
@@ -967,16 +888,16 @@ void ubifs_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
967{ 888{
968 int i; 889 int i;
969 890
970 printk(KERN_ERR "(pid %d) start dumping heap cat %d (%d elements)\n", 891 pr_err("(pid %d) start dumping heap cat %d (%d elements)\n",
971 current->pid, cat, heap->cnt); 892 current->pid, cat, heap->cnt);
972 for (i = 0; i < heap->cnt; i++) { 893 for (i = 0; i < heap->cnt; i++) {
973 struct ubifs_lprops *lprops = heap->arr[i]; 894 struct ubifs_lprops *lprops = heap->arr[i];
974 895
975 printk(KERN_ERR "\t%d. LEB %d hpos %d free %d dirty %d " 896 pr_err("\t%d. LEB %d hpos %d free %d dirty %d flags %d\n",
976 "flags %d\n", i, lprops->lnum, lprops->hpos, 897 i, lprops->lnum, lprops->hpos, lprops->free,
977 lprops->free, lprops->dirty, lprops->flags); 898 lprops->dirty, lprops->flags);
978 } 899 }
979 printk(KERN_ERR "(pid %d) finish dumping heap\n", current->pid); 900 pr_err("(pid %d) finish dumping heap\n", current->pid);
980} 901}
981 902
982void ubifs_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, 903void ubifs_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
@@ -984,15 +905,15 @@ void ubifs_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
984{ 905{
985 int i; 906 int i;
986 907
987 printk(KERN_ERR "(pid %d) dumping pnode:\n", current->pid); 908 pr_err("(pid %d) dumping pnode:\n", current->pid);
988 printk(KERN_ERR "\taddress %zx parent %zx cnext %zx\n", 909 pr_err("\taddress %zx parent %zx cnext %zx\n",
989 (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); 910 (size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
990 printk(KERN_ERR "\tflags %lu iip %d level %d num %d\n", 911 pr_err("\tflags %lu iip %d level %d num %d\n",
991 pnode->flags, iip, pnode->level, pnode->num); 912 pnode->flags, iip, pnode->level, pnode->num);
992 for (i = 0; i < UBIFS_LPT_FANOUT; i++) { 913 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
993 struct ubifs_lprops *lp = &pnode->lprops[i]; 914 struct ubifs_lprops *lp = &pnode->lprops[i];
994 915
995 printk(KERN_ERR "\t%d: free %d dirty %d flags %d lnum %d\n", 916 pr_err("\t%d: free %d dirty %d flags %d lnum %d\n",
996 i, lp->free, lp->dirty, lp->flags, lp->lnum); 917 i, lp->free, lp->dirty, lp->flags, lp->lnum);
997 } 918 }
998} 919}
@@ -1002,20 +923,20 @@ void ubifs_dump_tnc(struct ubifs_info *c)
1002 struct ubifs_znode *znode; 923 struct ubifs_znode *znode;
1003 int level; 924 int level;
1004 925
1005 printk(KERN_ERR "\n"); 926 pr_err("\n");
1006 printk(KERN_ERR "(pid %d) start dumping TNC tree\n", current->pid); 927 pr_err("(pid %d) start dumping TNC tree\n", current->pid);
1007 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); 928 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
1008 level = znode->level; 929 level = znode->level;
1009 printk(KERN_ERR "== Level %d ==\n", level); 930 pr_err("== Level %d ==\n", level);
1010 while (znode) { 931 while (znode) {
1011 if (level != znode->level) { 932 if (level != znode->level) {
1012 level = znode->level; 933 level = znode->level;
1013 printk(KERN_ERR "== Level %d ==\n", level); 934 pr_err("== Level %d ==\n", level);
1014 } 935 }
1015 ubifs_dump_znode(c, znode); 936 ubifs_dump_znode(c, znode);
1016 znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); 937 znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
1017 } 938 }
1018 printk(KERN_ERR "(pid %d) finish dumping TNC tree\n", current->pid); 939 pr_err("(pid %d) finish dumping TNC tree\n", current->pid);
1019} 940}
1020 941
1021static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode, 942static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode,
@@ -1154,8 +1075,8 @@ int dbg_check_synced_i_size(const struct ubifs_info *c, struct inode *inode)
1154 mutex_lock(&ui->ui_mutex); 1075 mutex_lock(&ui->ui_mutex);
1155 spin_lock(&ui->ui_lock); 1076 spin_lock(&ui->ui_lock);
1156 if (ui->ui_size != ui->synced_i_size && !ui->dirty) { 1077 if (ui->ui_size != ui->synced_i_size && !ui->dirty) {
1157 ubifs_err("ui_size is %lld, synced_i_size is %lld, but inode " 1078 ubifs_err("ui_size is %lld, synced_i_size is %lld, but inode is clean",
1158 "is clean", ui->ui_size, ui->synced_i_size); 1079 ui->ui_size, ui->synced_i_size);
1159 ubifs_err("i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino, 1080 ubifs_err("i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino,
1160 inode->i_mode, i_size_read(inode)); 1081 inode->i_mode, i_size_read(inode));
1161 dump_stack(); 1082 dump_stack();
@@ -1217,17 +1138,16 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir)
1217 kfree(pdent); 1138 kfree(pdent);
1218 1139
1219 if (i_size_read(dir) != size) { 1140 if (i_size_read(dir) != size) {
1220 ubifs_err("directory inode %lu has size %llu, " 1141 ubifs_err("directory inode %lu has size %llu, but calculated size is %llu",
1221 "but calculated size is %llu", dir->i_ino, 1142 dir->i_ino, (unsigned long long)i_size_read(dir),
1222 (unsigned long long)i_size_read(dir),
1223 (unsigned long long)size); 1143 (unsigned long long)size);
1224 ubifs_dump_inode(c, dir); 1144 ubifs_dump_inode(c, dir);
1225 dump_stack(); 1145 dump_stack();
1226 return -EINVAL; 1146 return -EINVAL;
1227 } 1147 }
1228 if (dir->i_nlink != nlink) { 1148 if (dir->i_nlink != nlink) {
1229 ubifs_err("directory inode %lu has nlink %u, but calculated " 1149 ubifs_err("directory inode %lu has nlink %u, but calculated nlink is %u",
1230 "nlink is %u", dir->i_ino, dir->i_nlink, nlink); 1150 dir->i_ino, dir->i_nlink, nlink);
1231 ubifs_dump_inode(c, dir); 1151 ubifs_dump_inode(c, dir);
1232 dump_stack(); 1152 dump_stack();
1233 return -EINVAL; 1153 return -EINVAL;
@@ -1686,8 +1606,8 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
1686 if (znode_cb) { 1606 if (znode_cb) {
1687 err = znode_cb(c, znode, priv); 1607 err = znode_cb(c, znode, priv);
1688 if (err) { 1608 if (err) {
1689 ubifs_err("znode checking function returned " 1609 ubifs_err("znode checking function returned error %d",
1690 "error %d", err); 1610 err);
1691 ubifs_dump_znode(c, znode); 1611 ubifs_dump_znode(c, znode);
1692 goto out_dump; 1612 goto out_dump;
1693 } 1613 }
@@ -1697,9 +1617,7 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
1697 zbr = &znode->zbranch[idx]; 1617 zbr = &znode->zbranch[idx];
1698 err = leaf_cb(c, zbr, priv); 1618 err = leaf_cb(c, zbr, priv);
1699 if (err) { 1619 if (err) {
1700 ubifs_err("leaf checking function " 1620 ubifs_err("leaf checking function returned error %d, for leaf at LEB %d:%d",
1701 "returned error %d, for leaf "
1702 "at LEB %d:%d",
1703 err, zbr->lnum, zbr->offs); 1621 err, zbr->lnum, zbr->offs);
1704 goto out_dump; 1622 goto out_dump;
1705 } 1623 }
@@ -1807,8 +1725,8 @@ int dbg_check_idx_size(struct ubifs_info *c, long long idx_size)
1807 } 1725 }
1808 1726
1809 if (calc != idx_size) { 1727 if (calc != idx_size) {
1810 ubifs_err("index size check failed: calculated size is %lld, " 1728 ubifs_err("index size check failed: calculated size is %lld, should be %lld",
1811 "should be %lld", calc, idx_size); 1729 calc, idx_size);
1812 dump_stack(); 1730 dump_stack();
1813 return -EINVAL; 1731 return -EINVAL;
1814 } 1732 }
@@ -2120,8 +2038,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
2120 fscki = read_add_inode(c, priv, inum); 2038 fscki = read_add_inode(c, priv, inum);
2121 if (IS_ERR(fscki)) { 2039 if (IS_ERR(fscki)) {
2122 err = PTR_ERR(fscki); 2040 err = PTR_ERR(fscki);
2123 ubifs_err("error %d while processing data node and " 2041 ubifs_err("error %d while processing data node and trying to find inode node %lu",
2124 "trying to find inode node %lu",
2125 err, (unsigned long)inum); 2042 err, (unsigned long)inum);
2126 goto out_dump; 2043 goto out_dump;
2127 } 2044 }
@@ -2131,9 +2048,8 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
2131 blk_offs <<= UBIFS_BLOCK_SHIFT; 2048 blk_offs <<= UBIFS_BLOCK_SHIFT;
2132 blk_offs += le32_to_cpu(dn->size); 2049 blk_offs += le32_to_cpu(dn->size);
2133 if (blk_offs > fscki->size) { 2050 if (blk_offs > fscki->size) {
2134 ubifs_err("data node at LEB %d:%d is not within inode " 2051 ubifs_err("data node at LEB %d:%d is not within inode size %lld",
2135 "size %lld", zbr->lnum, zbr->offs, 2052 zbr->lnum, zbr->offs, fscki->size);
2136 fscki->size);
2137 err = -EINVAL; 2053 err = -EINVAL;
2138 goto out_dump; 2054 goto out_dump;
2139 } 2055 }
@@ -2154,8 +2070,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
2154 fscki = read_add_inode(c, priv, inum); 2070 fscki = read_add_inode(c, priv, inum);
2155 if (IS_ERR(fscki)) { 2071 if (IS_ERR(fscki)) {
2156 err = PTR_ERR(fscki); 2072 err = PTR_ERR(fscki);
2157 ubifs_err("error %d while processing entry node and " 2073 ubifs_err("error %d while processing entry node and trying to find inode node %lu",
2158 "trying to find inode node %lu",
2159 err, (unsigned long)inum); 2074 err, (unsigned long)inum);
2160 goto out_dump; 2075 goto out_dump;
2161 } 2076 }
@@ -2167,8 +2082,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
2167 fscki1 = read_add_inode(c, priv, inum); 2082 fscki1 = read_add_inode(c, priv, inum);
2168 if (IS_ERR(fscki1)) { 2083 if (IS_ERR(fscki1)) {
2169 err = PTR_ERR(fscki1); 2084 err = PTR_ERR(fscki1);
2170 ubifs_err("error %d while processing entry node and " 2085 ubifs_err("error %d while processing entry node and trying to find parent inode node %lu",
2171 "trying to find parent inode node %lu",
2172 err, (unsigned long)inum); 2086 err, (unsigned long)inum);
2173 goto out_dump; 2087 goto out_dump;
2174 } 2088 }
@@ -2258,61 +2172,52 @@ static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd)
2258 */ 2172 */
2259 if (fscki->inum != UBIFS_ROOT_INO && 2173 if (fscki->inum != UBIFS_ROOT_INO &&
2260 fscki->references != 1) { 2174 fscki->references != 1) {
2261 ubifs_err("directory inode %lu has %d " 2175 ubifs_err("directory inode %lu has %d direntries which refer it, but should be 1",
2262 "direntries which refer it, but "
2263 "should be 1",
2264 (unsigned long)fscki->inum, 2176 (unsigned long)fscki->inum,
2265 fscki->references); 2177 fscki->references);
2266 goto out_dump; 2178 goto out_dump;
2267 } 2179 }
2268 if (fscki->inum == UBIFS_ROOT_INO && 2180 if (fscki->inum == UBIFS_ROOT_INO &&
2269 fscki->references != 0) { 2181 fscki->references != 0) {
2270 ubifs_err("root inode %lu has non-zero (%d) " 2182 ubifs_err("root inode %lu has non-zero (%d) direntries which refer it",
2271 "direntries which refer it",
2272 (unsigned long)fscki->inum, 2183 (unsigned long)fscki->inum,
2273 fscki->references); 2184 fscki->references);
2274 goto out_dump; 2185 goto out_dump;
2275 } 2186 }
2276 if (fscki->calc_sz != fscki->size) { 2187 if (fscki->calc_sz != fscki->size) {
2277 ubifs_err("directory inode %lu size is %lld, " 2188 ubifs_err("directory inode %lu size is %lld, but calculated size is %lld",
2278 "but calculated size is %lld",
2279 (unsigned long)fscki->inum, 2189 (unsigned long)fscki->inum,
2280 fscki->size, fscki->calc_sz); 2190 fscki->size, fscki->calc_sz);
2281 goto out_dump; 2191 goto out_dump;
2282 } 2192 }
2283 if (fscki->calc_cnt != fscki->nlink) { 2193 if (fscki->calc_cnt != fscki->nlink) {
2284 ubifs_err("directory inode %lu nlink is %d, " 2194 ubifs_err("directory inode %lu nlink is %d, but calculated nlink is %d",
2285 "but calculated nlink is %d",
2286 (unsigned long)fscki->inum, 2195 (unsigned long)fscki->inum,
2287 fscki->nlink, fscki->calc_cnt); 2196 fscki->nlink, fscki->calc_cnt);
2288 goto out_dump; 2197 goto out_dump;
2289 } 2198 }
2290 } else { 2199 } else {
2291 if (fscki->references != fscki->nlink) { 2200 if (fscki->references != fscki->nlink) {
2292 ubifs_err("inode %lu nlink is %d, but " 2201 ubifs_err("inode %lu nlink is %d, but calculated nlink is %d",
2293 "calculated nlink is %d",
2294 (unsigned long)fscki->inum, 2202 (unsigned long)fscki->inum,
2295 fscki->nlink, fscki->references); 2203 fscki->nlink, fscki->references);
2296 goto out_dump; 2204 goto out_dump;
2297 } 2205 }
2298 } 2206 }
2299 if (fscki->xattr_sz != fscki->calc_xsz) { 2207 if (fscki->xattr_sz != fscki->calc_xsz) {
2300 ubifs_err("inode %lu has xattr size %u, but " 2208 ubifs_err("inode %lu has xattr size %u, but calculated size is %lld",
2301 "calculated size is %lld",
2302 (unsigned long)fscki->inum, fscki->xattr_sz, 2209 (unsigned long)fscki->inum, fscki->xattr_sz,
2303 fscki->calc_xsz); 2210 fscki->calc_xsz);
2304 goto out_dump; 2211 goto out_dump;
2305 } 2212 }
2306 if (fscki->xattr_cnt != fscki->calc_xcnt) { 2213 if (fscki->xattr_cnt != fscki->calc_xcnt) {
2307 ubifs_err("inode %lu has %u xattrs, but " 2214 ubifs_err("inode %lu has %u xattrs, but calculated count is %lld",
2308 "calculated count is %lld",
2309 (unsigned long)fscki->inum, 2215 (unsigned long)fscki->inum,
2310 fscki->xattr_cnt, fscki->calc_xcnt); 2216 fscki->xattr_cnt, fscki->calc_xcnt);
2311 goto out_dump; 2217 goto out_dump;
2312 } 2218 }
2313 if (fscki->xattr_nms != fscki->calc_xnms) { 2219 if (fscki->xattr_nms != fscki->calc_xnms) {
2314 ubifs_err("inode %lu has xattr names' size %u, but " 2220 ubifs_err("inode %lu has xattr names' size %u, but calculated names' size is %lld",
2315 "calculated names' size is %lld",
2316 (unsigned long)fscki->inum, fscki->xattr_nms, 2221 (unsigned long)fscki->inum, fscki->xattr_nms,
2317 fscki->calc_xnms); 2222 fscki->calc_xnms);
2318 goto out_dump; 2223 goto out_dump;
@@ -2652,20 +2557,18 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write)
2652 return 1; 2557 return 1;
2653} 2558}
2654 2559
2655static void cut_data(const void *buf, unsigned int len) 2560static int corrupt_data(const struct ubifs_info *c, const void *buf,
2561 unsigned int len)
2656{ 2562{
2657 unsigned int from, to, i, ffs = chance(1, 2); 2563 unsigned int from, to, i, ffs = chance(1, 2);
2658 unsigned char *p = (void *)buf; 2564 unsigned char *p = (void *)buf;
2659 2565
2660 from = random32() % (len + 1); 2566 from = random32() % (len + 1);
2661 if (chance(1, 2)) 2567 /* Corruption may only span one max. write unit */
2662 to = random32() % (len - from + 1); 2568 to = min(len, ALIGN(from, c->max_write_size));
2663 else
2664 to = len;
2665 2569
2666 if (from < to) 2570 ubifs_warn("filled bytes %u-%u with %s", from, to - 1,
2667 ubifs_warn("filled bytes %u-%u with %s", from, to - 1, 2571 ffs ? "0xFFs" : "random data");
2668 ffs ? "0xFFs" : "random data");
2669 2572
2670 if (ffs) 2573 if (ffs)
2671 for (i = from; i < to; i++) 2574 for (i = from; i < to; i++)
@@ -2673,6 +2576,8 @@ static void cut_data(const void *buf, unsigned int len)
2673 else 2576 else
2674 for (i = from; i < to; i++) 2577 for (i = from; i < to; i++)
2675 p[i] = random32() % 0x100; 2578 p[i] = random32() % 0x100;
2579
2580 return to;
2676} 2581}
2677 2582
2678int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, 2583int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf,
@@ -2685,7 +2590,9 @@ int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf,
2685 2590
2686 failing = power_cut_emulated(c, lnum, 1); 2591 failing = power_cut_emulated(c, lnum, 1);
2687 if (failing) 2592 if (failing)
2688 cut_data(buf, len); 2593 len = corrupt_data(c, buf, len);
2594 ubifs_warn("actually write %d bytes to LEB %d:%d (the buffer was corrupted)",
2595 len, lnum, offs);
2689 err = ubi_leb_write(c->ubi, lnum, buf, offs, len); 2596 err = ubi_leb_write(c->ubi, lnum, buf, offs, len);
2690 if (err) 2597 if (err)
2691 return err; 2598 return err;
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 760de723dadb..e03d5179769a 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -150,7 +150,7 @@ struct ubifs_global_debug_info {
150 150
151#define ubifs_assert(expr) do { \ 151#define ubifs_assert(expr) do { \
152 if (unlikely(!(expr))) { \ 152 if (unlikely(!(expr))) { \
153 printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ 153 pr_crit("UBIFS assert failed in %s at %u (pid %d)\n", \
154 __func__, __LINE__, current->pid); \ 154 __func__, __LINE__, current->pid); \
155 dump_stack(); \ 155 dump_stack(); \
156 } \ 156 } \
@@ -159,26 +159,23 @@ struct ubifs_global_debug_info {
159#define ubifs_assert_cmt_locked(c) do { \ 159#define ubifs_assert_cmt_locked(c) do { \
160 if (unlikely(down_write_trylock(&(c)->commit_sem))) { \ 160 if (unlikely(down_write_trylock(&(c)->commit_sem))) { \
161 up_write(&(c)->commit_sem); \ 161 up_write(&(c)->commit_sem); \
162 printk(KERN_CRIT "commit lock is not locked!\n"); \ 162 pr_crit("commit lock is not locked!\n"); \
163 ubifs_assert(0); \ 163 ubifs_assert(0); \
164 } \ 164 } \
165} while (0) 165} while (0)
166 166
167#define ubifs_dbg_msg(type, fmt, ...) \ 167#define ubifs_dbg_msg(type, fmt, ...) \
168 pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__) 168 pr_debug("UBIFS DBG " type " (pid %d): " fmt "\n", current->pid, \
169 ##__VA_ARGS__)
169 170
170#define DBG_KEY_BUF_LEN 48 171#define DBG_KEY_BUF_LEN 48
171#define ubifs_dbg_msg_key(type, key, fmt, ...) do { \ 172#define ubifs_dbg_msg_key(type, key, fmt, ...) do { \
172 char __tmp_key_buf[DBG_KEY_BUF_LEN]; \ 173 char __tmp_key_buf[DBG_KEY_BUF_LEN]; \
173 pr_debug("UBIFS DBG " type ": " fmt "%s\n", ##__VA_ARGS__, \ 174 pr_debug("UBIFS DBG " type " (pid %d): " fmt "%s\n", current->pid, \
175 ##__VA_ARGS__, \
174 dbg_snprintf_key(c, key, __tmp_key_buf, DBG_KEY_BUF_LEN)); \ 176 dbg_snprintf_key(c, key, __tmp_key_buf, DBG_KEY_BUF_LEN)); \
175} while (0) 177} while (0)
176 178
177/* Just a debugging messages not related to any specific UBIFS subsystem */
178#define dbg_msg(fmt, ...) \
179 printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \
180 __func__, ##__VA_ARGS__)
181
182/* General messages */ 179/* General messages */
183#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__) 180#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__)
184/* Additional journal messages */ 181/* Additional journal messages */
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index c95681cf1b71..e271fba1651b 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -980,8 +980,8 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
980 * separately. 980 * separately.
981 */ 981 */
982 982
983 dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in " 983 dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in dir ino %lu",
984 "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name, 984 old_dentry->d_name.len, old_dentry->d_name.name,
985 old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len, 985 old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len,
986 new_dentry->d_name.name, new_dir->i_ino); 986 new_dentry->d_name.name, new_dir->i_ino);
987 ubifs_assert(mutex_is_locked(&old_dir->i_mutex)); 987 ubifs_assert(mutex_is_locked(&old_dir->i_mutex));
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 7bd6e72afd11..5bc77817f382 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1486,8 +1486,8 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,
1486 err = ubifs_budget_space(c, &req); 1486 err = ubifs_budget_space(c, &req);
1487 if (unlikely(err)) { 1487 if (unlikely(err)) {
1488 if (err == -ENOSPC) 1488 if (err == -ENOSPC)
1489 ubifs_warn("out of space for mmapped file " 1489 ubifs_warn("out of space for mmapped file (inode number %lu)",
1490 "(inode number %lu)", inode->i_ino); 1490 inode->i_ino);
1491 return VM_FAULT_SIGBUS; 1491 return VM_FAULT_SIGBUS;
1492 } 1492 }
1493 1493
@@ -1536,6 +1536,7 @@ out_unlock:
1536static const struct vm_operations_struct ubifs_file_vm_ops = { 1536static const struct vm_operations_struct ubifs_file_vm_ops = {
1537 .fault = filemap_fault, 1537 .fault = filemap_fault,
1538 .page_mkwrite = ubifs_vm_page_mkwrite, 1538 .page_mkwrite = ubifs_vm_page_mkwrite,
1539 .remap_pages = generic_file_remap_pages,
1539}; 1540};
1540 1541
1541static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) 1542static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 04dd6f47635e..76ca53cd3eee 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -714,9 +714,9 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
714 break; 714 break;
715 } 715 }
716 716
717 dbg_gc("found LEB %d: free %d, dirty %d, sum %d " 717 dbg_gc("found LEB %d: free %d, dirty %d, sum %d (min. space %d)",
718 "(min. space %d)", lp.lnum, lp.free, lp.dirty, 718 lp.lnum, lp.free, lp.dirty, lp.free + lp.dirty,
719 lp.free + lp.dirty, min_space); 719 min_space);
720 720
721 space_before = c->leb_size - wbuf->offs - wbuf->used; 721 space_before = c->leb_size - wbuf->offs - wbuf->used;
722 if (wbuf->lnum == -1) 722 if (wbuf->lnum == -1)
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 12c0f154ca83..afaad07f3b29 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -469,8 +469,8 @@ static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino,
469 ino->ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 469 ino->ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
470 ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec); 470 ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec);
471 ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 471 ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
472 ino->uid = cpu_to_le32(inode->i_uid); 472 ino->uid = cpu_to_le32(i_uid_read(inode));
473 ino->gid = cpu_to_le32(inode->i_gid); 473 ino->gid = cpu_to_le32(i_gid_read(inode));
474 ino->mode = cpu_to_le32(inode->i_mode); 474 ino->mode = cpu_to_le32(inode->i_mode);
475 ino->flags = cpu_to_le32(ui->flags); 475 ino->flags = cpu_to_le32(ui->flags);
476 ino->size = cpu_to_le64(ui->ui_size); 476 ino->size = cpu_to_le64(ui->ui_size);
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index c80b15d6c8de..36bd4efd0819 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -315,17 +315,15 @@ static void remove_buds(struct ubifs_info *c)
315 * heads (non-closed buds). 315 * heads (non-closed buds).
316 */ 316 */
317 c->cmt_bud_bytes += wbuf->offs - bud->start; 317 c->cmt_bud_bytes += wbuf->offs - bud->start;
318 dbg_log("preserve %d:%d, jhead %s, bud bytes %d, " 318 dbg_log("preserve %d:%d, jhead %s, bud bytes %d, cmt_bud_bytes %lld",
319 "cmt_bud_bytes %lld", bud->lnum, bud->start, 319 bud->lnum, bud->start, dbg_jhead(bud->jhead),
320 dbg_jhead(bud->jhead), wbuf->offs - bud->start, 320 wbuf->offs - bud->start, c->cmt_bud_bytes);
321 c->cmt_bud_bytes);
322 bud->start = wbuf->offs; 321 bud->start = wbuf->offs;
323 } else { 322 } else {
324 c->cmt_bud_bytes += c->leb_size - bud->start; 323 c->cmt_bud_bytes += c->leb_size - bud->start;
325 dbg_log("remove %d:%d, jhead %s, bud bytes %d, " 324 dbg_log("remove %d:%d, jhead %s, bud bytes %d, cmt_bud_bytes %lld",
326 "cmt_bud_bytes %lld", bud->lnum, bud->start, 325 bud->lnum, bud->start, dbg_jhead(bud->jhead),
327 dbg_jhead(bud->jhead), c->leb_size - bud->start, 326 c->leb_size - bud->start, c->cmt_bud_bytes);
328 c->cmt_bud_bytes);
329 rb_erase(p1, &c->buds); 327 rb_erase(p1, &c->buds);
330 /* 328 /*
331 * If the commit does not finish, the recovery will need 329 * If the commit does not finish, the recovery will need
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 86eb8e533249..e5a2a35a46dc 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -867,15 +867,15 @@ int dbg_check_cats(struct ubifs_info *c)
867 867
868 list_for_each_entry(lprops, &c->empty_list, list) { 868 list_for_each_entry(lprops, &c->empty_list, list) {
869 if (lprops->free != c->leb_size) { 869 if (lprops->free != c->leb_size) {
870 ubifs_err("non-empty LEB %d on empty list " 870 ubifs_err("non-empty LEB %d on empty list (free %d dirty %d flags %d)",
871 "(free %d dirty %d flags %d)", lprops->lnum, 871 lprops->lnum, lprops->free, lprops->dirty,
872 lprops->free, lprops->dirty, lprops->flags); 872 lprops->flags);
873 return -EINVAL; 873 return -EINVAL;
874 } 874 }
875 if (lprops->flags & LPROPS_TAKEN) { 875 if (lprops->flags & LPROPS_TAKEN) {
876 ubifs_err("taken LEB %d on empty list " 876 ubifs_err("taken LEB %d on empty list (free %d dirty %d flags %d)",
877 "(free %d dirty %d flags %d)", lprops->lnum, 877 lprops->lnum, lprops->free, lprops->dirty,
878 lprops->free, lprops->dirty, lprops->flags); 878 lprops->flags);
879 return -EINVAL; 879 return -EINVAL;
880 } 880 }
881 } 881 }
@@ -883,15 +883,15 @@ int dbg_check_cats(struct ubifs_info *c)
883 i = 0; 883 i = 0;
884 list_for_each_entry(lprops, &c->freeable_list, list) { 884 list_for_each_entry(lprops, &c->freeable_list, list) {
885 if (lprops->free + lprops->dirty != c->leb_size) { 885 if (lprops->free + lprops->dirty != c->leb_size) {
886 ubifs_err("non-freeable LEB %d on freeable list " 886 ubifs_err("non-freeable LEB %d on freeable list (free %d dirty %d flags %d)",
887 "(free %d dirty %d flags %d)", lprops->lnum, 887 lprops->lnum, lprops->free, lprops->dirty,
888 lprops->free, lprops->dirty, lprops->flags); 888 lprops->flags);
889 return -EINVAL; 889 return -EINVAL;
890 } 890 }
891 if (lprops->flags & LPROPS_TAKEN) { 891 if (lprops->flags & LPROPS_TAKEN) {
892 ubifs_err("taken LEB %d on freeable list " 892 ubifs_err("taken LEB %d on freeable list (free %d dirty %d flags %d)",
893 "(free %d dirty %d flags %d)", lprops->lnum, 893 lprops->lnum, lprops->free, lprops->dirty,
894 lprops->free, lprops->dirty, lprops->flags); 894 lprops->flags);
895 return -EINVAL; 895 return -EINVAL;
896 } 896 }
897 i += 1; 897 i += 1;
@@ -913,21 +913,21 @@ int dbg_check_cats(struct ubifs_info *c)
913 913
914 list_for_each_entry(lprops, &c->frdi_idx_list, list) { 914 list_for_each_entry(lprops, &c->frdi_idx_list, list) {
915 if (lprops->free + lprops->dirty != c->leb_size) { 915 if (lprops->free + lprops->dirty != c->leb_size) {
916 ubifs_err("non-freeable LEB %d on frdi_idx list " 916 ubifs_err("non-freeable LEB %d on frdi_idx list (free %d dirty %d flags %d)",
917 "(free %d dirty %d flags %d)", lprops->lnum, 917 lprops->lnum, lprops->free, lprops->dirty,
918 lprops->free, lprops->dirty, lprops->flags); 918 lprops->flags);
919 return -EINVAL; 919 return -EINVAL;
920 } 920 }
921 if (lprops->flags & LPROPS_TAKEN) { 921 if (lprops->flags & LPROPS_TAKEN) {
922 ubifs_err("taken LEB %d on frdi_idx list " 922 ubifs_err("taken LEB %d on frdi_idx list (free %d dirty %d flags %d)",
923 "(free %d dirty %d flags %d)", lprops->lnum, 923 lprops->lnum, lprops->free, lprops->dirty,
924 lprops->free, lprops->dirty, lprops->flags); 924 lprops->flags);
925 return -EINVAL; 925 return -EINVAL;
926 } 926 }
927 if (!(lprops->flags & LPROPS_INDEX)) { 927 if (!(lprops->flags & LPROPS_INDEX)) {
928 ubifs_err("non-index LEB %d on frdi_idx list " 928 ubifs_err("non-index LEB %d on frdi_idx list (free %d dirty %d flags %d)",
929 "(free %d dirty %d flags %d)", lprops->lnum, 929 lprops->lnum, lprops->free, lprops->dirty,
930 lprops->free, lprops->dirty, lprops->flags); 930 lprops->flags);
931 return -EINVAL; 931 return -EINVAL;
932 } 932 }
933 } 933 }
@@ -982,9 +982,9 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
982 goto out; 982 goto out;
983 } 983 }
984 if (lprops != lp) { 984 if (lprops != lp) {
985 dbg_msg("lprops %zx lp %zx lprops->lnum %d lp->lnum %d", 985 ubifs_err("lprops %zx lp %zx lprops->lnum %d lp->lnum %d",
986 (size_t)lprops, (size_t)lp, lprops->lnum, 986 (size_t)lprops, (size_t)lp, lprops->lnum,
987 lp->lnum); 987 lp->lnum);
988 err = 4; 988 err = 4;
989 goto out; 989 goto out;
990 } 990 }
@@ -1002,7 +1002,7 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
1002 } 1002 }
1003out: 1003out:
1004 if (err) { 1004 if (err) {
1005 dbg_msg("failed cat %d hpos %d err %d", cat, i, err); 1005 ubifs_err("failed cat %d hpos %d err %d", cat, i, err);
1006 dump_stack(); 1006 dump_stack();
1007 ubifs_dump_heap(c, heap, cat); 1007 ubifs_dump_heap(c, heap, cat);
1008 } 1008 }
@@ -1153,8 +1153,8 @@ static int scan_check_cb(struct ubifs_info *c,
1153 1153
1154 if (free > c->leb_size || free < 0 || dirty > c->leb_size || 1154 if (free > c->leb_size || free < 0 || dirty > c->leb_size ||
1155 dirty < 0) { 1155 dirty < 0) {
1156 ubifs_err("bad calculated accounting for LEB %d: " 1156 ubifs_err("bad calculated accounting for LEB %d: free %d, dirty %d",
1157 "free %d, dirty %d", lnum, free, dirty); 1157 lnum, free, dirty);
1158 goto out_destroy; 1158 goto out_destroy;
1159 } 1159 }
1160 1160
@@ -1200,8 +1200,7 @@ static int scan_check_cb(struct ubifs_info *c,
1200 /* Free but not unmapped LEB, it's fine */ 1200 /* Free but not unmapped LEB, it's fine */
1201 is_idx = 0; 1201 is_idx = 0;
1202 else { 1202 else {
1203 ubifs_err("indexing node without indexing " 1203 ubifs_err("indexing node without indexing flag");
1204 "flag");
1205 goto out_print; 1204 goto out_print;
1206 } 1205 }
1207 } 1206 }
@@ -1236,8 +1235,7 @@ static int scan_check_cb(struct ubifs_info *c,
1236 return LPT_SCAN_CONTINUE; 1235 return LPT_SCAN_CONTINUE;
1237 1236
1238out_print: 1237out_print:
1239 ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " 1238 ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, should be free %d, dirty %d",
1240 "should be free %d, dirty %d",
1241 lnum, lp->free, lp->dirty, lp->flags, free, dirty); 1239 lnum, lp->free, lp->dirty, lp->flags, free, dirty);
1242 ubifs_dump_leb(c, lnum); 1240 ubifs_dump_leb(c, lnum);
1243out_destroy: 1241out_destroy:
@@ -1290,12 +1288,10 @@ int dbg_check_lprops(struct ubifs_info *c)
1290 lst.total_dirty != c->lst.total_dirty || 1288 lst.total_dirty != c->lst.total_dirty ||
1291 lst.total_used != c->lst.total_used) { 1289 lst.total_used != c->lst.total_used) {
1292 ubifs_err("bad overall accounting"); 1290 ubifs_err("bad overall accounting");
1293 ubifs_err("calculated: empty_lebs %d, idx_lebs %d, " 1291 ubifs_err("calculated: empty_lebs %d, idx_lebs %d, total_free %lld, total_dirty %lld, total_used %lld",
1294 "total_free %lld, total_dirty %lld, total_used %lld",
1295 lst.empty_lebs, lst.idx_lebs, lst.total_free, 1292 lst.empty_lebs, lst.idx_lebs, lst.total_free,
1296 lst.total_dirty, lst.total_used); 1293 lst.total_dirty, lst.total_used);
1297 ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, " 1294 ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, total_free %lld, total_dirty %lld, total_used %lld",
1298 "total_free %lld, total_dirty %lld, total_used %lld",
1299 c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free, 1295 c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free,
1300 c->lst.total_dirty, c->lst.total_used); 1296 c->lst.total_dirty, c->lst.total_used);
1301 err = -EINVAL; 1297 err = -EINVAL;
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 8640920766ed..d46b19ec1815 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1311,7 +1311,7 @@ out:
1311 ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs); 1311 ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs);
1312 ubifs_dump_pnode(c, pnode, parent, iip); 1312 ubifs_dump_pnode(c, pnode, parent, iip);
1313 dump_stack(); 1313 dump_stack();
1314 dbg_msg("calc num: %d", calc_pnode_num_from_parent(c, parent, iip)); 1314 ubifs_err("calc num: %d", calc_pnode_num_from_parent(c, parent, iip));
1315 kfree(pnode); 1315 kfree(pnode);
1316 return err; 1316 return err;
1317} 1317}
@@ -2237,8 +2237,7 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
2237 /* cnode is a nnode */ 2237 /* cnode is a nnode */
2238 num = calc_nnode_num(row, col); 2238 num = calc_nnode_num(row, col);
2239 if (cnode->num != num) { 2239 if (cnode->num != num) {
2240 ubifs_err("nnode num %d expected %d " 2240 ubifs_err("nnode num %d expected %d parent num %d iip %d",
2241 "parent num %d iip %d",
2242 cnode->num, num, 2241 cnode->num, num,
2243 (nnode ? nnode->num : 0), cnode->iip); 2242 (nnode ? nnode->num : 0), cnode->iip);
2244 return -EINVAL; 2243 return -EINVAL;
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 4fa70734e6e7..9daaeef675dd 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -320,8 +320,8 @@ static int layout_cnodes(struct ubifs_info *c)
320 return 0; 320 return 0;
321 321
322no_space: 322no_space:
323 ubifs_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, " 323 ubifs_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, done_lsave %d",
324 "done_lsave %d", lnum, offs, len, done_ltab, done_lsave); 324 lnum, offs, len, done_ltab, done_lsave);
325 ubifs_dump_lpt_info(c); 325 ubifs_dump_lpt_info(c);
326 ubifs_dump_lpt_lebs(c); 326 ubifs_dump_lpt_lebs(c);
327 dump_stack(); 327 dump_stack();
@@ -545,8 +545,8 @@ static int write_cnodes(struct ubifs_info *c)
545 return 0; 545 return 0;
546 546
547no_space: 547no_space:
548 ubifs_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab " 548 ubifs_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab %d, done_lsave %d",
549 "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave); 549 lnum, offs, len, done_ltab, done_lsave);
550 ubifs_dump_lpt_info(c); 550 ubifs_dump_lpt_info(c);
551 ubifs_dump_lpt_lebs(c); 551 ubifs_dump_lpt_lebs(c);
552 dump_stack(); 552 dump_stack();
@@ -1662,21 +1662,19 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
1662 continue; 1662 continue;
1663 } 1663 }
1664 if (!dbg_is_all_ff(p, len)) { 1664 if (!dbg_is_all_ff(p, len)) {
1665 dbg_msg("invalid empty space in LEB %d at %d", 1665 ubifs_err("invalid empty space in LEB %d at %d",
1666 lnum, c->leb_size - len); 1666 lnum, c->leb_size - len);
1667 err = -EINVAL; 1667 err = -EINVAL;
1668 } 1668 }
1669 i = lnum - c->lpt_first; 1669 i = lnum - c->lpt_first;
1670 if (len != c->ltab[i].free) { 1670 if (len != c->ltab[i].free) {
1671 dbg_msg("invalid free space in LEB %d " 1671 ubifs_err("invalid free space in LEB %d (free %d, expected %d)",
1672 "(free %d, expected %d)", 1672 lnum, len, c->ltab[i].free);
1673 lnum, len, c->ltab[i].free);
1674 err = -EINVAL; 1673 err = -EINVAL;
1675 } 1674 }
1676 if (dirty != c->ltab[i].dirty) { 1675 if (dirty != c->ltab[i].dirty) {
1677 dbg_msg("invalid dirty space in LEB %d " 1676 ubifs_err("invalid dirty space in LEB %d (dirty %d, expected %d)",
1678 "(dirty %d, expected %d)", 1677 lnum, dirty, c->ltab[i].dirty);
1679 lnum, dirty, c->ltab[i].dirty);
1680 err = -EINVAL; 1678 err = -EINVAL;
1681 } 1679 }
1682 goto out; 1680 goto out;
@@ -1888,8 +1886,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1888 int err, len = c->leb_size, node_type, node_num, node_len, offs; 1886 int err, len = c->leb_size, node_type, node_num, node_len, offs;
1889 void *buf, *p; 1887 void *buf, *p;
1890 1888
1891 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", 1889 pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum);
1892 current->pid, lnum);
1893 buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); 1890 buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
1894 if (!buf) { 1891 if (!buf) {
1895 ubifs_err("cannot allocate memory to dump LPT"); 1892 ubifs_err("cannot allocate memory to dump LPT");
@@ -1907,14 +1904,14 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1907 1904
1908 pad_len = get_pad_len(c, p, len); 1905 pad_len = get_pad_len(c, p, len);
1909 if (pad_len) { 1906 if (pad_len) {
1910 printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n", 1907 pr_err("LEB %d:%d, pad %d bytes\n",
1911 lnum, offs, pad_len); 1908 lnum, offs, pad_len);
1912 p += pad_len; 1909 p += pad_len;
1913 len -= pad_len; 1910 len -= pad_len;
1914 continue; 1911 continue;
1915 } 1912 }
1916 if (len) 1913 if (len)
1917 printk(KERN_DEBUG "LEB %d:%d, free %d bytes\n", 1914 pr_err("LEB %d:%d, free %d bytes\n",
1918 lnum, offs, len); 1915 lnum, offs, len);
1919 break; 1916 break;
1920 } 1917 }
@@ -1925,11 +1922,10 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1925 { 1922 {
1926 node_len = c->pnode_sz; 1923 node_len = c->pnode_sz;
1927 if (c->big_lpt) 1924 if (c->big_lpt)
1928 printk(KERN_DEBUG "LEB %d:%d, pnode num %d\n", 1925 pr_err("LEB %d:%d, pnode num %d\n",
1929 lnum, offs, node_num); 1926 lnum, offs, node_num);
1930 else 1927 else
1931 printk(KERN_DEBUG "LEB %d:%d, pnode\n", 1928 pr_err("LEB %d:%d, pnode\n", lnum, offs);
1932 lnum, offs);
1933 break; 1929 break;
1934 } 1930 }
1935 case UBIFS_LPT_NNODE: 1931 case UBIFS_LPT_NNODE:
@@ -1939,29 +1935,28 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1939 1935
1940 node_len = c->nnode_sz; 1936 node_len = c->nnode_sz;
1941 if (c->big_lpt) 1937 if (c->big_lpt)
1942 printk(KERN_DEBUG "LEB %d:%d, nnode num %d, ", 1938 pr_err("LEB %d:%d, nnode num %d, ",
1943 lnum, offs, node_num); 1939 lnum, offs, node_num);
1944 else 1940 else
1945 printk(KERN_DEBUG "LEB %d:%d, nnode, ", 1941 pr_err("LEB %d:%d, nnode, ",
1946 lnum, offs); 1942 lnum, offs);
1947 err = ubifs_unpack_nnode(c, p, &nnode); 1943 err = ubifs_unpack_nnode(c, p, &nnode);
1948 for (i = 0; i < UBIFS_LPT_FANOUT; i++) { 1944 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1949 printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum, 1945 pr_cont("%d:%d", nnode.nbranch[i].lnum,
1950 nnode.nbranch[i].offs); 1946 nnode.nbranch[i].offs);
1951 if (i != UBIFS_LPT_FANOUT - 1) 1947 if (i != UBIFS_LPT_FANOUT - 1)
1952 printk(KERN_CONT ", "); 1948 pr_cont(", ");
1953 } 1949 }
1954 printk(KERN_CONT "\n"); 1950 pr_cont("\n");
1955 break; 1951 break;
1956 } 1952 }
1957 case UBIFS_LPT_LTAB: 1953 case UBIFS_LPT_LTAB:
1958 node_len = c->ltab_sz; 1954 node_len = c->ltab_sz;
1959 printk(KERN_DEBUG "LEB %d:%d, ltab\n", 1955 pr_err("LEB %d:%d, ltab\n", lnum, offs);
1960 lnum, offs);
1961 break; 1956 break;
1962 case UBIFS_LPT_LSAVE: 1957 case UBIFS_LPT_LSAVE:
1963 node_len = c->lsave_sz; 1958 node_len = c->lsave_sz;
1964 printk(KERN_DEBUG "LEB %d:%d, lsave len\n", lnum, offs); 1959 pr_err("LEB %d:%d, lsave len\n", lnum, offs);
1965 break; 1960 break;
1966 default: 1961 default:
1967 ubifs_err("LPT node type %d not recognized", node_type); 1962 ubifs_err("LPT node type %d not recognized", node_type);
@@ -1972,8 +1967,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1972 len -= node_len; 1967 len -= node_len;
1973 } 1968 }
1974 1969
1975 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", 1970 pr_err("(pid %d) finish dumping LEB %d\n", current->pid, lnum);
1976 current->pid, lnum);
1977out: 1971out:
1978 vfree(buf); 1972 vfree(buf);
1979 return; 1973 return;
@@ -1990,12 +1984,10 @@ void ubifs_dump_lpt_lebs(const struct ubifs_info *c)
1990{ 1984{
1991 int i; 1985 int i;
1992 1986
1993 printk(KERN_DEBUG "(pid %d) start dumping all LPT LEBs\n", 1987 pr_err("(pid %d) start dumping all LPT LEBs\n", current->pid);
1994 current->pid);
1995 for (i = 0; i < c->lpt_lebs; i++) 1988 for (i = 0; i < c->lpt_lebs; i++)
1996 dump_lpt_leb(c, i + c->lpt_first); 1989 dump_lpt_leb(c, i + c->lpt_first);
1997 printk(KERN_DEBUG "(pid %d) finish dumping all LPT LEBs\n", 1990 pr_err("(pid %d) finish dumping all LPT LEBs\n", current->pid);
1998 current->pid);
1999} 1991}
2000 1992
2001/** 1993/**
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index cebf17ea0458..769701ccb5c9 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -562,8 +562,8 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
562 562
563 list_for_each_entry(snod, &sleb->nodes, list) { 563 list_for_each_entry(snod, &sleb->nodes, list) {
564 if (snod->type != UBIFS_ORPH_NODE) { 564 if (snod->type != UBIFS_ORPH_NODE) {
565 ubifs_err("invalid node type %d in orphan area at " 565 ubifs_err("invalid node type %d in orphan area at %d:%d",
566 "%d:%d", snod->type, sleb->lnum, snod->offs); 566 snod->type, sleb->lnum, snod->offs);
567 ubifs_dump_node(c, snod->node); 567 ubifs_dump_node(c, snod->node);
568 return -EINVAL; 568 return -EINVAL;
569 } 569 }
@@ -589,8 +589,7 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
589 * number. That makes this orphan node, out of date. 589 * number. That makes this orphan node, out of date.
590 */ 590 */
591 if (!first) { 591 if (!first) {
592 ubifs_err("out of order commit number %llu in " 592 ubifs_err("out of order commit number %llu in orphan node at %d:%d",
593 "orphan node at %d:%d",
594 cmt_no, sleb->lnum, snod->offs); 593 cmt_no, sleb->lnum, snod->offs);
595 ubifs_dump_node(c, snod->node); 594 ubifs_dump_node(c, snod->node);
596 return -EINVAL; 595 return -EINVAL;
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index edeec499c048..065096e36ed9 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -609,7 +609,8 @@ static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs)
609 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, 609 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
610 list); 610 list);
611 611
612 dbg_rcvry("dropping last node at %d:%d", sleb->lnum, snod->offs); 612 dbg_rcvry("dropping last node at %d:%d",
613 sleb->lnum, snod->offs);
613 *offs = snod->offs; 614 *offs = snod->offs;
614 list_del(&snod->list); 615 list_del(&snod->list);
615 kfree(snod); 616 kfree(snod);
@@ -702,8 +703,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
702 * See header comment for this file for more 703 * See header comment for this file for more
703 * explanations about the reasons we have this check. 704 * explanations about the reasons we have this check.
704 */ 705 */
705 ubifs_err("corrupt empty space LEB %d:%d, corruption " 706 ubifs_err("corrupt empty space LEB %d:%d, corruption starts at %d",
706 "starts at %d", lnum, offs, corruption); 707 lnum, offs, corruption);
707 /* Make sure we dump interesting non-0xFF data */ 708 /* Make sure we dump interesting non-0xFF data */
708 offs += corruption; 709 offs += corruption;
709 buf += corruption; 710 buf += corruption;
@@ -899,8 +900,8 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
899 } 900 }
900 } 901 }
901 if (snod->sqnum > cs_sqnum) { 902 if (snod->sqnum > cs_sqnum) {
902 ubifs_err("unrecoverable log corruption " 903 ubifs_err("unrecoverable log corruption in LEB %d",
903 "in LEB %d", lnum); 904 lnum);
904 ubifs_scan_destroy(sleb); 905 ubifs_scan_destroy(sleb);
905 return ERR_PTR(-EUCLEAN); 906 return ERR_PTR(-EUCLEAN);
906 } 907 }
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 94d78fc5d4e0..3187925e9879 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -141,9 +141,9 @@ static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b)
141 * during the replay. 141 * during the replay.
142 */ 142 */
143 if (dirty != 0) 143 if (dirty != 0)
144 dbg_msg("LEB %d lp: %d free %d dirty " 144 dbg_mnt("LEB %d lp: %d free %d dirty replay: %d free %d dirty",
145 "replay: %d free %d dirty", b->bud->lnum, 145 b->bud->lnum, lp->free, lp->dirty, b->free,
146 lp->free, lp->dirty, b->free, b->dirty); 146 b->dirty);
147 } 147 }
148 lp = ubifs_change_lp(c, lp, b->free, dirty + b->dirty, 148 lp = ubifs_change_lp(c, lp, b->free, dirty + b->dirty,
149 lp->flags | LPROPS_TAKEN, 0); 149 lp->flags | LPROPS_TAKEN, 0);
@@ -677,7 +677,8 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
677 677
678 b->dirty = sleb->endpt - offs - used; 678 b->dirty = sleb->endpt - offs - used;
679 b->free = c->leb_size - sleb->endpt; 679 b->free = c->leb_size - sleb->endpt;
680 dbg_mnt("bud LEB %d replied: dirty %d, free %d", lnum, b->dirty, b->free); 680 dbg_mnt("bud LEB %d replied: dirty %d, free %d",
681 lnum, b->dirty, b->free);
681 682
682out: 683out:
683 ubifs_scan_destroy(sleb); 684 ubifs_scan_destroy(sleb);
@@ -865,8 +866,7 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
865 goto out_dump; 866 goto out_dump;
866 } 867 }
867 if (le64_to_cpu(node->cmt_no) != c->cmt_no) { 868 if (le64_to_cpu(node->cmt_no) != c->cmt_no) {
868 ubifs_err("first CS node at LEB %d:%d has wrong " 869 ubifs_err("first CS node at LEB %d:%d has wrong commit number %llu expected %llu",
869 "commit number %llu expected %llu",
870 lnum, offs, 870 lnum, offs,
871 (unsigned long long)le64_to_cpu(node->cmt_no), 871 (unsigned long long)le64_to_cpu(node->cmt_no),
872 c->cmt_no); 872 c->cmt_no);
@@ -1058,8 +1058,8 @@ int ubifs_replay_journal(struct ubifs_info *c)
1058 c->bi.uncommitted_idx *= c->max_idx_node_sz; 1058 c->bi.uncommitted_idx *= c->max_idx_node_sz;
1059 1059
1060 ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery); 1060 ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
1061 dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, " 1061 dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, highest_inum %lu",
1062 "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum, 1062 c->lhead_lnum, c->lhead_offs, c->max_sqnum,
1063 (unsigned long)c->highest_inum); 1063 (unsigned long)c->highest_inum);
1064out: 1064out:
1065 destroy_replay_list(c); 1065 destroy_replay_list(c);
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 15e2fc5aa60b..4c37607a958e 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -391,9 +391,8 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
391 min_leb_cnt += c->lpt_lebs + c->orph_lebs + c->jhead_cnt + 6; 391 min_leb_cnt += c->lpt_lebs + c->orph_lebs + c->jhead_cnt + 6;
392 392
393 if (c->leb_cnt < min_leb_cnt || c->leb_cnt > c->vi.size) { 393 if (c->leb_cnt < min_leb_cnt || c->leb_cnt > c->vi.size) {
394 ubifs_err("bad LEB count: %d in superblock, %d on UBI volume, " 394 ubifs_err("bad LEB count: %d in superblock, %d on UBI volume, %d minimum required",
395 "%d minimum required", c->leb_cnt, c->vi.size, 395 c->leb_cnt, c->vi.size, min_leb_cnt);
396 min_leb_cnt);
397 goto failed; 396 goto failed;
398 } 397 }
399 398
@@ -411,15 +410,14 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
411 410
412 max_bytes = (long long)c->leb_size * UBIFS_MIN_BUD_LEBS; 411 max_bytes = (long long)c->leb_size * UBIFS_MIN_BUD_LEBS;
413 if (c->max_bud_bytes < max_bytes) { 412 if (c->max_bud_bytes < max_bytes) {
414 ubifs_err("too small journal (%lld bytes), must be at least " 413 ubifs_err("too small journal (%lld bytes), must be at least %lld bytes",
415 "%lld bytes", c->max_bud_bytes, max_bytes); 414 c->max_bud_bytes, max_bytes);
416 goto failed; 415 goto failed;
417 } 416 }
418 417
419 max_bytes = (long long)c->leb_size * c->main_lebs; 418 max_bytes = (long long)c->leb_size * c->main_lebs;
420 if (c->max_bud_bytes > max_bytes) { 419 if (c->max_bud_bytes > max_bytes) {
421 ubifs_err("too large journal size (%lld bytes), only %lld bytes" 420 ubifs_err("too large journal size (%lld bytes), only %lld bytes available in the main area",
422 "available in the main area",
423 c->max_bud_bytes, max_bytes); 421 c->max_bud_bytes, max_bytes);
424 goto failed; 422 goto failed;
425 } 423 }
@@ -549,10 +547,9 @@ int ubifs_read_superblock(struct ubifs_info *c)
549 ubifs_assert(!c->ro_media || c->ro_mount); 547 ubifs_assert(!c->ro_media || c->ro_mount);
550 if (!c->ro_mount || 548 if (!c->ro_mount ||
551 c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) { 549 c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
552 ubifs_err("on-flash format version is w%d/r%d, but " 550 ubifs_err("on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d",
553 "software only supports up to version " 551 c->fmt_version, c->ro_compat_version,
554 "w%d/r%d", c->fmt_version, 552 UBIFS_FORMAT_VERSION,
555 c->ro_compat_version, UBIFS_FORMAT_VERSION,
556 UBIFS_RO_COMPAT_VERSION); 553 UBIFS_RO_COMPAT_VERSION);
557 if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) { 554 if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) {
558 ubifs_msg("only R/O mounting is possible"); 555 ubifs_msg("only R/O mounting is possible");
@@ -611,8 +608,8 @@ int ubifs_read_superblock(struct ubifs_info *c)
611 c->fanout = le32_to_cpu(sup->fanout); 608 c->fanout = le32_to_cpu(sup->fanout);
612 c->lsave_cnt = le32_to_cpu(sup->lsave_cnt); 609 c->lsave_cnt = le32_to_cpu(sup->lsave_cnt);
613 c->rp_size = le64_to_cpu(sup->rp_size); 610 c->rp_size = le64_to_cpu(sup->rp_size);
614 c->rp_uid = le32_to_cpu(sup->rp_uid); 611 c->rp_uid = make_kuid(&init_user_ns, le32_to_cpu(sup->rp_uid));
615 c->rp_gid = le32_to_cpu(sup->rp_gid); 612 c->rp_gid = make_kgid(&init_user_ns, le32_to_cpu(sup->rp_gid));
616 sup_flags = le32_to_cpu(sup->flags); 613 sup_flags = le32_to_cpu(sup->flags);
617 if (!c->mount_opts.override_compr) 614 if (!c->mount_opts.override_compr)
618 c->default_compr = le16_to_cpu(sup->default_compr); 615 c->default_compr = le16_to_cpu(sup->default_compr);
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 7c40e6025fd6..58aa05df2bb6 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -75,7 +75,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
75 magic = le32_to_cpu(ch->magic); 75 magic = le32_to_cpu(ch->magic);
76 76
77 if (magic == 0xFFFFFFFF) { 77 if (magic == 0xFFFFFFFF) {
78 dbg_scan("hit empty space"); 78 dbg_scan("hit empty space at LEB %d:%d", lnum, offs);
79 return SCANNED_EMPTY_SPACE; 79 return SCANNED_EMPTY_SPACE;
80 } 80 }
81 81
@@ -85,7 +85,8 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
85 if (len < UBIFS_CH_SZ) 85 if (len < UBIFS_CH_SZ)
86 return SCANNED_GARBAGE; 86 return SCANNED_GARBAGE;
87 87
88 dbg_scan("scanning %s", dbg_ntype(ch->node_type)); 88 dbg_scan("scanning %s at LEB %d:%d",
89 dbg_ntype(ch->node_type), lnum, offs);
89 90
90 if (ubifs_check_node(c, buf, lnum, offs, quiet, 1)) 91 if (ubifs_check_node(c, buf, lnum, offs, quiet, 1))
91 return SCANNED_A_CORRUPT_NODE; 92 return SCANNED_A_CORRUPT_NODE;
@@ -114,8 +115,8 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
114 return SCANNED_A_BAD_PAD_NODE; 115 return SCANNED_A_BAD_PAD_NODE;
115 } 116 }
116 117
117 dbg_scan("%d bytes padded, offset now %d", 118 dbg_scan("%d bytes padded at LEB %d:%d, offset now %d", pad_len,
118 pad_len, ALIGN(offs + node_len + pad_len, 8)); 119 lnum, offs, ALIGN(offs + node_len + pad_len, 8));
119 120
120 return node_len + pad_len; 121 return node_len + pad_len;
121 } 122 }
@@ -150,8 +151,8 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
150 151
151 err = ubifs_leb_read(c, lnum, sbuf + offs, offs, c->leb_size - offs, 0); 152 err = ubifs_leb_read(c, lnum, sbuf + offs, offs, c->leb_size - offs, 0);
152 if (err && err != -EBADMSG) { 153 if (err && err != -EBADMSG) {
153 ubifs_err("cannot read %d bytes from LEB %d:%d," 154 ubifs_err("cannot read %d bytes from LEB %d:%d, error %d",
154 " error %d", c->leb_size - offs, lnum, offs, err); 155 c->leb_size - offs, lnum, offs, err);
155 kfree(sleb); 156 kfree(sleb);
156 return ERR_PTR(err); 157 return ERR_PTR(err);
157 } 158 }
@@ -240,8 +241,6 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
240 int len; 241 int len;
241 242
242 ubifs_err("corruption at LEB %d:%d", lnum, offs); 243 ubifs_err("corruption at LEB %d:%d", lnum, offs);
243 if (dbg_is_tst_rcvry(c))
244 return;
245 len = c->leb_size - offs; 244 len = c->leb_size - offs;
246 if (len > 8192) 245 if (len > 8192)
247 len = 8192; 246 len = 8192;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 71a197f0f93d..ddc0f6ae65e9 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -89,9 +89,8 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode)
89 return 5; 89 return 5;
90 90
91 if (!ubifs_compr_present(ui->compr_type)) { 91 if (!ubifs_compr_present(ui->compr_type)) {
92 ubifs_warn("inode %lu uses '%s' compression, but it was not " 92 ubifs_warn("inode %lu uses '%s' compression, but it was not compiled in",
93 "compiled in", inode->i_ino, 93 inode->i_ino, ubifs_compr_name(ui->compr_type));
94 ubifs_compr_name(ui->compr_type));
95 } 94 }
96 95
97 err = dbg_check_dir(c, inode); 96 err = dbg_check_dir(c, inode);
@@ -130,8 +129,8 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
130 129
131 inode->i_flags |= (S_NOCMTIME | S_NOATIME); 130 inode->i_flags |= (S_NOCMTIME | S_NOATIME);
132 set_nlink(inode, le32_to_cpu(ino->nlink)); 131 set_nlink(inode, le32_to_cpu(ino->nlink));
133 inode->i_uid = le32_to_cpu(ino->uid); 132 i_uid_write(inode, le32_to_cpu(ino->uid));
134 inode->i_gid = le32_to_cpu(ino->gid); 133 i_gid_write(inode, le32_to_cpu(ino->gid));
135 inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec); 134 inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec);
136 inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec); 135 inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec);
137 inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec); 136 inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec);
@@ -1061,8 +1060,8 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
1061 1060
1062 flag = parse_standard_option(p); 1061 flag = parse_standard_option(p);
1063 if (!flag) { 1062 if (!flag) {
1064 ubifs_err("unrecognized mount option \"%s\" " 1063 ubifs_err("unrecognized mount option \"%s\" or missing value",
1065 "or missing value", p); 1064 p);
1066 return -EINVAL; 1065 return -EINVAL;
1067 } 1066 }
1068 sb->s_flags |= flag; 1067 sb->s_flags |= flag;
@@ -1124,8 +1123,8 @@ again:
1124 } 1123 }
1125 1124
1126 /* Just disable bulk-read */ 1125 /* Just disable bulk-read */
1127 ubifs_warn("Cannot allocate %d bytes of memory for bulk-read, " 1126 ubifs_warn("cannot allocate %d bytes of memory for bulk-read, disabling it",
1128 "disabling it", c->max_bu_buf_len); 1127 c->max_bu_buf_len);
1129 c->mount_opts.bulk_read = 1; 1128 c->mount_opts.bulk_read = 1;
1130 c->bulk_read = 0; 1129 c->bulk_read = 0;
1131 return; 1130 return;
@@ -1161,7 +1160,7 @@ static int check_free_space(struct ubifs_info *c)
1161static int mount_ubifs(struct ubifs_info *c) 1160static int mount_ubifs(struct ubifs_info *c)
1162{ 1161{
1163 int err; 1162 int err;
1164 long long x; 1163 long long x, y;
1165 size_t sz; 1164 size_t sz;
1166 1165
1167 c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY); 1166 c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY);
@@ -1411,75 +1410,69 @@ static int mount_ubifs(struct ubifs_info *c)
1411 1410
1412 c->mounting = 0; 1411 c->mounting = 0;
1413 1412
1414 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", 1413 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"%s",
1415 c->vi.ubi_num, c->vi.vol_id, c->vi.name); 1414 c->vi.ubi_num, c->vi.vol_id, c->vi.name,
1416 if (c->ro_mount) 1415 c->ro_mount ? ", R/O mode" : NULL);
1417 ubifs_msg("mounted read-only");
1418 x = (long long)c->main_lebs * c->leb_size; 1416 x = (long long)c->main_lebs * c->leb_size;
1419 ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d " 1417 y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
1420 "LEBs)", x, x >> 10, x >> 20, c->main_lebs); 1418 ubifs_msg("LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes",
1421 x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; 1419 c->leb_size, c->leb_size >> 10, c->min_io_size,
1422 ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " 1420 c->max_write_size);
1423 "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); 1421 ubifs_msg("FS size: %lld bytes (%lld MiB, %d LEBs), journal size %lld bytes (%lld MiB, %d LEBs)",
1424 ubifs_msg("media format: w%d/r%d (latest is w%d/r%d)", 1422 x, x >> 20, c->main_lebs,
1423 y, y >> 20, c->log_lebs + c->max_bud_cnt);
1424 ubifs_msg("reserved for root: %llu bytes (%llu KiB)",
1425 c->report_rp_size, c->report_rp_size >> 10);
1426 ubifs_msg("media format: w%d/r%d (latest is w%d/r%d), UUID %pUB%s",
1425 c->fmt_version, c->ro_compat_version, 1427 c->fmt_version, c->ro_compat_version,
1426 UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION); 1428 UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION, c->uuid,
1427 ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); 1429 c->big_lpt ? ", big LPT model" : ", small LPT model");
1428 ubifs_msg("reserved for root: %llu bytes (%llu KiB)", 1430
1429 c->report_rp_size, c->report_rp_size >> 10); 1431 dbg_gen("default compressor: %s", ubifs_compr_name(c->default_compr));
1430 1432 dbg_gen("data journal heads: %d",
1431 dbg_msg("compiled on: " __DATE__ " at " __TIME__);
1432 dbg_msg("min. I/O unit size: %d bytes", c->min_io_size);
1433 dbg_msg("max. write size: %d bytes", c->max_write_size);
1434 dbg_msg("LEB size: %d bytes (%d KiB)",
1435 c->leb_size, c->leb_size >> 10);
1436 dbg_msg("data journal heads: %d",
1437 c->jhead_cnt - NONDATA_JHEADS_CNT); 1433 c->jhead_cnt - NONDATA_JHEADS_CNT);
1438 dbg_msg("UUID: %pUB", c->uuid); 1434 dbg_gen("log LEBs: %d (%d - %d)",
1439 dbg_msg("big_lpt %d", c->big_lpt);
1440 dbg_msg("log LEBs: %d (%d - %d)",
1441 c->log_lebs, UBIFS_LOG_LNUM, c->log_last); 1435 c->log_lebs, UBIFS_LOG_LNUM, c->log_last);
1442 dbg_msg("LPT area LEBs: %d (%d - %d)", 1436 dbg_gen("LPT area LEBs: %d (%d - %d)",
1443 c->lpt_lebs, c->lpt_first, c->lpt_last); 1437 c->lpt_lebs, c->lpt_first, c->lpt_last);
1444 dbg_msg("orphan area LEBs: %d (%d - %d)", 1438 dbg_gen("orphan area LEBs: %d (%d - %d)",
1445 c->orph_lebs, c->orph_first, c->orph_last); 1439 c->orph_lebs, c->orph_first, c->orph_last);
1446 dbg_msg("main area LEBs: %d (%d - %d)", 1440 dbg_gen("main area LEBs: %d (%d - %d)",
1447 c->main_lebs, c->main_first, c->leb_cnt - 1); 1441 c->main_lebs, c->main_first, c->leb_cnt - 1);
1448 dbg_msg("index LEBs: %d", c->lst.idx_lebs); 1442 dbg_gen("index LEBs: %d", c->lst.idx_lebs);
1449 dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)", 1443 dbg_gen("total index bytes: %lld (%lld KiB, %lld MiB)",
1450 c->bi.old_idx_sz, c->bi.old_idx_sz >> 10, 1444 c->bi.old_idx_sz, c->bi.old_idx_sz >> 10,
1451 c->bi.old_idx_sz >> 20); 1445 c->bi.old_idx_sz >> 20);
1452 dbg_msg("key hash type: %d", c->key_hash_type); 1446 dbg_gen("key hash type: %d", c->key_hash_type);
1453 dbg_msg("tree fanout: %d", c->fanout); 1447 dbg_gen("tree fanout: %d", c->fanout);
1454 dbg_msg("reserved GC LEB: %d", c->gc_lnum); 1448 dbg_gen("reserved GC LEB: %d", c->gc_lnum);
1455 dbg_msg("first main LEB: %d", c->main_first); 1449 dbg_gen("max. znode size %d", c->max_znode_sz);
1456 dbg_msg("max. znode size %d", c->max_znode_sz); 1450 dbg_gen("max. index node size %d", c->max_idx_node_sz);
1457 dbg_msg("max. index node size %d", c->max_idx_node_sz); 1451 dbg_gen("node sizes: data %zu, inode %zu, dentry %zu",
1458 dbg_msg("node sizes: data %zu, inode %zu, dentry %zu",
1459 UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ); 1452 UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ);
1460 dbg_msg("node sizes: trun %zu, sb %zu, master %zu", 1453 dbg_gen("node sizes: trun %zu, sb %zu, master %zu",
1461 UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ); 1454 UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ);
1462 dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu", 1455 dbg_gen("node sizes: ref %zu, cmt. start %zu, orph %zu",
1463 UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); 1456 UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
1464 dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d", 1457 dbg_gen("max. node sizes: data %zu, inode %zu dentry %zu, idx %d",
1465 UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, 1458 UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
1466 UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout)); 1459 UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout));
1467 dbg_msg("dead watermark: %d", c->dead_wm); 1460 dbg_gen("dead watermark: %d", c->dead_wm);
1468 dbg_msg("dark watermark: %d", c->dark_wm); 1461 dbg_gen("dark watermark: %d", c->dark_wm);
1469 dbg_msg("LEB overhead: %d", c->leb_overhead); 1462 dbg_gen("LEB overhead: %d", c->leb_overhead);
1470 x = (long long)c->main_lebs * c->dark_wm; 1463 x = (long long)c->main_lebs * c->dark_wm;
1471 dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)", 1464 dbg_gen("max. dark space: %lld (%lld KiB, %lld MiB)",
1472 x, x >> 10, x >> 20); 1465 x, x >> 10, x >> 20);
1473 dbg_msg("maximum bud bytes: %lld (%lld KiB, %lld MiB)", 1466 dbg_gen("maximum bud bytes: %lld (%lld KiB, %lld MiB)",
1474 c->max_bud_bytes, c->max_bud_bytes >> 10, 1467 c->max_bud_bytes, c->max_bud_bytes >> 10,
1475 c->max_bud_bytes >> 20); 1468 c->max_bud_bytes >> 20);
1476 dbg_msg("BG commit bud bytes: %lld (%lld KiB, %lld MiB)", 1469 dbg_gen("BG commit bud bytes: %lld (%lld KiB, %lld MiB)",
1477 c->bg_bud_bytes, c->bg_bud_bytes >> 10, 1470 c->bg_bud_bytes, c->bg_bud_bytes >> 10,
1478 c->bg_bud_bytes >> 20); 1471 c->bg_bud_bytes >> 20);
1479 dbg_msg("current bud bytes %lld (%lld KiB, %lld MiB)", 1472 dbg_gen("current bud bytes %lld (%lld KiB, %lld MiB)",
1480 c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20); 1473 c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20);
1481 dbg_msg("max. seq. number: %llu", c->max_sqnum); 1474 dbg_gen("max. seq. number: %llu", c->max_sqnum);
1482 dbg_msg("commit number: %llu", c->cmt_no); 1475 dbg_gen("commit number: %llu", c->cmt_no);
1483 1476
1484 return 0; 1477 return 0;
1485 1478
@@ -1564,10 +1557,9 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1564 1557
1565 if (c->rw_incompat) { 1558 if (c->rw_incompat) {
1566 ubifs_err("the file-system is not R/W-compatible"); 1559 ubifs_err("the file-system is not R/W-compatible");
1567 ubifs_msg("on-flash format version is w%d/r%d, but software " 1560 ubifs_msg("on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d",
1568 "only supports up to version w%d/r%d", c->fmt_version, 1561 c->fmt_version, c->ro_compat_version,
1569 c->ro_compat_version, UBIFS_FORMAT_VERSION, 1562 UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION);
1570 UBIFS_RO_COMPAT_VERSION);
1571 return -EROFS; 1563 return -EROFS;
1572 } 1564 }
1573 1565
@@ -1828,8 +1820,8 @@ static void ubifs_put_super(struct super_block *sb)
1828 * next mount, so we just print a message and 1820 * next mount, so we just print a message and
1829 * continue to unmount normally. 1821 * continue to unmount normally.
1830 */ 1822 */
1831 ubifs_err("failed to write master node, " 1823 ubifs_err("failed to write master node, error %d",
1832 "error %d", err); 1824 err);
1833 } else { 1825 } else {
1834 for (i = 0; i < c->jhead_cnt; i++) 1826 for (i = 0; i < c->jhead_cnt; i++)
1835 /* Make sure write-buffer timers are canceled */ 1827 /* Make sure write-buffer timers are canceled */
@@ -2248,8 +2240,7 @@ static int __init ubifs_init(void)
2248 * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2. 2240 * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2.
2249 */ 2241 */
2250 if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) { 2242 if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) {
2251 ubifs_err("VFS page cache size is %u bytes, but UBIFS requires" 2243 ubifs_err("VFS page cache size is %u bytes, but UBIFS requires at least 4096 bytes",
2252 " at least 4096 bytes",
2253 (unsigned int)PAGE_CACHE_SIZE); 2244 (unsigned int)PAGE_CACHE_SIZE);
2254 return -EINVAL; 2245 return -EINVAL;
2255 } 2246 }
@@ -2298,6 +2289,12 @@ static void __exit ubifs_exit(void)
2298 dbg_debugfs_exit(); 2289 dbg_debugfs_exit();
2299 ubifs_compressors_exit(); 2290 ubifs_compressors_exit();
2300 unregister_shrinker(&ubifs_shrinker_info); 2291 unregister_shrinker(&ubifs_shrinker_info);
2292
2293 /*
2294 * Make sure all delayed rcu free inodes are flushed before we
2295 * destroy cache.
2296 */
2297 rcu_barrier();
2301 kmem_cache_destroy(ubifs_inode_slab); 2298 kmem_cache_destroy(ubifs_inode_slab);
2302 unregister_filesystem(&ubifs_fs_type); 2299 unregister_filesystem(&ubifs_fs_type);
2303} 2300}
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index d38ac7f9654b..f6bf8995c7b1 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -328,8 +328,8 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
328 case UBIFS_XENT_KEY: 328 case UBIFS_XENT_KEY:
329 break; 329 break;
330 default: 330 default:
331 dbg_msg("bad key type at slot %d: %d", 331 ubifs_err("bad key type at slot %d: %d",
332 i, key_type(c, &zbr->key)); 332 i, key_type(c, &zbr->key));
333 err = 3; 333 err = 3;
334 goto out_dump; 334 goto out_dump;
335 } 335 }
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 1e5a08623d11..5486346d0a3f 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -42,16 +42,15 @@
42#define UBIFS_VERSION 1 42#define UBIFS_VERSION 1
43 43
44/* Normal UBIFS messages */ 44/* Normal UBIFS messages */
45#define ubifs_msg(fmt, ...) \ 45#define ubifs_msg(fmt, ...) pr_notice("UBIFS: " fmt "\n", ##__VA_ARGS__)
46 printk(KERN_NOTICE "UBIFS: " fmt "\n", ##__VA_ARGS__)
47/* UBIFS error messages */ 46/* UBIFS error messages */
48#define ubifs_err(fmt, ...) \ 47#define ubifs_err(fmt, ...) \
49 printk(KERN_ERR "UBIFS error (pid %d): %s: " fmt "\n", current->pid, \ 48 pr_err("UBIFS error (pid %d): %s: " fmt "\n", current->pid, \
50 __func__, ##__VA_ARGS__) 49 __func__, ##__VA_ARGS__)
51/* UBIFS warning messages */ 50/* UBIFS warning messages */
52#define ubifs_warn(fmt, ...) \ 51#define ubifs_warn(fmt, ...) \
53 printk(KERN_WARNING "UBIFS warning (pid %d): %s: " fmt "\n", \ 52 pr_warn("UBIFS warning (pid %d): %s: " fmt "\n", \
54 current->pid, __func__, ##__VA_ARGS__) 53 current->pid, __func__, ##__VA_ARGS__)
55 54
56/* UBIFS file system VFS magic number */ 55/* UBIFS file system VFS magic number */
57#define UBIFS_SUPER_MAGIC 0x24051905 56#define UBIFS_SUPER_MAGIC 0x24051905
@@ -1426,8 +1425,8 @@ struct ubifs_info {
1426 1425
1427 long long rp_size; 1426 long long rp_size;
1428 long long report_rp_size; 1427 long long report_rp_size;
1429 uid_t rp_uid; 1428 kuid_t rp_uid;
1430 gid_t rp_gid; 1429 kgid_t rp_gid;
1431 1430
1432 /* The below fields are used only during mounting and re-mounting */ 1431 /* The below fields are used only during mounting and re-mounting */
1433 unsigned int empty:1; 1432 unsigned int empty:1;
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 7f3f7ba3df6e..77b5953eaac8 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -39,20 +39,24 @@
39#include "udf_i.h" 39#include "udf_i.h"
40#include "udf_sb.h" 40#include "udf_sb.h"
41 41
42static int udf_adinicb_readpage(struct file *file, struct page *page) 42static void __udf_adinicb_readpage(struct page *page)
43{ 43{
44 struct inode *inode = page->mapping->host; 44 struct inode *inode = page->mapping->host;
45 char *kaddr; 45 char *kaddr;
46 struct udf_inode_info *iinfo = UDF_I(inode); 46 struct udf_inode_info *iinfo = UDF_I(inode);
47 47
48 BUG_ON(!PageLocked(page));
49
50 kaddr = kmap(page); 48 kaddr = kmap(page);
51 memset(kaddr, 0, PAGE_CACHE_SIZE);
52 memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size); 49 memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size);
50 memset(kaddr + inode->i_size, 0, PAGE_CACHE_SIZE - inode->i_size);
53 flush_dcache_page(page); 51 flush_dcache_page(page);
54 SetPageUptodate(page); 52 SetPageUptodate(page);
55 kunmap(page); 53 kunmap(page);
54}
55
56static int udf_adinicb_readpage(struct file *file, struct page *page)
57{
58 BUG_ON(!PageLocked(page));
59 __udf_adinicb_readpage(page);
56 unlock_page(page); 60 unlock_page(page);
57 61
58 return 0; 62 return 0;
@@ -77,6 +81,25 @@ static int udf_adinicb_writepage(struct page *page,
77 return 0; 81 return 0;
78} 82}
79 83
84static int udf_adinicb_write_begin(struct file *file,
85 struct address_space *mapping, loff_t pos,
86 unsigned len, unsigned flags, struct page **pagep,
87 void **fsdata)
88{
89 struct page *page;
90
91 if (WARN_ON_ONCE(pos >= PAGE_CACHE_SIZE))
92 return -EIO;
93 page = grab_cache_page_write_begin(mapping, 0, flags);
94 if (!page)
95 return -ENOMEM;
96 *pagep = page;
97
98 if (!PageUptodate(page) && len != PAGE_CACHE_SIZE)
99 __udf_adinicb_readpage(page);
100 return 0;
101}
102
80static int udf_adinicb_write_end(struct file *file, 103static int udf_adinicb_write_end(struct file *file,
81 struct address_space *mapping, 104 struct address_space *mapping,
82 loff_t pos, unsigned len, unsigned copied, 105 loff_t pos, unsigned len, unsigned copied,
@@ -95,11 +118,20 @@ static int udf_adinicb_write_end(struct file *file,
95 return simple_write_end(file, mapping, pos, len, copied, page, fsdata); 118 return simple_write_end(file, mapping, pos, len, copied, page, fsdata);
96} 119}
97 120
121static ssize_t udf_adinicb_direct_IO(int rw, struct kiocb *iocb,
122 const struct iovec *iov,
123 loff_t offset, unsigned long nr_segs)
124{
125 /* Fallback to buffered I/O. */
126 return 0;
127}
128
98const struct address_space_operations udf_adinicb_aops = { 129const struct address_space_operations udf_adinicb_aops = {
99 .readpage = udf_adinicb_readpage, 130 .readpage = udf_adinicb_readpage,
100 .writepage = udf_adinicb_writepage, 131 .writepage = udf_adinicb_writepage,
101 .write_begin = simple_write_begin, 132 .write_begin = udf_adinicb_write_begin,
102 .write_end = udf_adinicb_write_end, 133 .write_end = udf_adinicb_write_end,
134 .direct_IO = udf_adinicb_direct_IO,
103}; 135};
104 136
105static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 137static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index aa233469b3c1..df88b957ccf0 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -95,11 +95,33 @@ void udf_evict_inode(struct inode *inode)
95 } 95 }
96} 96}
97 97
98static void udf_write_failed(struct address_space *mapping, loff_t to)
99{
100 struct inode *inode = mapping->host;
101 struct udf_inode_info *iinfo = UDF_I(inode);
102 loff_t isize = inode->i_size;
103
104 if (to > isize) {
105 truncate_pagecache(inode, to, isize);
106 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
107 down_write(&iinfo->i_data_sem);
108 udf_truncate_extents(inode);
109 up_write(&iinfo->i_data_sem);
110 }
111 }
112}
113
98static int udf_writepage(struct page *page, struct writeback_control *wbc) 114static int udf_writepage(struct page *page, struct writeback_control *wbc)
99{ 115{
100 return block_write_full_page(page, udf_get_block, wbc); 116 return block_write_full_page(page, udf_get_block, wbc);
101} 117}
102 118
119static int udf_writepages(struct address_space *mapping,
120 struct writeback_control *wbc)
121{
122 return mpage_writepages(mapping, wbc, udf_get_block);
123}
124
103static int udf_readpage(struct file *file, struct page *page) 125static int udf_readpage(struct file *file, struct page *page)
104{ 126{
105 return mpage_readpage(page, udf_get_block); 127 return mpage_readpage(page, udf_get_block);
@@ -118,21 +140,24 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
118 int ret; 140 int ret;
119 141
120 ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block); 142 ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block);
121 if (unlikely(ret)) { 143 if (unlikely(ret))
122 struct inode *inode = mapping->host; 144 udf_write_failed(mapping, pos + len);
123 struct udf_inode_info *iinfo = UDF_I(inode); 145 return ret;
124 loff_t isize = inode->i_size; 146}
125
126 if (pos + len > isize) {
127 truncate_pagecache(inode, pos + len, isize);
128 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
129 down_write(&iinfo->i_data_sem);
130 udf_truncate_extents(inode);
131 up_write(&iinfo->i_data_sem);
132 }
133 }
134 }
135 147
148static ssize_t udf_direct_IO(int rw, struct kiocb *iocb,
149 const struct iovec *iov,
150 loff_t offset, unsigned long nr_segs)
151{
152 struct file *file = iocb->ki_filp;
153 struct address_space *mapping = file->f_mapping;
154 struct inode *inode = mapping->host;
155 ssize_t ret;
156
157 ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
158 udf_get_block);
159 if (unlikely(ret < 0 && (rw & WRITE)))
160 udf_write_failed(mapping, offset + iov_length(iov, nr_segs));
136 return ret; 161 return ret;
137} 162}
138 163
@@ -145,8 +170,10 @@ const struct address_space_operations udf_aops = {
145 .readpage = udf_readpage, 170 .readpage = udf_readpage,
146 .readpages = udf_readpages, 171 .readpages = udf_readpages,
147 .writepage = udf_writepage, 172 .writepage = udf_writepage,
148 .write_begin = udf_write_begin, 173 .writepages = udf_writepages,
149 .write_end = generic_write_end, 174 .write_begin = udf_write_begin,
175 .write_end = generic_write_end,
176 .direct_IO = udf_direct_IO,
150 .bmap = udf_bmap, 177 .bmap = udf_bmap,
151}; 178};
152 179
@@ -1312,14 +1339,14 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1312 } 1339 }
1313 1340
1314 read_lock(&sbi->s_cred_lock); 1341 read_lock(&sbi->s_cred_lock);
1315 inode->i_uid = le32_to_cpu(fe->uid); 1342 i_uid_write(inode, le32_to_cpu(fe->uid));
1316 if (inode->i_uid == -1 || 1343 if (!uid_valid(inode->i_uid) ||
1317 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) || 1344 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) ||
1318 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_SET)) 1345 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_SET))
1319 inode->i_uid = UDF_SB(inode->i_sb)->s_uid; 1346 inode->i_uid = UDF_SB(inode->i_sb)->s_uid;
1320 1347
1321 inode->i_gid = le32_to_cpu(fe->gid); 1348 i_gid_write(inode, le32_to_cpu(fe->gid));
1322 if (inode->i_gid == -1 || 1349 if (!gid_valid(inode->i_gid) ||
1323 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_IGNORE) || 1350 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_IGNORE) ||
1324 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET)) 1351 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET))
1325 inode->i_gid = UDF_SB(inode->i_sb)->s_gid; 1352 inode->i_gid = UDF_SB(inode->i_sb)->s_gid;
@@ -1542,12 +1569,12 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1542 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) 1569 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET))
1543 fe->uid = cpu_to_le32(-1); 1570 fe->uid = cpu_to_le32(-1);
1544 else 1571 else
1545 fe->uid = cpu_to_le32(inode->i_uid); 1572 fe->uid = cpu_to_le32(i_uid_read(inode));
1546 1573
1547 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_FORGET)) 1574 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_FORGET))
1548 fe->gid = cpu_to_le32(-1); 1575 fe->gid = cpu_to_le32(-1);
1549 else 1576 else
1550 fe->gid = cpu_to_le32(inode->i_gid); 1577 fe->gid = cpu_to_le32(i_gid_read(inode));
1551 1578
1552 udfperms = ((inode->i_mode & S_IRWXO)) | 1579 udfperms = ((inode->i_mode & S_IRWXO)) |
1553 ((inode->i_mode & S_IRWXG) << 2) | 1580 ((inode->i_mode & S_IRWXG) << 2) |
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 18fc038a438d..d44fb568abe1 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -171,6 +171,11 @@ static int init_inodecache(void)
171 171
172static void destroy_inodecache(void) 172static void destroy_inodecache(void)
173{ 173{
174 /*
175 * Make sure all delayed rcu free inodes are flushed before we
176 * destroy cache.
177 */
178 rcu_barrier();
174 kmem_cache_destroy(udf_inode_cachep); 179 kmem_cache_destroy(udf_inode_cachep);
175} 180}
176 181
@@ -199,8 +204,8 @@ struct udf_options {
199 unsigned int rootdir; 204 unsigned int rootdir;
200 unsigned int flags; 205 unsigned int flags;
201 umode_t umask; 206 umode_t umask;
202 gid_t gid; 207 kgid_t gid;
203 uid_t uid; 208 kuid_t uid;
204 umode_t fmode; 209 umode_t fmode;
205 umode_t dmode; 210 umode_t dmode;
206 struct nls_table *nls_map; 211 struct nls_table *nls_map;
@@ -335,9 +340,9 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root)
335 if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_IGNORE)) 340 if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_IGNORE))
336 seq_puts(seq, ",gid=ignore"); 341 seq_puts(seq, ",gid=ignore");
337 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET)) 342 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET))
338 seq_printf(seq, ",uid=%u", sbi->s_uid); 343 seq_printf(seq, ",uid=%u", from_kuid(&init_user_ns, sbi->s_uid));
339 if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET)) 344 if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET))
340 seq_printf(seq, ",gid=%u", sbi->s_gid); 345 seq_printf(seq, ",gid=%u", from_kgid(&init_user_ns, sbi->s_gid));
341 if (sbi->s_umask != 0) 346 if (sbi->s_umask != 0)
342 seq_printf(seq, ",umask=%ho", sbi->s_umask); 347 seq_printf(seq, ",umask=%ho", sbi->s_umask);
343 if (sbi->s_fmode != UDF_INVALID_MODE) 348 if (sbi->s_fmode != UDF_INVALID_MODE)
@@ -516,13 +521,17 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
516 case Opt_gid: 521 case Opt_gid:
517 if (match_int(args, &option)) 522 if (match_int(args, &option))
518 return 0; 523 return 0;
519 uopt->gid = option; 524 uopt->gid = make_kgid(current_user_ns(), option);
525 if (!gid_valid(uopt->gid))
526 return 0;
520 uopt->flags |= (1 << UDF_FLAG_GID_SET); 527 uopt->flags |= (1 << UDF_FLAG_GID_SET);
521 break; 528 break;
522 case Opt_uid: 529 case Opt_uid:
523 if (match_int(args, &option)) 530 if (match_int(args, &option))
524 return 0; 531 return 0;
525 uopt->uid = option; 532 uopt->uid = make_kuid(current_user_ns(), option);
533 if (!uid_valid(uopt->uid))
534 return 0;
526 uopt->flags |= (1 << UDF_FLAG_UID_SET); 535 uopt->flags |= (1 << UDF_FLAG_UID_SET);
527 break; 536 break;
528 case Opt_umask: 537 case Opt_umask:
@@ -1934,8 +1943,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1934 struct udf_sb_info *sbi; 1943 struct udf_sb_info *sbi;
1935 1944
1936 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); 1945 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
1937 uopt.uid = -1; 1946 uopt.uid = INVALID_UID;
1938 uopt.gid = -1; 1947 uopt.gid = INVALID_GID;
1939 uopt.umask = 0; 1948 uopt.umask = 0;
1940 uopt.fmode = UDF_INVALID_MODE; 1949 uopt.fmode = UDF_INVALID_MODE;
1941 uopt.dmode = UDF_INVALID_MODE; 1950 uopt.dmode = UDF_INVALID_MODE;
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 42ad69ac9576..5f027227f085 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -128,8 +128,8 @@ struct udf_sb_info {
128 128
129 /* Default permissions */ 129 /* Default permissions */
130 umode_t s_umask; 130 umode_t s_umask;
131 gid_t s_gid; 131 kgid_t s_gid;
132 uid_t s_uid; 132 kuid_t s_uid;
133 umode_t s_fmode; 133 umode_t s_fmode;
134 umode_t s_dmode; 134 umode_t s_dmode;
135 /* Lock protecting consistency of above permission settings */ 135 /* Lock protecting consistency of above permission settings */
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 1b3e410bf334..a7ea492ae660 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -54,7 +54,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
54 if (ufs_fragnum(fragment) + count > uspi->s_fpg) 54 if (ufs_fragnum(fragment) + count > uspi->s_fpg)
55 ufs_error (sb, "ufs_free_fragments", "internal error"); 55 ufs_error (sb, "ufs_free_fragments", "internal error");
56 56
57 lock_super(sb); 57 mutex_lock(&UFS_SB(sb)->s_lock);
58 58
59 cgno = ufs_dtog(uspi, fragment); 59 cgno = ufs_dtog(uspi, fragment);
60 bit = ufs_dtogd(uspi, fragment); 60 bit = ufs_dtogd(uspi, fragment);
@@ -118,12 +118,12 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
118 ubh_sync_block(UCPI_UBH(ucpi)); 118 ubh_sync_block(UCPI_UBH(ucpi));
119 ufs_mark_sb_dirty(sb); 119 ufs_mark_sb_dirty(sb);
120 120
121 unlock_super (sb); 121 mutex_unlock(&UFS_SB(sb)->s_lock);
122 UFSD("EXIT\n"); 122 UFSD("EXIT\n");
123 return; 123 return;
124 124
125failed: 125failed:
126 unlock_super (sb); 126 mutex_unlock(&UFS_SB(sb)->s_lock);
127 UFSD("EXIT (FAILED)\n"); 127 UFSD("EXIT (FAILED)\n");
128 return; 128 return;
129} 129}
@@ -155,7 +155,7 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
155 goto failed; 155 goto failed;
156 } 156 }
157 157
158 lock_super(sb); 158 mutex_lock(&UFS_SB(sb)->s_lock);
159 159
160do_more: 160do_more:
161 overflow = 0; 161 overflow = 0;
@@ -215,12 +215,12 @@ do_more:
215 } 215 }
216 216
217 ufs_mark_sb_dirty(sb); 217 ufs_mark_sb_dirty(sb);
218 unlock_super (sb); 218 mutex_unlock(&UFS_SB(sb)->s_lock);
219 UFSD("EXIT\n"); 219 UFSD("EXIT\n");
220 return; 220 return;
221 221
222failed_unlock: 222failed_unlock:
223 unlock_super (sb); 223 mutex_unlock(&UFS_SB(sb)->s_lock);
224failed: 224failed:
225 UFSD("EXIT (FAILED)\n"); 225 UFSD("EXIT (FAILED)\n");
226 return; 226 return;
@@ -361,7 +361,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
361 usb1 = ubh_get_usb_first(uspi); 361 usb1 = ubh_get_usb_first(uspi);
362 *err = -ENOSPC; 362 *err = -ENOSPC;
363 363
364 lock_super (sb); 364 mutex_lock(&UFS_SB(sb)->s_lock);
365 tmp = ufs_data_ptr_to_cpu(sb, p); 365 tmp = ufs_data_ptr_to_cpu(sb, p);
366 366
367 if (count + ufs_fragnum(fragment) > uspi->s_fpb) { 367 if (count + ufs_fragnum(fragment) > uspi->s_fpb) {
@@ -382,19 +382,19 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
382 "fragment %llu, tmp %llu\n", 382 "fragment %llu, tmp %llu\n",
383 (unsigned long long)fragment, 383 (unsigned long long)fragment,
384 (unsigned long long)tmp); 384 (unsigned long long)tmp);
385 unlock_super(sb); 385 mutex_unlock(&UFS_SB(sb)->s_lock);
386 return INVBLOCK; 386 return INVBLOCK;
387 } 387 }
388 if (fragment < UFS_I(inode)->i_lastfrag) { 388 if (fragment < UFS_I(inode)->i_lastfrag) {
389 UFSD("EXIT (ALREADY ALLOCATED)\n"); 389 UFSD("EXIT (ALREADY ALLOCATED)\n");
390 unlock_super (sb); 390 mutex_unlock(&UFS_SB(sb)->s_lock);
391 return 0; 391 return 0;
392 } 392 }
393 } 393 }
394 else { 394 else {
395 if (tmp) { 395 if (tmp) {
396 UFSD("EXIT (ALREADY ALLOCATED)\n"); 396 UFSD("EXIT (ALREADY ALLOCATED)\n");
397 unlock_super(sb); 397 mutex_unlock(&UFS_SB(sb)->s_lock);
398 return 0; 398 return 0;
399 } 399 }
400 } 400 }
@@ -403,7 +403,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
403 * There is not enough space for user on the device 403 * There is not enough space for user on the device
404 */ 404 */
405 if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) { 405 if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) {
406 unlock_super (sb); 406 mutex_unlock(&UFS_SB(sb)->s_lock);
407 UFSD("EXIT (FAILED)\n"); 407 UFSD("EXIT (FAILED)\n");
408 return 0; 408 return 0;
409 } 409 }
@@ -428,7 +428,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
428 ufs_clear_frags(inode, result + oldcount, 428 ufs_clear_frags(inode, result + oldcount,
429 newcount - oldcount, locked_page != NULL); 429 newcount - oldcount, locked_page != NULL);
430 } 430 }
431 unlock_super(sb); 431 mutex_unlock(&UFS_SB(sb)->s_lock);
432 UFSD("EXIT, result %llu\n", (unsigned long long)result); 432 UFSD("EXIT, result %llu\n", (unsigned long long)result);
433 return result; 433 return result;
434 } 434 }
@@ -443,7 +443,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
443 fragment + count); 443 fragment + count);
444 ufs_clear_frags(inode, result + oldcount, newcount - oldcount, 444 ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
445 locked_page != NULL); 445 locked_page != NULL);
446 unlock_super(sb); 446 mutex_unlock(&UFS_SB(sb)->s_lock);
447 UFSD("EXIT, result %llu\n", (unsigned long long)result); 447 UFSD("EXIT, result %llu\n", (unsigned long long)result);
448 return result; 448 return result;
449 } 449 }
@@ -481,7 +481,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
481 *err = 0; 481 *err = 0;
482 UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, 482 UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag,
483 fragment + count); 483 fragment + count);
484 unlock_super(sb); 484 mutex_unlock(&UFS_SB(sb)->s_lock);
485 if (newcount < request) 485 if (newcount < request)
486 ufs_free_fragments (inode, result + newcount, request - newcount); 486 ufs_free_fragments (inode, result + newcount, request - newcount);
487 ufs_free_fragments (inode, tmp, oldcount); 487 ufs_free_fragments (inode, tmp, oldcount);
@@ -489,7 +489,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
489 return result; 489 return result;
490 } 490 }
491 491
492 unlock_super(sb); 492 mutex_unlock(&UFS_SB(sb)->s_lock);
493 UFSD("EXIT (FAILED)\n"); 493 UFSD("EXIT (FAILED)\n");
494 return 0; 494 return 0;
495} 495}
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index e84cbe21b986..d0426d74817b 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -71,11 +71,11 @@ void ufs_free_inode (struct inode * inode)
71 71
72 ino = inode->i_ino; 72 ino = inode->i_ino;
73 73
74 lock_super (sb); 74 mutex_lock(&UFS_SB(sb)->s_lock);
75 75
76 if (!((ino > 1) && (ino < (uspi->s_ncg * uspi->s_ipg )))) { 76 if (!((ino > 1) && (ino < (uspi->s_ncg * uspi->s_ipg )))) {
77 ufs_warning(sb, "ufs_free_inode", "reserved inode or nonexistent inode %u\n", ino); 77 ufs_warning(sb, "ufs_free_inode", "reserved inode or nonexistent inode %u\n", ino);
78 unlock_super (sb); 78 mutex_unlock(&UFS_SB(sb)->s_lock);
79 return; 79 return;
80 } 80 }
81 81
@@ -83,7 +83,7 @@ void ufs_free_inode (struct inode * inode)
83 bit = ufs_inotocgoff (ino); 83 bit = ufs_inotocgoff (ino);
84 ucpi = ufs_load_cylinder (sb, cg); 84 ucpi = ufs_load_cylinder (sb, cg);
85 if (!ucpi) { 85 if (!ucpi) {
86 unlock_super (sb); 86 mutex_unlock(&UFS_SB(sb)->s_lock);
87 return; 87 return;
88 } 88 }
89 ucg = ubh_get_ucg(UCPI_UBH(ucpi)); 89 ucg = ubh_get_ucg(UCPI_UBH(ucpi));
@@ -117,7 +117,7 @@ void ufs_free_inode (struct inode * inode)
117 ubh_sync_block(UCPI_UBH(ucpi)); 117 ubh_sync_block(UCPI_UBH(ucpi));
118 118
119 ufs_mark_sb_dirty(sb); 119 ufs_mark_sb_dirty(sb);
120 unlock_super (sb); 120 mutex_unlock(&UFS_SB(sb)->s_lock);
121 UFSD("EXIT\n"); 121 UFSD("EXIT\n");
122} 122}
123 123
@@ -197,7 +197,7 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode)
197 uspi = sbi->s_uspi; 197 uspi = sbi->s_uspi;
198 usb1 = ubh_get_usb_first(uspi); 198 usb1 = ubh_get_usb_first(uspi);
199 199
200 lock_super (sb); 200 mutex_lock(&sbi->s_lock);
201 201
202 /* 202 /*
203 * Try to place the inode in its parent directory 203 * Try to place the inode in its parent directory
@@ -333,20 +333,20 @@ cg_found:
333 brelse(bh); 333 brelse(bh);
334 } 334 }
335 335
336 unlock_super (sb); 336 mutex_unlock(&sbi->s_lock);
337 337
338 UFSD("allocating inode %lu\n", inode->i_ino); 338 UFSD("allocating inode %lu\n", inode->i_ino);
339 UFSD("EXIT\n"); 339 UFSD("EXIT\n");
340 return inode; 340 return inode;
341 341
342fail_remove_inode: 342fail_remove_inode:
343 unlock_super(sb); 343 mutex_unlock(&sbi->s_lock);
344 clear_nlink(inode); 344 clear_nlink(inode);
345 iput(inode); 345 iput(inode);
346 UFSD("EXIT (FAILED): err %d\n", err); 346 UFSD("EXIT (FAILED): err %d\n", err);
347 return ERR_PTR(err); 347 return ERR_PTR(err);
348failed: 348failed:
349 unlock_super (sb); 349 mutex_unlock(&sbi->s_lock);
350 make_bad_inode(inode); 350 make_bad_inode(inode);
351 iput (inode); 351 iput (inode);
352 UFSD("EXIT (FAILED): err %d\n", err); 352 UFSD("EXIT (FAILED): err %d\n", err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index dd7c89d8a1c1..eb6d0b7dc879 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -597,8 +597,8 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
597 /* 597 /*
598 * Linux now has 32-bit uid and gid, so we can support EFT. 598 * Linux now has 32-bit uid and gid, so we can support EFT.
599 */ 599 */
600 inode->i_uid = ufs_get_inode_uid(sb, ufs_inode); 600 i_uid_write(inode, ufs_get_inode_uid(sb, ufs_inode));
601 inode->i_gid = ufs_get_inode_gid(sb, ufs_inode); 601 i_gid_write(inode, ufs_get_inode_gid(sb, ufs_inode));
602 602
603 inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size); 603 inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size);
604 inode->i_atime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec); 604 inode->i_atime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec);
@@ -645,8 +645,8 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
645 /* 645 /*
646 * Linux now has 32-bit uid and gid, so we can support EFT. 646 * Linux now has 32-bit uid and gid, so we can support EFT.
647 */ 647 */
648 inode->i_uid = fs32_to_cpu(sb, ufs2_inode->ui_uid); 648 i_uid_write(inode, fs32_to_cpu(sb, ufs2_inode->ui_uid));
649 inode->i_gid = fs32_to_cpu(sb, ufs2_inode->ui_gid); 649 i_gid_write(inode, fs32_to_cpu(sb, ufs2_inode->ui_gid));
650 650
651 inode->i_size = fs64_to_cpu(sb, ufs2_inode->ui_size); 651 inode->i_size = fs64_to_cpu(sb, ufs2_inode->ui_size);
652 inode->i_atime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_atime); 652 inode->i_atime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_atime);
@@ -745,8 +745,8 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode)
745 ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode); 745 ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode);
746 ufs_inode->ui_nlink = cpu_to_fs16(sb, inode->i_nlink); 746 ufs_inode->ui_nlink = cpu_to_fs16(sb, inode->i_nlink);
747 747
748 ufs_set_inode_uid(sb, ufs_inode, inode->i_uid); 748 ufs_set_inode_uid(sb, ufs_inode, i_uid_read(inode));
749 ufs_set_inode_gid(sb, ufs_inode, inode->i_gid); 749 ufs_set_inode_gid(sb, ufs_inode, i_gid_read(inode));
750 750
751 ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); 751 ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size);
752 ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec); 752 ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec);
@@ -789,8 +789,8 @@ static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode)
789 ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode); 789 ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode);
790 ufs_inode->ui_nlink = cpu_to_fs16(sb, inode->i_nlink); 790 ufs_inode->ui_nlink = cpu_to_fs16(sb, inode->i_nlink);
791 791
792 ufs_inode->ui_uid = cpu_to_fs32(sb, inode->i_uid); 792 ufs_inode->ui_uid = cpu_to_fs32(sb, i_uid_read(inode));
793 ufs_inode->ui_gid = cpu_to_fs32(sb, inode->i_gid); 793 ufs_inode->ui_gid = cpu_to_fs32(sb, i_gid_read(inode));
794 794
795 ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); 795 ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size);
796 ufs_inode->ui_atime = cpu_to_fs64(sb, inode->i_atime.tv_sec); 796 ufs_inode->ui_atime = cpu_to_fs64(sb, inode->i_atime.tv_sec);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 444927e5706b..dc8e3a861d0f 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -699,7 +699,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
699 unsigned flags; 699 unsigned flags;
700 700
701 lock_ufs(sb); 701 lock_ufs(sb);
702 lock_super(sb); 702 mutex_lock(&UFS_SB(sb)->s_lock);
703 703
704 UFSD("ENTER\n"); 704 UFSD("ENTER\n");
705 705
@@ -717,7 +717,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
717 ufs_put_cstotal(sb); 717 ufs_put_cstotal(sb);
718 718
719 UFSD("EXIT\n"); 719 UFSD("EXIT\n");
720 unlock_super(sb); 720 mutex_unlock(&UFS_SB(sb)->s_lock);
721 unlock_ufs(sb); 721 unlock_ufs(sb);
722 722
723 return 0; 723 return 0;
@@ -805,6 +805,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
805 } 805 }
806#endif 806#endif
807 mutex_init(&sbi->mutex); 807 mutex_init(&sbi->mutex);
808 mutex_init(&sbi->s_lock);
808 spin_lock_init(&sbi->work_lock); 809 spin_lock_init(&sbi->work_lock);
809 INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); 810 INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs);
810 /* 811 /*
@@ -1280,7 +1281,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1280 unsigned flags; 1281 unsigned flags;
1281 1282
1282 lock_ufs(sb); 1283 lock_ufs(sb);
1283 lock_super(sb); 1284 mutex_lock(&UFS_SB(sb)->s_lock);
1284 uspi = UFS_SB(sb)->s_uspi; 1285 uspi = UFS_SB(sb)->s_uspi;
1285 flags = UFS_SB(sb)->s_flags; 1286 flags = UFS_SB(sb)->s_flags;
1286 usb1 = ubh_get_usb_first(uspi); 1287 usb1 = ubh_get_usb_first(uspi);
@@ -1294,7 +1295,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1294 new_mount_opt = 0; 1295 new_mount_opt = 0;
1295 ufs_set_opt (new_mount_opt, ONERROR_LOCK); 1296 ufs_set_opt (new_mount_opt, ONERROR_LOCK);
1296 if (!ufs_parse_options (data, &new_mount_opt)) { 1297 if (!ufs_parse_options (data, &new_mount_opt)) {
1297 unlock_super(sb); 1298 mutex_unlock(&UFS_SB(sb)->s_lock);
1298 unlock_ufs(sb); 1299 unlock_ufs(sb);
1299 return -EINVAL; 1300 return -EINVAL;
1300 } 1301 }
@@ -1302,14 +1303,14 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1302 new_mount_opt |= ufstype; 1303 new_mount_opt |= ufstype;
1303 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { 1304 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
1304 printk("ufstype can't be changed during remount\n"); 1305 printk("ufstype can't be changed during remount\n");
1305 unlock_super(sb); 1306 mutex_unlock(&UFS_SB(sb)->s_lock);
1306 unlock_ufs(sb); 1307 unlock_ufs(sb);
1307 return -EINVAL; 1308 return -EINVAL;
1308 } 1309 }
1309 1310
1310 if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 1311 if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
1311 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1312 UFS_SB(sb)->s_mount_opt = new_mount_opt;
1312 unlock_super(sb); 1313 mutex_unlock(&UFS_SB(sb)->s_lock);
1313 unlock_ufs(sb); 1314 unlock_ufs(sb);
1314 return 0; 1315 return 0;
1315 } 1316 }
@@ -1334,7 +1335,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1334#ifndef CONFIG_UFS_FS_WRITE 1335#ifndef CONFIG_UFS_FS_WRITE
1335 printk("ufs was compiled with read-only support, " 1336 printk("ufs was compiled with read-only support, "
1336 "can't be mounted as read-write\n"); 1337 "can't be mounted as read-write\n");
1337 unlock_super(sb); 1338 mutex_unlock(&UFS_SB(sb)->s_lock);
1338 unlock_ufs(sb); 1339 unlock_ufs(sb);
1339 return -EINVAL; 1340 return -EINVAL;
1340#else 1341#else
@@ -1344,13 +1345,13 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1344 ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && 1345 ufstype != UFS_MOUNT_UFSTYPE_SUNx86 &&
1345 ufstype != UFS_MOUNT_UFSTYPE_UFS2) { 1346 ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
1346 printk("this ufstype is read-only supported\n"); 1347 printk("this ufstype is read-only supported\n");
1347 unlock_super(sb); 1348 mutex_unlock(&UFS_SB(sb)->s_lock);
1348 unlock_ufs(sb); 1349 unlock_ufs(sb);
1349 return -EINVAL; 1350 return -EINVAL;
1350 } 1351 }
1351 if (!ufs_read_cylinder_structures(sb)) { 1352 if (!ufs_read_cylinder_structures(sb)) {
1352 printk("failed during remounting\n"); 1353 printk("failed during remounting\n");
1353 unlock_super(sb); 1354 mutex_unlock(&UFS_SB(sb)->s_lock);
1354 unlock_ufs(sb); 1355 unlock_ufs(sb);
1355 return -EPERM; 1356 return -EPERM;
1356 } 1357 }
@@ -1358,7 +1359,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1358#endif 1359#endif
1359 } 1360 }
1360 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1361 UFS_SB(sb)->s_mount_opt = new_mount_opt;
1361 unlock_super(sb); 1362 mutex_unlock(&UFS_SB(sb)->s_lock);
1362 unlock_ufs(sb); 1363 unlock_ufs(sb);
1363 return 0; 1364 return 0;
1364} 1365}
@@ -1466,6 +1467,11 @@ static int init_inodecache(void)
1466 1467
1467static void destroy_inodecache(void) 1468static void destroy_inodecache(void)
1468{ 1469{
1470 /*
1471 * Make sure all delayed rcu free inodes are flushed before we
1472 * destroy cache.
1473 */
1474 rcu_barrier();
1469 kmem_cache_destroy(ufs_inode_cachep); 1475 kmem_cache_destroy(ufs_inode_cachep);
1470} 1476}
1471 1477
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 343e6fc571e5..ff2c15ab81aa 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -24,6 +24,7 @@ struct ufs_sb_info {
24 int work_queued; /* non-zero if the delayed work is queued */ 24 int work_queued; /* non-zero if the delayed work is queued */
25 struct delayed_work sync_work; /* FS sync delayed work */ 25 struct delayed_work sync_work; /* FS sync delayed work */
26 spinlock_t work_lock; /* protects sync_work and work_queued */ 26 spinlock_t work_lock; /* protects sync_work and work_queued */
27 struct mutex s_lock;
27}; 28};
28 29
29struct ufs_inode_info { 30struct ufs_inode_info {
diff --git a/fs/utimes.c b/fs/utimes.c
index fa4dbe451e27..bb0696a41735 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -140,19 +140,18 @@ long do_utimes(int dfd, const char __user *filename, struct timespec *times,
140 goto out; 140 goto out;
141 141
142 if (filename == NULL && dfd != AT_FDCWD) { 142 if (filename == NULL && dfd != AT_FDCWD) {
143 int fput_needed; 143 struct fd f;
144 struct file *file;
145 144
146 if (flags & AT_SYMLINK_NOFOLLOW) 145 if (flags & AT_SYMLINK_NOFOLLOW)
147 goto out; 146 goto out;
148 147
149 file = fget_light(dfd, &fput_needed); 148 f = fdget(dfd);
150 error = -EBADF; 149 error = -EBADF;
151 if (!file) 150 if (!f.file)
152 goto out; 151 goto out;
153 152
154 error = utimes_common(&file->f_path, times); 153 error = utimes_common(&f.file->f_path, times);
155 fput_light(file, fput_needed); 154 fdput(f);
156 } else { 155 } else {
157 struct path path; 156 struct path path;
158 int lookup_flags = 0; 157 int lookup_flags = 0;
diff --git a/fs/xattr.c b/fs/xattr.c
index 4d45b7189e7e..e21c119f4f99 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -20,6 +20,7 @@
20#include <linux/fsnotify.h> 20#include <linux/fsnotify.h>
21#include <linux/audit.h> 21#include <linux/audit.h>
22#include <linux/vmalloc.h> 22#include <linux/vmalloc.h>
23#include <linux/posix_acl_xattr.h>
23 24
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
25 26
@@ -295,11 +296,13 @@ vfs_removexattr(struct dentry *dentry, const char *name)
295 if (error) 296 if (error)
296 return error; 297 return error;
297 298
299 mutex_lock(&inode->i_mutex);
298 error = security_inode_removexattr(dentry, name); 300 error = security_inode_removexattr(dentry, name);
299 if (error) 301 if (error) {
302 mutex_unlock(&inode->i_mutex);
300 return error; 303 return error;
304 }
301 305
302 mutex_lock(&inode->i_mutex);
303 error = inode->i_op->removexattr(dentry, name); 306 error = inode->i_op->removexattr(dentry, name);
304 mutex_unlock(&inode->i_mutex); 307 mutex_unlock(&inode->i_mutex);
305 308
@@ -347,6 +350,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
347 error = -EFAULT; 350 error = -EFAULT;
348 goto out; 351 goto out;
349 } 352 }
353 if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
354 (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
355 posix_acl_fix_xattr_from_user(kvalue, size);
350 } 356 }
351 357
352 error = vfs_setxattr(d, kname, kvalue, size, flags); 358 error = vfs_setxattr(d, kname, kvalue, size, flags);
@@ -399,22 +405,20 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
399SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, 405SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
400 const void __user *,value, size_t, size, int, flags) 406 const void __user *,value, size_t, size, int, flags)
401{ 407{
402 int fput_needed; 408 struct fd f = fdget(fd);
403 struct file *f;
404 struct dentry *dentry; 409 struct dentry *dentry;
405 int error = -EBADF; 410 int error = -EBADF;
406 411
407 f = fget_light(fd, &fput_needed); 412 if (!f.file)
408 if (!f)
409 return error; 413 return error;
410 dentry = f->f_path.dentry; 414 dentry = f.file->f_path.dentry;
411 audit_inode(NULL, dentry); 415 audit_inode(NULL, dentry, 0);
412 error = mnt_want_write_file(f); 416 error = mnt_want_write_file(f.file);
413 if (!error) { 417 if (!error) {
414 error = setxattr(dentry, name, value, size, flags); 418 error = setxattr(dentry, name, value, size, flags);
415 mnt_drop_write_file(f); 419 mnt_drop_write_file(f.file);
416 } 420 }
417 fput_light(f, fput_needed); 421 fdput(f);
418 return error; 422 return error;
419} 423}
420 424
@@ -450,6 +454,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
450 454
451 error = vfs_getxattr(d, kname, kvalue, size); 455 error = vfs_getxattr(d, kname, kvalue, size);
452 if (error > 0) { 456 if (error > 0) {
457 if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
458 (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
459 posix_acl_fix_xattr_to_user(kvalue, size);
453 if (size && copy_to_user(value, kvalue, error)) 460 if (size && copy_to_user(value, kvalue, error))
454 error = -EFAULT; 461 error = -EFAULT;
455 } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) { 462 } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) {
@@ -495,16 +502,14 @@ SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
495SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, 502SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
496 void __user *, value, size_t, size) 503 void __user *, value, size_t, size)
497{ 504{
498 int fput_needed; 505 struct fd f = fdget(fd);
499 struct file *f;
500 ssize_t error = -EBADF; 506 ssize_t error = -EBADF;
501 507
502 f = fget_light(fd, &fput_needed); 508 if (!f.file)
503 if (!f)
504 return error; 509 return error;
505 audit_inode(NULL, f->f_path.dentry); 510 audit_inode(NULL, f.file->f_path.dentry, 0);
506 error = getxattr(f->f_path.dentry, name, value, size); 511 error = getxattr(f.file->f_path.dentry, name, value, size);
507 fput_light(f, fput_needed); 512 fdput(f);
508 return error; 513 return error;
509} 514}
510 515
@@ -576,16 +581,14 @@ SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
576 581
577SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) 582SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
578{ 583{
579 int fput_needed; 584 struct fd f = fdget(fd);
580 struct file *f;
581 ssize_t error = -EBADF; 585 ssize_t error = -EBADF;
582 586
583 f = fget_light(fd, &fput_needed); 587 if (!f.file)
584 if (!f)
585 return error; 588 return error;
586 audit_inode(NULL, f->f_path.dentry); 589 audit_inode(NULL, f.file->f_path.dentry, 0);
587 error = listxattr(f->f_path.dentry, list, size); 590 error = listxattr(f.file->f_path.dentry, list, size);
588 fput_light(f, fput_needed); 591 fdput(f);
589 return error; 592 return error;
590} 593}
591 594
@@ -645,22 +648,20 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
645 648
646SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) 649SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
647{ 650{
648 int fput_needed; 651 struct fd f = fdget(fd);
649 struct file *f;
650 struct dentry *dentry; 652 struct dentry *dentry;
651 int error = -EBADF; 653 int error = -EBADF;
652 654
653 f = fget_light(fd, &fput_needed); 655 if (!f.file)
654 if (!f)
655 return error; 656 return error;
656 dentry = f->f_path.dentry; 657 dentry = f.file->f_path.dentry;
657 audit_inode(NULL, dentry); 658 audit_inode(NULL, dentry, 0);
658 error = mnt_want_write_file(f); 659 error = mnt_want_write_file(f.file);
659 if (!error) { 660 if (!error) {
660 error = removexattr(dentry, name); 661 error = removexattr(dentry, name);
661 mnt_drop_write_file(f); 662 mnt_drop_write_file(f.file);
662 } 663 }
663 fput_light(f, fput_needed); 664 fdput(f);
664 return error; 665 return error;
665} 666}
666 667
@@ -791,3 +792,183 @@ EXPORT_SYMBOL(generic_getxattr);
791EXPORT_SYMBOL(generic_listxattr); 792EXPORT_SYMBOL(generic_listxattr);
792EXPORT_SYMBOL(generic_setxattr); 793EXPORT_SYMBOL(generic_setxattr);
793EXPORT_SYMBOL(generic_removexattr); 794EXPORT_SYMBOL(generic_removexattr);
795
796/*
797 * Allocate new xattr and copy in the value; but leave the name to callers.
798 */
799struct simple_xattr *simple_xattr_alloc(const void *value, size_t size)
800{
801 struct simple_xattr *new_xattr;
802 size_t len;
803
804 /* wrap around? */
805 len = sizeof(*new_xattr) + size;
806 if (len <= sizeof(*new_xattr))
807 return NULL;
808
809 new_xattr = kmalloc(len, GFP_KERNEL);
810 if (!new_xattr)
811 return NULL;
812
813 new_xattr->size = size;
814 memcpy(new_xattr->value, value, size);
815 return new_xattr;
816}
817
818/*
819 * xattr GET operation for in-memory/pseudo filesystems
820 */
821int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
822 void *buffer, size_t size)
823{
824 struct simple_xattr *xattr;
825 int ret = -ENODATA;
826
827 spin_lock(&xattrs->lock);
828 list_for_each_entry(xattr, &xattrs->head, list) {
829 if (strcmp(name, xattr->name))
830 continue;
831
832 ret = xattr->size;
833 if (buffer) {
834 if (size < xattr->size)
835 ret = -ERANGE;
836 else
837 memcpy(buffer, xattr->value, xattr->size);
838 }
839 break;
840 }
841 spin_unlock(&xattrs->lock);
842 return ret;
843}
844
845static int __simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
846 const void *value, size_t size, int flags)
847{
848 struct simple_xattr *xattr;
849 struct simple_xattr *new_xattr = NULL;
850 int err = 0;
851
852 /* value == NULL means remove */
853 if (value) {
854 new_xattr = simple_xattr_alloc(value, size);
855 if (!new_xattr)
856 return -ENOMEM;
857
858 new_xattr->name = kstrdup(name, GFP_KERNEL);
859 if (!new_xattr->name) {
860 kfree(new_xattr);
861 return -ENOMEM;
862 }
863 }
864
865 spin_lock(&xattrs->lock);
866 list_for_each_entry(xattr, &xattrs->head, list) {
867 if (!strcmp(name, xattr->name)) {
868 if (flags & XATTR_CREATE) {
869 xattr = new_xattr;
870 err = -EEXIST;
871 } else if (new_xattr) {
872 list_replace(&xattr->list, &new_xattr->list);
873 } else {
874 list_del(&xattr->list);
875 }
876 goto out;
877 }
878 }
879 if (flags & XATTR_REPLACE) {
880 xattr = new_xattr;
881 err = -ENODATA;
882 } else {
883 list_add(&new_xattr->list, &xattrs->head);
884 xattr = NULL;
885 }
886out:
887 spin_unlock(&xattrs->lock);
888 if (xattr) {
889 kfree(xattr->name);
890 kfree(xattr);
891 }
892 return err;
893
894}
895
896/**
897 * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems
898 * @xattrs: target simple_xattr list
899 * @name: name of the new extended attribute
900 * @value: value of the new xattr. If %NULL, will remove the attribute
901 * @size: size of the new xattr
902 * @flags: %XATTR_{CREATE|REPLACE}
903 *
904 * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails
905 * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist;
906 * otherwise, fails with -ENODATA.
907 *
908 * Returns 0 on success, -errno on failure.
909 */
910int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
911 const void *value, size_t size, int flags)
912{
913 if (size == 0)
914 value = ""; /* empty EA, do not remove */
915 return __simple_xattr_set(xattrs, name, value, size, flags);
916}
917
918/*
919 * xattr REMOVE operation for in-memory/pseudo filesystems
920 */
921int simple_xattr_remove(struct simple_xattrs *xattrs, const char *name)
922{
923 return __simple_xattr_set(xattrs, name, NULL, 0, XATTR_REPLACE);
924}
925
926static bool xattr_is_trusted(const char *name)
927{
928 return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
929}
930
931/*
932 * xattr LIST operation for in-memory/pseudo filesystems
933 */
934ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer,
935 size_t size)
936{
937 bool trusted = capable(CAP_SYS_ADMIN);
938 struct simple_xattr *xattr;
939 size_t used = 0;
940
941 spin_lock(&xattrs->lock);
942 list_for_each_entry(xattr, &xattrs->head, list) {
943 size_t len;
944
945 /* skip "trusted." attributes for unprivileged callers */
946 if (!trusted && xattr_is_trusted(xattr->name))
947 continue;
948
949 len = strlen(xattr->name) + 1;
950 used += len;
951 if (buffer) {
952 if (size < used) {
953 used = -ERANGE;
954 break;
955 }
956 memcpy(buffer, xattr->name, len);
957 buffer += len;
958 }
959 }
960 spin_unlock(&xattrs->lock);
961
962 return used;
963}
964
965/*
966 * Adds an extended attribute to the list
967 */
968void simple_xattr_list_add(struct simple_xattrs *xattrs,
969 struct simple_xattr *new_xattr)
970{
971 spin_lock(&xattrs->lock);
972 list_add(&new_xattr->list, &xattrs->head);
973 spin_unlock(&xattrs->lock);
974}
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index 69d06b07b169..9fbea87fdb6e 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -9,13 +9,72 @@
9#include <linux/fs.h> 9#include <linux/fs.h>
10#include <linux/posix_acl_xattr.h> 10#include <linux/posix_acl_xattr.h>
11#include <linux/gfp.h> 11#include <linux/gfp.h>
12#include <linux/user_namespace.h>
12 13
14/*
15 * Fix up the uids and gids in posix acl extended attributes in place.
16 */
17static void posix_acl_fix_xattr_userns(
18 struct user_namespace *to, struct user_namespace *from,
19 void *value, size_t size)
20{
21 posix_acl_xattr_header *header = (posix_acl_xattr_header *)value;
22 posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end;
23 int count;
24 kuid_t uid;
25 kgid_t gid;
26
27 if (!value)
28 return;
29 if (size < sizeof(posix_acl_xattr_header))
30 return;
31 if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
32 return;
33
34 count = posix_acl_xattr_count(size);
35 if (count < 0)
36 return;
37 if (count == 0)
38 return;
39
40 for (end = entry + count; entry != end; entry++) {
41 switch(le16_to_cpu(entry->e_tag)) {
42 case ACL_USER:
43 uid = make_kuid(from, le32_to_cpu(entry->e_id));
44 entry->e_id = cpu_to_le32(from_kuid(to, uid));
45 break;
46 case ACL_GROUP:
47 gid = make_kgid(from, le32_to_cpu(entry->e_id));
48 entry->e_id = cpu_to_le32(from_kgid(to, gid));
49 break;
50 default:
51 break;
52 }
53 }
54}
55
56void posix_acl_fix_xattr_from_user(void *value, size_t size)
57{
58 struct user_namespace *user_ns = current_user_ns();
59 if (user_ns == &init_user_ns)
60 return;
61 posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size);
62}
63
64void posix_acl_fix_xattr_to_user(void *value, size_t size)
65{
66 struct user_namespace *user_ns = current_user_ns();
67 if (user_ns == &init_user_ns)
68 return;
69 posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size);
70}
13 71
14/* 72/*
15 * Convert from extended attribute to in-memory representation. 73 * Convert from extended attribute to in-memory representation.
16 */ 74 */
17struct posix_acl * 75struct posix_acl *
18posix_acl_from_xattr(const void *value, size_t size) 76posix_acl_from_xattr(struct user_namespace *user_ns,
77 const void *value, size_t size)
19{ 78{
20 posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; 79 posix_acl_xattr_header *header = (posix_acl_xattr_header *)value;
21 posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; 80 posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end;
@@ -50,12 +109,21 @@ posix_acl_from_xattr(const void *value, size_t size)
50 case ACL_GROUP_OBJ: 109 case ACL_GROUP_OBJ:
51 case ACL_MASK: 110 case ACL_MASK:
52 case ACL_OTHER: 111 case ACL_OTHER:
53 acl_e->e_id = ACL_UNDEFINED_ID;
54 break; 112 break;
55 113
56 case ACL_USER: 114 case ACL_USER:
115 acl_e->e_uid =
116 make_kuid(user_ns,
117 le32_to_cpu(entry->e_id));
118 if (!uid_valid(acl_e->e_uid))
119 goto fail;
120 break;
57 case ACL_GROUP: 121 case ACL_GROUP:
58 acl_e->e_id = le32_to_cpu(entry->e_id); 122 acl_e->e_gid =
123 make_kgid(user_ns,
124 le32_to_cpu(entry->e_id));
125 if (!gid_valid(acl_e->e_gid))
126 goto fail;
59 break; 127 break;
60 128
61 default: 129 default:
@@ -74,7 +142,8 @@ EXPORT_SYMBOL (posix_acl_from_xattr);
74 * Convert from in-memory to extended attribute representation. 142 * Convert from in-memory to extended attribute representation.
75 */ 143 */
76int 144int
77posix_acl_to_xattr(const struct posix_acl *acl, void *buffer, size_t size) 145posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
146 void *buffer, size_t size)
78{ 147{
79 posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; 148 posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer;
80 posix_acl_xattr_entry *ext_entry = ext_acl->a_entries; 149 posix_acl_xattr_entry *ext_entry = ext_acl->a_entries;
@@ -89,9 +158,22 @@ posix_acl_to_xattr(const struct posix_acl *acl, void *buffer, size_t size)
89 ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); 158 ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
90 159
91 for (n=0; n < acl->a_count; n++, ext_entry++) { 160 for (n=0; n < acl->a_count; n++, ext_entry++) {
92 ext_entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); 161 const struct posix_acl_entry *acl_e = &acl->a_entries[n];
93 ext_entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); 162 ext_entry->e_tag = cpu_to_le16(acl_e->e_tag);
94 ext_entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); 163 ext_entry->e_perm = cpu_to_le16(acl_e->e_perm);
164 switch(acl_e->e_tag) {
165 case ACL_USER:
166 ext_entry->e_id =
167 cpu_to_le32(from_kuid(user_ns, acl_e->e_uid));
168 break;
169 case ACL_GROUP:
170 ext_entry->e_id =
171 cpu_to_le32(from_kgid(user_ns, acl_e->e_gid));
172 break;
173 default:
174 ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
175 break;
176 }
95 } 177 }
96 return real_size; 178 return real_size;
97} 179}
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index ac702a6eab9b..1d32f1d52763 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -337,7 +337,7 @@ xfs_xattr_acl_get(struct dentry *dentry, const char *name,
337 if (acl == NULL) 337 if (acl == NULL)
338 return -ENODATA; 338 return -ENODATA;
339 339
340 error = posix_acl_to_xattr(acl, value, size); 340 error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
341 posix_acl_release(acl); 341 posix_acl_release(acl);
342 342
343 return error; 343 return error;
@@ -361,7 +361,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
361 if (!value) 361 if (!value)
362 goto set_acl; 362 goto set_acl;
363 363
364 acl = posix_acl_from_xattr(value, size); 364 acl = posix_acl_from_xattr(&init_user_ns, value, size);
365 if (!acl) { 365 if (!acl) {
366 /* 366 /*
367 * acl_set_file(3) may request that we set default ACLs with 367 * acl_set_file(3) may request that we set default ACLs with
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index d7a9dd735e1e..933b7930b863 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -96,6 +96,7 @@ xfs_buf_lru_add(
96 atomic_inc(&bp->b_hold); 96 atomic_inc(&bp->b_hold);
97 list_add_tail(&bp->b_lru, &btp->bt_lru); 97 list_add_tail(&bp->b_lru, &btp->bt_lru);
98 btp->bt_lru_nr++; 98 btp->bt_lru_nr++;
99 bp->b_lru_flags &= ~_XBF_LRU_DISPOSE;
99 } 100 }
100 spin_unlock(&btp->bt_lru_lock); 101 spin_unlock(&btp->bt_lru_lock);
101} 102}
@@ -154,7 +155,8 @@ xfs_buf_stale(
154 struct xfs_buftarg *btp = bp->b_target; 155 struct xfs_buftarg *btp = bp->b_target;
155 156
156 spin_lock(&btp->bt_lru_lock); 157 spin_lock(&btp->bt_lru_lock);
157 if (!list_empty(&bp->b_lru)) { 158 if (!list_empty(&bp->b_lru) &&
159 !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) {
158 list_del_init(&bp->b_lru); 160 list_del_init(&bp->b_lru);
159 btp->bt_lru_nr--; 161 btp->bt_lru_nr--;
160 atomic_dec(&bp->b_hold); 162 atomic_dec(&bp->b_hold);
@@ -1501,6 +1503,7 @@ xfs_buftarg_shrink(
1501 */ 1503 */
1502 list_move(&bp->b_lru, &dispose); 1504 list_move(&bp->b_lru, &dispose);
1503 btp->bt_lru_nr--; 1505 btp->bt_lru_nr--;
1506 bp->b_lru_flags |= _XBF_LRU_DISPOSE;
1504 } 1507 }
1505 spin_unlock(&btp->bt_lru_lock); 1508 spin_unlock(&btp->bt_lru_lock);
1506 1509
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index d03b73b9604e..7c0b6a0a1557 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -38,27 +38,28 @@ typedef enum {
38 XBRW_ZERO = 3, /* Zero target memory */ 38 XBRW_ZERO = 3, /* Zero target memory */
39} xfs_buf_rw_t; 39} xfs_buf_rw_t;
40 40
41#define XBF_READ (1 << 0) /* buffer intended for reading from device */ 41#define XBF_READ (1 << 0) /* buffer intended for reading from device */
42#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ 42#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
43#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ 43#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
44#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ 44#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
45#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ 45#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
46#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ 46#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
47 47
48/* I/O hints for the BIO layer */ 48/* I/O hints for the BIO layer */
49#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ 49#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */
50#define XBF_FUA (1 << 11)/* force cache write through mode */ 50#define XBF_FUA (1 << 11)/* force cache write through mode */
51#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */ 51#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */
52 52
53/* flags used only as arguments to access routines */ 53/* flags used only as arguments to access routines */
54#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ 54#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */
55#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */ 55#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */
56 56
57/* flags used only internally */ 57/* flags used only internally */
58#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ 58#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */
59#define _XBF_KMEM (1 << 21)/* backed by heap memory */ 59#define _XBF_KMEM (1 << 21)/* backed by heap memory */
60#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ 60#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
61#define _XBF_COMPOUND (1 << 23)/* compound buffer */ 61#define _XBF_COMPOUND (1 << 23)/* compound buffer */
62#define _XBF_LRU_DISPOSE (1 << 24)/* buffer being discarded */
62 63
63typedef unsigned int xfs_buf_flags_t; 64typedef unsigned int xfs_buf_flags_t;
64 65
@@ -72,12 +73,13 @@ typedef unsigned int xfs_buf_flags_t;
72 { XBF_SYNCIO, "SYNCIO" }, \ 73 { XBF_SYNCIO, "SYNCIO" }, \
73 { XBF_FUA, "FUA" }, \ 74 { XBF_FUA, "FUA" }, \
74 { XBF_FLUSH, "FLUSH" }, \ 75 { XBF_FLUSH, "FLUSH" }, \
75 { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ 76 { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\
76 { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\ 77 { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\
77 { _XBF_PAGES, "PAGES" }, \ 78 { _XBF_PAGES, "PAGES" }, \
78 { _XBF_KMEM, "KMEM" }, \ 79 { _XBF_KMEM, "KMEM" }, \
79 { _XBF_DELWRI_Q, "DELWRI_Q" }, \ 80 { _XBF_DELWRI_Q, "DELWRI_Q" }, \
80 { _XBF_COMPOUND, "COMPOUND" } 81 { _XBF_COMPOUND, "COMPOUND" }, \
82 { _XBF_LRU_DISPOSE, "LRU_DISPOSE" }
81 83
82typedef struct xfs_buftarg { 84typedef struct xfs_buftarg {
83 dev_t bt_dev; 85 dev_t bt_dev;
@@ -124,7 +126,12 @@ typedef struct xfs_buf {
124 xfs_buf_flags_t b_flags; /* status flags */ 126 xfs_buf_flags_t b_flags; /* status flags */
125 struct semaphore b_sema; /* semaphore for lockables */ 127 struct semaphore b_sema; /* semaphore for lockables */
126 128
129 /*
130 * concurrent access to b_lru and b_lru_flags are protected by
131 * bt_lru_lock and not by b_sema
132 */
127 struct list_head b_lru; /* lru list */ 133 struct list_head b_lru; /* lru list */
134 xfs_buf_flags_t b_lru_flags; /* internal lru status flags */
128 wait_queue_head_t b_waiters; /* unpin waiters */ 135 wait_queue_head_t b_waiters; /* unpin waiters */
129 struct list_head b_list; 136 struct list_head b_list;
130 struct xfs_perag *b_pag; /* contains rbtree root */ 137 struct xfs_perag *b_pag; /* contains rbtree root */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index e00de08dc8ac..b9b8646e62db 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -48,44 +48,44 @@ xfs_swapext(
48 xfs_swapext_t *sxp) 48 xfs_swapext_t *sxp)
49{ 49{
50 xfs_inode_t *ip, *tip; 50 xfs_inode_t *ip, *tip;
51 struct file *file, *tmp_file; 51 struct fd f, tmp;
52 int error = 0; 52 int error = 0;
53 53
54 /* Pull information for the target fd */ 54 /* Pull information for the target fd */
55 file = fget((int)sxp->sx_fdtarget); 55 f = fdget((int)sxp->sx_fdtarget);
56 if (!file) { 56 if (!f.file) {
57 error = XFS_ERROR(EINVAL); 57 error = XFS_ERROR(EINVAL);
58 goto out; 58 goto out;
59 } 59 }
60 60
61 if (!(file->f_mode & FMODE_WRITE) || 61 if (!(f.file->f_mode & FMODE_WRITE) ||
62 !(file->f_mode & FMODE_READ) || 62 !(f.file->f_mode & FMODE_READ) ||
63 (file->f_flags & O_APPEND)) { 63 (f.file->f_flags & O_APPEND)) {
64 error = XFS_ERROR(EBADF); 64 error = XFS_ERROR(EBADF);
65 goto out_put_file; 65 goto out_put_file;
66 } 66 }
67 67
68 tmp_file = fget((int)sxp->sx_fdtmp); 68 tmp = fdget((int)sxp->sx_fdtmp);
69 if (!tmp_file) { 69 if (!tmp.file) {
70 error = XFS_ERROR(EINVAL); 70 error = XFS_ERROR(EINVAL);
71 goto out_put_file; 71 goto out_put_file;
72 } 72 }
73 73
74 if (!(tmp_file->f_mode & FMODE_WRITE) || 74 if (!(tmp.file->f_mode & FMODE_WRITE) ||
75 !(tmp_file->f_mode & FMODE_READ) || 75 !(tmp.file->f_mode & FMODE_READ) ||
76 (tmp_file->f_flags & O_APPEND)) { 76 (tmp.file->f_flags & O_APPEND)) {
77 error = XFS_ERROR(EBADF); 77 error = XFS_ERROR(EBADF);
78 goto out_put_tmp_file; 78 goto out_put_tmp_file;
79 } 79 }
80 80
81 if (IS_SWAPFILE(file->f_path.dentry->d_inode) || 81 if (IS_SWAPFILE(f.file->f_path.dentry->d_inode) ||
82 IS_SWAPFILE(tmp_file->f_path.dentry->d_inode)) { 82 IS_SWAPFILE(tmp.file->f_path.dentry->d_inode)) {
83 error = XFS_ERROR(EINVAL); 83 error = XFS_ERROR(EINVAL);
84 goto out_put_tmp_file; 84 goto out_put_tmp_file;
85 } 85 }
86 86
87 ip = XFS_I(file->f_path.dentry->d_inode); 87 ip = XFS_I(f.file->f_path.dentry->d_inode);
88 tip = XFS_I(tmp_file->f_path.dentry->d_inode); 88 tip = XFS_I(tmp.file->f_path.dentry->d_inode);
89 89
90 if (ip->i_mount != tip->i_mount) { 90 if (ip->i_mount != tip->i_mount) {
91 error = XFS_ERROR(EINVAL); 91 error = XFS_ERROR(EINVAL);
@@ -105,9 +105,9 @@ xfs_swapext(
105 error = xfs_swap_extents(ip, tip, sxp); 105 error = xfs_swap_extents(ip, tip, sxp);
106 106
107 out_put_tmp_file: 107 out_put_tmp_file:
108 fput(tmp_file); 108 fdput(tmp);
109 out_put_file: 109 out_put_file:
110 fput(file); 110 fdput(f);
111 out: 111 out:
112 return error; 112 return error;
113} 113}
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 42679223a0fd..8c6d1d70278c 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -189,6 +189,9 @@ xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid,
189 struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid; 189 struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid;
190 struct inode *inode = NULL; 190 struct inode *inode = NULL;
191 191
192 if (fh_len < xfs_fileid_length(fileid_type))
193 return NULL;
194
192 switch (fileid_type) { 195 switch (fileid_type) {
193 case FILEID_INO32_GEN_PARENT: 196 case FILEID_INO32_GEN_PARENT:
194 inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino, 197 inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 56afcdb2377d..aa473fa640a2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -36,6 +36,7 @@
36 36
37#include <linux/dcache.h> 37#include <linux/dcache.h>
38#include <linux/falloc.h> 38#include <linux/falloc.h>
39#include <linux/pagevec.h>
39 40
40static const struct vm_operations_struct xfs_file_vm_ops; 41static const struct vm_operations_struct xfs_file_vm_ops;
41 42
@@ -939,7 +940,6 @@ xfs_file_mmap(
939 struct vm_area_struct *vma) 940 struct vm_area_struct *vma)
940{ 941{
941 vma->vm_ops = &xfs_file_vm_ops; 942 vma->vm_ops = &xfs_file_vm_ops;
942 vma->vm_flags |= VM_CAN_NONLINEAR;
943 943
944 file_accessed(filp); 944 file_accessed(filp);
945 return 0; 945 return 0;
@@ -959,17 +959,232 @@ xfs_vm_page_mkwrite(
959 return block_page_mkwrite(vma, vmf, xfs_get_blocks); 959 return block_page_mkwrite(vma, vmf, xfs_get_blocks);
960} 960}
961 961
962/*
963 * This type is designed to indicate the type of offset we would like
964 * to search from page cache for either xfs_seek_data() or xfs_seek_hole().
965 */
966enum {
967 HOLE_OFF = 0,
968 DATA_OFF,
969};
970
971/*
972 * Lookup the desired type of offset from the given page.
973 *
974 * On success, return true and the offset argument will point to the
975 * start of the region that was found. Otherwise this function will
976 * return false and keep the offset argument unchanged.
977 */
978STATIC bool
979xfs_lookup_buffer_offset(
980 struct page *page,
981 loff_t *offset,
982 unsigned int type)
983{
984 loff_t lastoff = page_offset(page);
985 bool found = false;
986 struct buffer_head *bh, *head;
987
988 bh = head = page_buffers(page);
989 do {
990 /*
991 * Unwritten extents that have data in the page
992 * cache covering them can be identified by the
993 * BH_Unwritten state flag. Pages with multiple
994 * buffers might have a mix of holes, data and
995 * unwritten extents - any buffer with valid
996 * data in it should have BH_Uptodate flag set
997 * on it.
998 */
999 if (buffer_unwritten(bh) ||
1000 buffer_uptodate(bh)) {
1001 if (type == DATA_OFF)
1002 found = true;
1003 } else {
1004 if (type == HOLE_OFF)
1005 found = true;
1006 }
1007
1008 if (found) {
1009 *offset = lastoff;
1010 break;
1011 }
1012 lastoff += bh->b_size;
1013 } while ((bh = bh->b_this_page) != head);
1014
1015 return found;
1016}
1017
1018/*
1019 * This routine is called to find out and return a data or hole offset
1020 * from the page cache for unwritten extents according to the desired
1021 * type for xfs_seek_data() or xfs_seek_hole().
1022 *
1023 * The argument offset is used to tell where we start to search from the
1024 * page cache. Map is used to figure out the end points of the range to
1025 * lookup pages.
1026 *
1027 * Return true if the desired type of offset was found, and the argument
1028 * offset is filled with that address. Otherwise, return false and keep
1029 * offset unchanged.
1030 */
1031STATIC bool
1032xfs_find_get_desired_pgoff(
1033 struct inode *inode,
1034 struct xfs_bmbt_irec *map,
1035 unsigned int type,
1036 loff_t *offset)
1037{
1038 struct xfs_inode *ip = XFS_I(inode);
1039 struct xfs_mount *mp = ip->i_mount;
1040 struct pagevec pvec;
1041 pgoff_t index;
1042 pgoff_t end;
1043 loff_t endoff;
1044 loff_t startoff = *offset;
1045 loff_t lastoff = startoff;
1046 bool found = false;
1047
1048 pagevec_init(&pvec, 0);
1049
1050 index = startoff >> PAGE_CACHE_SHIFT;
1051 endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount);
1052 end = endoff >> PAGE_CACHE_SHIFT;
1053 do {
1054 int want;
1055 unsigned nr_pages;
1056 unsigned int i;
1057
1058 want = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
1059 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
1060 want);
1061 /*
1062 * No page mapped into given range. If we are searching holes
1063 * and if this is the first time we got into the loop, it means
1064 * that the given offset is landed in a hole, return it.
1065 *
1066 * If we have already stepped through some block buffers to find
1067 * holes but they all contains data. In this case, the last
1068 * offset is already updated and pointed to the end of the last
1069 * mapped page, if it does not reach the endpoint to search,
1070 * that means there should be a hole between them.
1071 */
1072 if (nr_pages == 0) {
1073 /* Data search found nothing */
1074 if (type == DATA_OFF)
1075 break;
1076
1077 ASSERT(type == HOLE_OFF);
1078 if (lastoff == startoff || lastoff < endoff) {
1079 found = true;
1080 *offset = lastoff;
1081 }
1082 break;
1083 }
1084
1085 /*
1086 * At lease we found one page. If this is the first time we
1087 * step into the loop, and if the first page index offset is
1088 * greater than the given search offset, a hole was found.
1089 */
1090 if (type == HOLE_OFF && lastoff == startoff &&
1091 lastoff < page_offset(pvec.pages[0])) {
1092 found = true;
1093 break;
1094 }
1095
1096 for (i = 0; i < nr_pages; i++) {
1097 struct page *page = pvec.pages[i];
1098 loff_t b_offset;
1099
1100 /*
1101 * At this point, the page may be truncated or
1102 * invalidated (changing page->mapping to NULL),
1103 * or even swizzled back from swapper_space to tmpfs
1104 * file mapping. However, page->index will not change
1105 * because we have a reference on the page.
1106 *
1107 * Searching done if the page index is out of range.
1108 * If the current offset is not reaches the end of
1109 * the specified search range, there should be a hole
1110 * between them.
1111 */
1112 if (page->index > end) {
1113 if (type == HOLE_OFF && lastoff < endoff) {
1114 *offset = lastoff;
1115 found = true;
1116 }
1117 goto out;
1118 }
1119
1120 lock_page(page);
1121 /*
1122 * Page truncated or invalidated(page->mapping == NULL).
1123 * We can freely skip it and proceed to check the next
1124 * page.
1125 */
1126 if (unlikely(page->mapping != inode->i_mapping)) {
1127 unlock_page(page);
1128 continue;
1129 }
1130
1131 if (!page_has_buffers(page)) {
1132 unlock_page(page);
1133 continue;
1134 }
1135
1136 found = xfs_lookup_buffer_offset(page, &b_offset, type);
1137 if (found) {
1138 /*
1139 * The found offset may be less than the start
1140 * point to search if this is the first time to
1141 * come here.
1142 */
1143 *offset = max_t(loff_t, startoff, b_offset);
1144 unlock_page(page);
1145 goto out;
1146 }
1147
1148 /*
1149 * We either searching data but nothing was found, or
1150 * searching hole but found a data buffer. In either
1151 * case, probably the next page contains the desired
1152 * things, update the last offset to it so.
1153 */
1154 lastoff = page_offset(page) + PAGE_SIZE;
1155 unlock_page(page);
1156 }
1157
1158 /*
1159 * The number of returned pages less than our desired, search
1160 * done. In this case, nothing was found for searching data,
1161 * but we found a hole behind the last offset.
1162 */
1163 if (nr_pages < want) {
1164 if (type == HOLE_OFF) {
1165 *offset = lastoff;
1166 found = true;
1167 }
1168 break;
1169 }
1170
1171 index = pvec.pages[i - 1]->index + 1;
1172 pagevec_release(&pvec);
1173 } while (index <= end);
1174
1175out:
1176 pagevec_release(&pvec);
1177 return found;
1178}
1179
962STATIC loff_t 1180STATIC loff_t
963xfs_seek_data( 1181xfs_seek_data(
964 struct file *file, 1182 struct file *file,
965 loff_t start, 1183 loff_t start)
966 u32 type)
967{ 1184{
968 struct inode *inode = file->f_mapping->host; 1185 struct inode *inode = file->f_mapping->host;
969 struct xfs_inode *ip = XFS_I(inode); 1186 struct xfs_inode *ip = XFS_I(inode);
970 struct xfs_mount *mp = ip->i_mount; 1187 struct xfs_mount *mp = ip->i_mount;
971 struct xfs_bmbt_irec map[2];
972 int nmap = 2;
973 loff_t uninitialized_var(offset); 1188 loff_t uninitialized_var(offset);
974 xfs_fsize_t isize; 1189 xfs_fsize_t isize;
975 xfs_fileoff_t fsbno; 1190 xfs_fileoff_t fsbno;
@@ -985,36 +1200,74 @@ xfs_seek_data(
985 goto out_unlock; 1200 goto out_unlock;
986 } 1201 }
987 1202
988 fsbno = XFS_B_TO_FSBT(mp, start);
989
990 /* 1203 /*
991 * Try to read extents from the first block indicated 1204 * Try to read extents from the first block indicated
992 * by fsbno to the end block of the file. 1205 * by fsbno to the end block of the file.
993 */ 1206 */
1207 fsbno = XFS_B_TO_FSBT(mp, start);
994 end = XFS_B_TO_FSB(mp, isize); 1208 end = XFS_B_TO_FSB(mp, isize);
1209 for (;;) {
1210 struct xfs_bmbt_irec map[2];
1211 int nmap = 2;
1212 unsigned int i;
995 1213
996 error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, 1214 error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
997 XFS_BMAPI_ENTIRE); 1215 XFS_BMAPI_ENTIRE);
998 if (error) 1216 if (error)
999 goto out_unlock; 1217 goto out_unlock;
1000 1218
1001 /* 1219 /* No extents at given offset, must be beyond EOF */
1002 * Treat unwritten extent as data extent since it might 1220 if (nmap == 0) {
1003 * contains dirty data in page cache. 1221 error = ENXIO;
1004 */ 1222 goto out_unlock;
1005 if (map[0].br_startblock != HOLESTARTBLOCK) { 1223 }
1006 offset = max_t(loff_t, start, 1224
1007 XFS_FSB_TO_B(mp, map[0].br_startoff)); 1225 for (i = 0; i < nmap; i++) {
1008 } else { 1226 offset = max_t(loff_t, start,
1227 XFS_FSB_TO_B(mp, map[i].br_startoff));
1228
1229 /* Landed in a data extent */
1230 if (map[i].br_startblock == DELAYSTARTBLOCK ||
1231 (map[i].br_state == XFS_EXT_NORM &&
1232 !isnullstartblock(map[i].br_startblock)))
1233 goto out;
1234
1235 /*
1236 * Landed in an unwritten extent, try to search data
1237 * from page cache.
1238 */
1239 if (map[i].br_state == XFS_EXT_UNWRITTEN) {
1240 if (xfs_find_get_desired_pgoff(inode, &map[i],
1241 DATA_OFF, &offset))
1242 goto out;
1243 }
1244 }
1245
1246 /*
1247 * map[0] is hole or its an unwritten extent but
1248 * without data in page cache. Probably means that
1249 * we are reading after EOF if nothing in map[1].
1250 */
1009 if (nmap == 1) { 1251 if (nmap == 1) {
1010 error = ENXIO; 1252 error = ENXIO;
1011 goto out_unlock; 1253 goto out_unlock;
1012 } 1254 }
1013 1255
1014 offset = max_t(loff_t, start, 1256 ASSERT(i > 1);
1015 XFS_FSB_TO_B(mp, map[1].br_startoff)); 1257
1258 /*
1259 * Nothing was found, proceed to the next round of search
1260 * if reading offset not beyond or hit EOF.
1261 */
1262 fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
1263 start = XFS_FSB_TO_B(mp, fsbno);
1264 if (start >= isize) {
1265 error = ENXIO;
1266 goto out_unlock;
1267 }
1016 } 1268 }
1017 1269
1270out:
1018 if (offset != file->f_pos) 1271 if (offset != file->f_pos)
1019 file->f_pos = offset; 1272 file->f_pos = offset;
1020 1273
@@ -1029,16 +1282,15 @@ out_unlock:
1029STATIC loff_t 1282STATIC loff_t
1030xfs_seek_hole( 1283xfs_seek_hole(
1031 struct file *file, 1284 struct file *file,
1032 loff_t start, 1285 loff_t start)
1033 u32 type)
1034{ 1286{
1035 struct inode *inode = file->f_mapping->host; 1287 struct inode *inode = file->f_mapping->host;
1036 struct xfs_inode *ip = XFS_I(inode); 1288 struct xfs_inode *ip = XFS_I(inode);
1037 struct xfs_mount *mp = ip->i_mount; 1289 struct xfs_mount *mp = ip->i_mount;
1038 loff_t uninitialized_var(offset); 1290 loff_t uninitialized_var(offset);
1039 loff_t holeoff;
1040 xfs_fsize_t isize; 1291 xfs_fsize_t isize;
1041 xfs_fileoff_t fsbno; 1292 xfs_fileoff_t fsbno;
1293 xfs_filblks_t end;
1042 uint lock; 1294 uint lock;
1043 int error; 1295 int error;
1044 1296
@@ -1054,21 +1306,77 @@ xfs_seek_hole(
1054 } 1306 }
1055 1307
1056 fsbno = XFS_B_TO_FSBT(mp, start); 1308 fsbno = XFS_B_TO_FSBT(mp, start);
1057 error = xfs_bmap_first_unused(NULL, ip, 1, &fsbno, XFS_DATA_FORK); 1309 end = XFS_B_TO_FSB(mp, isize);
1058 if (error) 1310
1059 goto out_unlock; 1311 for (;;) {
1312 struct xfs_bmbt_irec map[2];
1313 int nmap = 2;
1314 unsigned int i;
1315
1316 error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
1317 XFS_BMAPI_ENTIRE);
1318 if (error)
1319 goto out_unlock;
1320
1321 /* No extents at given offset, must be beyond EOF */
1322 if (nmap == 0) {
1323 error = ENXIO;
1324 goto out_unlock;
1325 }
1326
1327 for (i = 0; i < nmap; i++) {
1328 offset = max_t(loff_t, start,
1329 XFS_FSB_TO_B(mp, map[i].br_startoff));
1330
1331 /* Landed in a hole */
1332 if (map[i].br_startblock == HOLESTARTBLOCK)
1333 goto out;
1334
1335 /*
1336 * Landed in an unwritten extent, try to search hole
1337 * from page cache.
1338 */
1339 if (map[i].br_state == XFS_EXT_UNWRITTEN) {
1340 if (xfs_find_get_desired_pgoff(inode, &map[i],
1341 HOLE_OFF, &offset))
1342 goto out;
1343 }
1344 }
1345
1346 /*
1347 * map[0] contains data or its unwritten but contains
1348 * data in page cache, probably means that we are
1349 * reading after EOF. We should fix offset to point
1350 * to the end of the file(i.e., there is an implicit
1351 * hole at the end of any file).
1352 */
1353 if (nmap == 1) {
1354 offset = isize;
1355 break;
1356 }
1357
1358 ASSERT(i > 1);
1060 1359
1061 holeoff = XFS_FSB_TO_B(mp, fsbno);
1062 if (holeoff <= start)
1063 offset = start;
1064 else {
1065 /* 1360 /*
1066 * xfs_bmap_first_unused() could return a value bigger than 1361 * Both mappings contains data, proceed to the next round of
1067 * isize if there are no more holes past the supplied offset. 1362 * search if the current reading offset not beyond or hit EOF.
1068 */ 1363 */
1069 offset = min_t(loff_t, holeoff, isize); 1364 fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
1365 start = XFS_FSB_TO_B(mp, fsbno);
1366 if (start >= isize) {
1367 offset = isize;
1368 break;
1369 }
1070 } 1370 }
1071 1371
1372out:
1373 /*
1374 * At this point, we must have found a hole. However, the returned
1375 * offset may be bigger than the file size as it may be aligned to
1376 * page boundary for unwritten extents, we need to deal with this
1377 * situation in particular.
1378 */
1379 offset = min_t(loff_t, offset, isize);
1072 if (offset != file->f_pos) 1380 if (offset != file->f_pos)
1073 file->f_pos = offset; 1381 file->f_pos = offset;
1074 1382
@@ -1092,9 +1400,9 @@ xfs_file_llseek(
1092 case SEEK_SET: 1400 case SEEK_SET:
1093 return generic_file_llseek(file, offset, origin); 1401 return generic_file_llseek(file, offset, origin);
1094 case SEEK_DATA: 1402 case SEEK_DATA:
1095 return xfs_seek_data(file, offset, origin); 1403 return xfs_seek_data(file, offset);
1096 case SEEK_HOLE: 1404 case SEEK_HOLE:
1097 return xfs_seek_hole(file, offset, origin); 1405 return xfs_seek_hole(file, offset);
1098 default: 1406 default:
1099 return -EINVAL; 1407 return -EINVAL;
1100 } 1408 }
@@ -1134,4 +1442,5 @@ const struct file_operations xfs_dir_file_operations = {
1134static const struct vm_operations_struct xfs_file_vm_ops = { 1442static const struct vm_operations_struct xfs_file_vm_ops = {
1135 .fault = filemap_fault, 1443 .fault = filemap_fault,
1136 .page_mkwrite = xfs_vm_page_mkwrite, 1444 .page_mkwrite = xfs_vm_page_mkwrite,
1445 .remap_pages = generic_file_remap_pages,
1137}; 1446};
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 5aceb3f8ecd6..445bf1aef31c 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -431,7 +431,7 @@ xfs_ialloc_next_ag(
431 431
432 spin_lock(&mp->m_agirotor_lock); 432 spin_lock(&mp->m_agirotor_lock);
433 agno = mp->m_agirotor; 433 agno = mp->m_agirotor;
434 if (++mp->m_agirotor == mp->m_maxagi) 434 if (++mp->m_agirotor >= mp->m_maxagi)
435 mp->m_agirotor = 0; 435 mp->m_agirotor = 0;
436 spin_unlock(&mp->m_agirotor_lock); 436 spin_unlock(&mp->m_agirotor_lock);
437 437
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 0e0232c3b6d9..8305f2ac6773 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -70,16 +70,16 @@ xfs_find_handle(
70 int hsize; 70 int hsize;
71 xfs_handle_t handle; 71 xfs_handle_t handle;
72 struct inode *inode; 72 struct inode *inode;
73 struct file *file = NULL; 73 struct fd f;
74 struct path path; 74 struct path path;
75 int error; 75 int error;
76 struct xfs_inode *ip; 76 struct xfs_inode *ip;
77 77
78 if (cmd == XFS_IOC_FD_TO_HANDLE) { 78 if (cmd == XFS_IOC_FD_TO_HANDLE) {
79 file = fget(hreq->fd); 79 f = fdget(hreq->fd);
80 if (!file) 80 if (!f.file)
81 return -EBADF; 81 return -EBADF;
82 inode = file->f_path.dentry->d_inode; 82 inode = f.file->f_path.dentry->d_inode;
83 } else { 83 } else {
84 error = user_lpath((const char __user *)hreq->path, &path); 84 error = user_lpath((const char __user *)hreq->path, &path);
85 if (error) 85 if (error)
@@ -134,7 +134,7 @@ xfs_find_handle(
134 134
135 out_put: 135 out_put:
136 if (cmd == XFS_IOC_FD_TO_HANDLE) 136 if (cmd == XFS_IOC_FD_TO_HANDLE)
137 fput(file); 137 fdput(f);
138 else 138 else
139 path_put(&path); 139 path_put(&path);
140 return error; 140 return error;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 29c2f83d4147..b2bd3a0e6376 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -440,7 +440,7 @@ xfs_initialize_perag(
440 xfs_agnumber_t agcount, 440 xfs_agnumber_t agcount,
441 xfs_agnumber_t *maxagi) 441 xfs_agnumber_t *maxagi)
442{ 442{
443 xfs_agnumber_t index, max_metadata; 443 xfs_agnumber_t index;
444 xfs_agnumber_t first_initialised = 0; 444 xfs_agnumber_t first_initialised = 0;
445 xfs_perag_t *pag; 445 xfs_perag_t *pag;
446 xfs_agino_t agino; 446 xfs_agino_t agino;
@@ -500,43 +500,10 @@ xfs_initialize_perag(
500 else 500 else
501 mp->m_flags &= ~XFS_MOUNT_32BITINODES; 501 mp->m_flags &= ~XFS_MOUNT_32BITINODES;
502 502
503 if (mp->m_flags & XFS_MOUNT_32BITINODES) { 503 if (mp->m_flags & XFS_MOUNT_32BITINODES)
504 /* 504 index = xfs_set_inode32(mp);
505 * Calculate how much should be reserved for inodes to meet 505 else
506 * the max inode percentage. 506 index = xfs_set_inode64(mp);
507 */
508 if (mp->m_maxicount) {
509 __uint64_t icount;
510
511 icount = sbp->sb_dblocks * sbp->sb_imax_pct;
512 do_div(icount, 100);
513 icount += sbp->sb_agblocks - 1;
514 do_div(icount, sbp->sb_agblocks);
515 max_metadata = icount;
516 } else {
517 max_metadata = agcount;
518 }
519
520 for (index = 0; index < agcount; index++) {
521 ino = XFS_AGINO_TO_INO(mp, index, agino);
522 if (ino > XFS_MAXINUMBER_32) {
523 index++;
524 break;
525 }
526
527 pag = xfs_perag_get(mp, index);
528 pag->pagi_inodeok = 1;
529 if (index < max_metadata)
530 pag->pagf_metadata = 1;
531 xfs_perag_put(pag);
532 }
533 } else {
534 for (index = 0; index < agcount; index++) {
535 pag = xfs_perag_get(mp, index);
536 pag->pagi_inodeok = 1;
537 xfs_perag_put(pag);
538 }
539 }
540 507
541 if (maxagi) 508 if (maxagi)
542 *maxagi = index; 509 *maxagi = index;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 05a05a7b6119..deee09e534dc 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -54,12 +54,7 @@ typedef struct xfs_trans_reservations {
54#include "xfs_sync.h" 54#include "xfs_sync.h"
55 55
56struct xlog; 56struct xlog;
57struct xfs_mount_args;
58struct xfs_inode; 57struct xfs_inode;
59struct xfs_bmbt_irec;
60struct xfs_bmap_free;
61struct xfs_extdelta;
62struct xfs_swapext;
63struct xfs_mru_cache; 58struct xfs_mru_cache;
64struct xfs_nameops; 59struct xfs_nameops;
65struct xfs_ail; 60struct xfs_ail;
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index fed504fc2999..71926d630527 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -97,8 +97,7 @@ xfs_fs_set_xstate(
97STATIC int 97STATIC int
98xfs_fs_get_dqblk( 98xfs_fs_get_dqblk(
99 struct super_block *sb, 99 struct super_block *sb,
100 int type, 100 struct kqid qid,
101 qid_t id,
102 struct fs_disk_quota *fdq) 101 struct fs_disk_quota *fdq)
103{ 102{
104 struct xfs_mount *mp = XFS_M(sb); 103 struct xfs_mount *mp = XFS_M(sb);
@@ -108,14 +107,14 @@ xfs_fs_get_dqblk(
108 if (!XFS_IS_QUOTA_ON(mp)) 107 if (!XFS_IS_QUOTA_ON(mp))
109 return -ESRCH; 108 return -ESRCH;
110 109
111 return -xfs_qm_scall_getquota(mp, id, xfs_quota_type(type), fdq); 110 return -xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid),
111 xfs_quota_type(qid.type), fdq);
112} 112}
113 113
114STATIC int 114STATIC int
115xfs_fs_set_dqblk( 115xfs_fs_set_dqblk(
116 struct super_block *sb, 116 struct super_block *sb,
117 int type, 117 struct kqid qid,
118 qid_t id,
119 struct fs_disk_quota *fdq) 118 struct fs_disk_quota *fdq)
120{ 119{
121 struct xfs_mount *mp = XFS_M(sb); 120 struct xfs_mount *mp = XFS_M(sb);
@@ -127,7 +126,8 @@ xfs_fs_set_dqblk(
127 if (!XFS_IS_QUOTA_ON(mp)) 126 if (!XFS_IS_QUOTA_ON(mp))
128 return -ESRCH; 127 return -ESRCH;
129 128
130 return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); 129 return -xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid),
130 xfs_quota_type(qid.type), fdq);
131} 131}
132 132
133const struct quotactl_ops xfs_quotactl_operations = { 133const struct quotactl_ops xfs_quotactl_operations = {
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index bdaf4cb9f4a2..26a09bd7f975 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -88,6 +88,8 @@ mempool_t *xfs_ioend_pool;
88 * unwritten extent conversion */ 88 * unwritten extent conversion */
89#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */ 89#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */
90#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ 90#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */
91#define MNTOPT_32BITINODE "inode32" /* inode allocation limited to
92 * XFS_MAXINUMBER_32 */
91#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ 93#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */
92#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */ 94#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */
93#define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */ 95#define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */
@@ -120,12 +122,18 @@ mempool_t *xfs_ioend_pool;
120 * in the future, too. 122 * in the future, too.
121 */ 123 */
122enum { 124enum {
123 Opt_barrier, Opt_nobarrier, Opt_err 125 Opt_barrier,
126 Opt_nobarrier,
127 Opt_inode64,
128 Opt_inode32,
129 Opt_err
124}; 130};
125 131
126static const match_table_t tokens = { 132static const match_table_t tokens = {
127 {Opt_barrier, "barrier"}, 133 {Opt_barrier, "barrier"},
128 {Opt_nobarrier, "nobarrier"}, 134 {Opt_nobarrier, "nobarrier"},
135 {Opt_inode64, "inode64"},
136 {Opt_inode32, "inode32"},
129 {Opt_err, NULL} 137 {Opt_err, NULL}
130}; 138};
131 139
@@ -197,7 +205,9 @@ xfs_parseargs(
197 */ 205 */
198 mp->m_flags |= XFS_MOUNT_BARRIER; 206 mp->m_flags |= XFS_MOUNT_BARRIER;
199 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; 207 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
208#if !XFS_BIG_INUMS
200 mp->m_flags |= XFS_MOUNT_SMALL_INUMS; 209 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
210#endif
201 211
202 /* 212 /*
203 * These can be overridden by the mount option parsing. 213 * These can be overridden by the mount option parsing.
@@ -294,6 +304,8 @@ xfs_parseargs(
294 return EINVAL; 304 return EINVAL;
295 } 305 }
296 dswidth = simple_strtoul(value, &eov, 10); 306 dswidth = simple_strtoul(value, &eov, 10);
307 } else if (!strcmp(this_char, MNTOPT_32BITINODE)) {
308 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
297 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { 309 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
298 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; 310 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
299#if !XFS_BIG_INUMS 311#if !XFS_BIG_INUMS
@@ -492,6 +504,7 @@ xfs_showargs(
492 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, 504 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
493 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 505 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
494 { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, 506 { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },
507 { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE },
495 { 0, NULL } 508 { 0, NULL }
496 }; 509 };
497 static struct proc_xfs_info xfs_info_unset[] = { 510 static struct proc_xfs_info xfs_info_unset[] = {
@@ -591,6 +604,80 @@ xfs_max_file_offset(
591 return (((__uint64_t)pagefactor) << bitshift) - 1; 604 return (((__uint64_t)pagefactor) << bitshift) - 1;
592} 605}
593 606
607xfs_agnumber_t
608xfs_set_inode32(struct xfs_mount *mp)
609{
610 xfs_agnumber_t index = 0;
611 xfs_agnumber_t maxagi = 0;
612 xfs_sb_t *sbp = &mp->m_sb;
613 xfs_agnumber_t max_metadata;
614 xfs_agino_t agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks -1, 0);
615 xfs_ino_t ino = XFS_AGINO_TO_INO(mp, sbp->sb_agcount -1, agino);
616 xfs_perag_t *pag;
617
618 /* Calculate how much should be reserved for inodes to meet
619 * the max inode percentage.
620 */
621 if (mp->m_maxicount) {
622 __uint64_t icount;
623
624 icount = sbp->sb_dblocks * sbp->sb_imax_pct;
625 do_div(icount, 100);
626 icount += sbp->sb_agblocks - 1;
627 do_div(icount, sbp->sb_agblocks);
628 max_metadata = icount;
629 } else {
630 max_metadata = sbp->sb_agcount;
631 }
632
633 for (index = 0; index < sbp->sb_agcount; index++) {
634 ino = XFS_AGINO_TO_INO(mp, index, agino);
635
636 if (ino > XFS_MAXINUMBER_32) {
637 pag = xfs_perag_get(mp, index);
638 pag->pagi_inodeok = 0;
639 pag->pagf_metadata = 0;
640 xfs_perag_put(pag);
641 continue;
642 }
643
644 pag = xfs_perag_get(mp, index);
645 pag->pagi_inodeok = 1;
646 maxagi++;
647 if (index < max_metadata)
648 pag->pagf_metadata = 1;
649 xfs_perag_put(pag);
650 }
651 mp->m_flags |= (XFS_MOUNT_32BITINODES |
652 XFS_MOUNT_SMALL_INUMS);
653
654 return maxagi;
655}
656
657xfs_agnumber_t
658xfs_set_inode64(struct xfs_mount *mp)
659{
660 xfs_agnumber_t index = 0;
661
662 for (index = 0; index < mp->m_sb.sb_agcount; index++) {
663 struct xfs_perag *pag;
664
665 pag = xfs_perag_get(mp, index);
666 pag->pagi_inodeok = 1;
667 pag->pagf_metadata = 0;
668 xfs_perag_put(pag);
669 }
670
671 /* There is no need for lock protection on m_flags,
672 * the rw_semaphore of the VFS superblock is locked
673 * during mount/umount/remount operations, so this is
674 * enough to avoid concurency on the m_flags field
675 */
676 mp->m_flags &= ~(XFS_MOUNT_32BITINODES |
677 XFS_MOUNT_SMALL_INUMS);
678 return index;
679}
680
594STATIC int 681STATIC int
595xfs_blkdev_get( 682xfs_blkdev_get(
596 xfs_mount_t *mp, 683 xfs_mount_t *mp,
@@ -919,6 +1006,7 @@ xfs_fs_put_super(
919 struct xfs_mount *mp = XFS_M(sb); 1006 struct xfs_mount *mp = XFS_M(sb);
920 1007
921 xfs_filestream_unmount(mp); 1008 xfs_filestream_unmount(mp);
1009 cancel_delayed_work_sync(&mp->m_sync_work);
922 xfs_unmountfs(mp); 1010 xfs_unmountfs(mp);
923 xfs_syncd_stop(mp); 1011 xfs_syncd_stop(mp);
924 xfs_freesb(mp); 1012 xfs_freesb(mp);
@@ -953,7 +1041,7 @@ xfs_fs_sync_fs(
953 * We schedule xfssyncd now (now that the disk is 1041 * We schedule xfssyncd now (now that the disk is
954 * active) instead of later (when it might not be). 1042 * active) instead of later (when it might not be).
955 */ 1043 */
956 flush_delayed_work_sync(&mp->m_sync_work); 1044 flush_delayed_work(&mp->m_sync_work);
957 } 1045 }
958 1046
959 return 0; 1047 return 0;
@@ -1055,6 +1143,12 @@ xfs_fs_remount(
1055 case Opt_nobarrier: 1143 case Opt_nobarrier:
1056 mp->m_flags &= ~XFS_MOUNT_BARRIER; 1144 mp->m_flags &= ~XFS_MOUNT_BARRIER;
1057 break; 1145 break;
1146 case Opt_inode64:
1147 mp->m_maxagi = xfs_set_inode64(mp);
1148 break;
1149 case Opt_inode32:
1150 mp->m_maxagi = xfs_set_inode32(mp);
1151 break;
1058 default: 1152 default:
1059 /* 1153 /*
1060 * Logically we would return an error here to prevent 1154 * Logically we would return an error here to prevent
@@ -1505,6 +1599,11 @@ xfs_init_zones(void)
1505STATIC void 1599STATIC void
1506xfs_destroy_zones(void) 1600xfs_destroy_zones(void)
1507{ 1601{
1602 /*
1603 * Make sure all delayed rcu free are flushed before we
1604 * destroy caches.
1605 */
1606 rcu_barrier();
1508 kmem_zone_destroy(xfs_ili_zone); 1607 kmem_zone_destroy(xfs_ili_zone);
1509 kmem_zone_destroy(xfs_inode_zone); 1608 kmem_zone_destroy(xfs_inode_zone);
1510 kmem_zone_destroy(xfs_efi_zone); 1609 kmem_zone_destroy(xfs_efi_zone);
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 09b0c26b2245..9de4a920ba05 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -75,6 +75,8 @@ struct block_device;
75extern __uint64_t xfs_max_file_offset(unsigned int); 75extern __uint64_t xfs_max_file_offset(unsigned int);
76 76
77extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 77extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
78extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *);
79extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *);
78 80
79extern const struct export_operations xfs_export_operations; 81extern const struct export_operations xfs_export_operations;
80extern const struct xattr_handler *xfs_xattr_handlers[]; 82extern const struct xattr_handler *xfs_xattr_handlers[];
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 96548176db80..9500caf15acf 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -475,7 +475,7 @@ xfs_flush_inodes(
475 struct xfs_mount *mp = ip->i_mount; 475 struct xfs_mount *mp = ip->i_mount;
476 476
477 queue_work(xfs_syncd_wq, &mp->m_flush_work); 477 queue_work(xfs_syncd_wq, &mp->m_flush_work);
478 flush_work_sync(&mp->m_flush_work); 478 flush_work(&mp->m_flush_work);
479} 479}
480 480
481STATIC void 481STATIC void
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index e5795dd6013a..7d36ccf57f93 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -37,6 +37,7 @@ struct xlog_recover;
37struct xlog_recover_item; 37struct xlog_recover_item;
38struct xfs_buf_log_format; 38struct xfs_buf_log_format;
39struct xfs_inode_log_format; 39struct xfs_inode_log_format;
40struct xfs_bmbt_irec;
40 41
41DECLARE_EVENT_CLASS(xfs_attr_list_class, 42DECLARE_EVENT_CLASS(xfs_attr_list_class,
42 TP_PROTO(struct xfs_attr_list_context *ctx), 43 TP_PROTO(struct xfs_attr_list_context *ctx),
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index bcb60542fcf1..0c7fa54f309e 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -578,9 +578,11 @@ xfs_quota_warn(
578 /* no warnings for project quotas - we just return ENOSPC later */ 578 /* no warnings for project quotas - we just return ENOSPC later */
579 if (dqp->dq_flags & XFS_DQ_PROJ) 579 if (dqp->dq_flags & XFS_DQ_PROJ)
580 return; 580 return;
581 quota_send_warning((dqp->dq_flags & XFS_DQ_USER) ? USRQUOTA : GRPQUOTA, 581 quota_send_warning(make_kqid(&init_user_ns,
582 be32_to_cpu(dqp->q_core.d_id), mp->m_super->s_dev, 582 (dqp->dq_flags & XFS_DQ_USER) ?
583 type); 583 USRQUOTA : GRPQUOTA,
584 be32_to_cpu(dqp->q_core.d_id)),
585 mp->m_super->s_dev, type);
584} 586}
585 587
586/* 588/*